diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -2,118 +2,89 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import random -import gzip -import json -import requests -from pathlib import Path -from collections import defaultdict - -from .models import GNUModel +import logging from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister +from swh.lister.gnu.models import GNUModel +from swh.lister.gnu.tree import GNUTree + + +logger = logging.getLogger(__name__) + class GNULister(SimpleLister): MODEL = GNUModel LISTER_NAME = 'gnu' - TREE_URL = 'https://ftp.gnu.org/tree.json.gz' - BASE_URL = 'https://ftp.gnu.org' instance = 'gnu' - tarballs = defaultdict(dict) # Dict of key with project name value the - # associated is list of tarballs of package to ingest from the gnu mirror + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz') def task_dict(self, origin_type, origin_url, **kwargs): - """ - Return task format dict + """Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. + + This creates tasks with args and kwargs set, for example: + + .. code-block:: python + + args: ['https://ftp.gnu.org/gnu/3dldf/'] + kwargs: { + 'tarballs': [{ + 'archive': 'https://...', + 'time': 1071002600, + 'length': 128}, + ... + ]} + """ + tarballs = self.gnu_tree.artifacts[origin_url] return utils.create_task_dict( 'load-%s' % origin_type, kwargs.get('policy', 'oneshot'), - kwargs.get('name'), origin_url, - tarballs=self.tarballs[kwargs.get('name')]) + tarballs=tarballs) def safely_issue_request(self, identifier): - ''' - Download and unzip tree.json.gz file and returns its content - in JSON format + """Bypass the implementation. 
It's now the GNUTree which deals with + querying the gnu mirror. - File content in dictionary format + As an implementation detail, we cannot change simply the base + SimpleLister as other implementation still uses it. This shall be part + of another refactoring pass. - Args: - identifier: resource identifier (unused) - - Returns: - Server response - - ''' - response = requests.get(self.TREE_URL, - allow_redirects=True) - uncompressed_content = gzip.decompress(response.content) - return json.loads(uncompressed_content.decode('utf-8')) + """ + return None def list_packages(self, response): - """ - List the actual gnu origins with their names,url and the list - of all the tarball for a package from the response. + """List the actual gnu origins (package name) with their name, url and + associated tarballs. Args: - response : File structure of the website - in dictionary format + response: Unused Returns: - A list of all the packages with their names, url of their root - directory and the tarballs present for the particular package. - [ - {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/', - 'tarballs': - [ - {'archive': - 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', - 'date': '1071002600'}, - {'archive': - 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', - 'date': '1071078759'}} - ] - }, - {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/', - 'tarballs': - [ - {'archive': - 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz', - 'date': '1461357336'}, - {'archive': - 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz', - 'date': '1480991830'} - ] - ] + List of packages name, url, last modification time + + .. code-block:: python + + [ + {'name': '3dldf', + 'url': 'https://ftp.gnu.org/gnu/3dldf/', + 'time_modified': 1071002600}, + {'name': '8sync', + 'url': 'https://ftp.gnu.org/gnu/8sync/', + 'time_modified': 1480991830}, + ... 
+ ] + """ - response = filter_directories(response) - packages = [] - for directory in response: - content = directory['contents'] - for repo in content: - if repo['type'] == 'directory': - package_url = '%s/%s/%s/' % (self.BASE_URL, - directory['name'], - repo['name']) - package_tarballs = find_tarballs( - repo['contents'], package_url) - if package_tarballs != []: - repo_details = { - 'name': repo['name'], - 'url': package_url, - 'time_modified': repo['time'], - } - self.tarballs[repo['name']] = package_tarballs - packages.append(repo_details) - random.shuffle(packages) - return packages + return list(self.gnu_tree.projects.values()) def get_model_from_repo(self, repo): """Transform from repository representation to model @@ -128,89 +99,3 @@ 'time_last_updated': int(repo['time_modified']), 'origin_type': 'tar', } - - -def find_tarballs(package_file_structure, url): - '''Recursively lists tarballs present in the folder and subfolders for a - particular package url. - - Args - package_file_structure: File structure of the package root directory - url: URL of the corresponding package - - Returns - List of tarball urls and their associated metadata (time, length). - For example: - - [ - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', - 'time': 1071002600, - 'length': 543}, - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', - 'time': 1071078759, - 'length': 456}, - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz', - 'time': 1074278633, - 'length': 251}, - ... 
- ] - - ''' - tarballs = [] - for single_file in package_file_structure: - filetype = single_file['type'] - filename = single_file['name'] - if filetype == 'file': - if file_extension_check(filename): - tarballs.append({ - 'archive': url + filename, - 'time': int(single_file['time']), - 'length': int(single_file['size']), - }) - # It will recursively check for tarballs in all sub-folders - elif filetype == 'directory': - tarballs_in_dir = find_tarballs( - single_file['contents'], - url + filename + '/') - tarballs.extend(tarballs_in_dir) - - return tarballs - - -def filter_directories(response): - ''' - Keep only gnu and old-gnu folders from JSON - ''' - final_response = [] - file_system = response[0]['contents'] - for directory in file_system: - if directory['name'] in ('gnu', 'old-gnu'): - final_response.append(directory) - return final_response - - -def file_extension_check(file_name): - ''' - Check for the extension of the file, if the file is of zip format of - .tar.x format, where x could be anything, then returns true. - - Args: - file_name : name of the file for which the extensions is needs to - be checked. 
- - Returns: - True or False - - example - file_extension_check('abc.zip') will return True - file_extension_check('abc.tar.gz') will return True - file_extension_check('abc.tar.gz.sig') will return False - - ''' - file_suffixes = Path(file_name).suffixes - if len(file_suffixes) == 1 and file_suffixes[-1] == '.zip': - return True - elif len(file_suffixes) > 1: - if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar': - return True - return False diff --git a/swh/lister/gnu/tests/__init__.py b/swh/lister/gnu/tests/__init__.py --- a/swh/lister/gnu/tests/__init__.py +++ b/swh/lister/gnu/tests/__init__.py @@ -0,0 +1,25 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from os.path import join, dirname, isfile +from urllib.parse import urlparse + + +DATADIR = join(dirname(__file__), 'data') + + +def get_response_cb(request, context): + """Mock the http request to send back the on-disk file handler + + """ + url = urlparse(request.url) + dirname = url.hostname + filename = url.path[1:].replace('/', '_') + filepath = join(DATADIR, dirname, filename) + if not isfile(filepath): + context.status_code = 404 + return None + return open(filepath, 'rb') diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py --- a/swh/lister/gnu/tests/test_lister.py +++ b/swh/lister/gnu/tests/test_lister.py @@ -1,29 +1,15 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json +import logging import re -from os.path import join, dirname -from urllib.parse import urlparse +from swh.lister.gnu.tests 
import get_response_cb -from swh.lister.gnu.lister import ( - find_tarballs, filter_directories, file_extension_check -) - -DATADIR = join(dirname(__file__), 'data') - - -def get_response_cb(request, context): - """Mock the http request to send back the on-disk file handler - - """ - url = urlparse(request.url) - dirname = url.hostname - filename = url.path[1:].replace('/', '_') - return open(join(DATADIR, dirname, filename), 'rb') +logger = logging.getLogger(__name__) def test_lister_no_page_check_results(swh_listers, requests_mock): @@ -33,38 +19,20 @@ lister.run() r = lister.scheduler.search_tasks(task_type='load-tar') - assert len(r) == 382 - # r - # ('type', 'load-tar'), - # ('arguments', - # {'args': ['libmatheval', 'https://ftp.gnu.org/gnu/libmatheval/'], - # 'kwargs': {'tarballs': [{'archive': 'https://ftp.gnu.org/gnu/libmatheval/libmatheval-1.0.0.tar.gz', # noqa - # 'length': 362222, - # 'time': 1068754394}, - # ... - # ] - # ('next_run', - # datetime.datetime(2019, 10, 5, 12, 23, 28, 889293, - # tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=120, name=None))), - # ('current_interval', datetime.timedelta(days=64)), - # ('status', 'next_run_not_scheduled'), - # ('policy', 'oneshot'), - # ('retries_left', 3), - # ('priority', None)]) + assert len(r) == 383 for row in r: assert row['type'] == 'load-tar' # arguments check args = row['arguments']['args'] - assert len(args) == 2 - - package = args[0] - assert package != '' - url = args[1] + assert len(args) == 1 + url = args[0] assert url.startswith('https://ftp.gnu.org') - assert package in url + + url_suffix = url.split('https://ftp.gnu.org')[1] + assert 'gnu' in url_suffix or 'old-gnu' in url_suffix # kwargs kwargs = row['arguments']['kwargs'] @@ -76,54 +44,3 @@ assert set(tarball.keys()) == set(['archive', 'length', 'time']) assert row['policy'] == 'oneshot' - - -def test_filter_directories(): - f = open('swh/lister/gnu/tests/api_response.json') - api_response = json.load(f) - cleared_api_response = 
filter_directories(api_response) - for directory in cleared_api_response: - if directory['name'] not in ('gnu', 'old-gnu'): - assert False - - -def test_find_tarballs_small_sample(): - expected_tarballs = [ - { - 'archive': '/root/artanis/artanis-0.2.1.tar.bz2', - 'time': 1495205979, - 'length': 424081, - }, - { - 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa - 'time': 898422900, - 'length': 1514448 - }, - { - 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa - 'time': 869814000, - 'length': 450164, - }, - { - 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa - 'time': 898422900, - 'length': 514951, - }, - ] - - file_structure = json.load(open('swh/lister/gnu/tests/tree.min.json')) - actual_tarballs = find_tarballs(file_structure, '/root/') - assert actual_tarballs == expected_tarballs - - -def test_find_tarballs(): - file_structure = json.load(open('swh/lister/gnu/tests/tree.json')) - actual_tarballs = find_tarballs(file_structure, '/root/') - assert len(actual_tarballs) == 42 + 3 # tar + zip - - -def test_file_extension_check(): - assert file_extension_check('abc.xy.zip') - assert file_extension_check('cvb.zip') - assert file_extension_check('abc.tar.bz2') - assert file_extension_check('abc') is False diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py new file mode 100644 --- /dev/null +++ b/swh/lister/gnu/tests/test_tree.py @@ -0,0 +1,147 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import re + +import pytest + +from os import path +from swh.lister.gnu.tree import ( + GNUTree, find_artifacts, check_filename_is_archive, load_raw_data +) + +from swh.lister.gnu.tests import get_response_cb + +from . 
import DATADIR + + +def test_load_raw_data_from_query(requests_mock): + requests_mock.get(re.compile('https://ftp.gnu.org'), body=get_response_cb) + + actual_json = load_raw_data('https://ftp.gnu.org/tree.json.gz') + assert actual_json is not None + assert isinstance(actual_json, list) + assert len(actual_json) == 2 + + +def test_load_raw_data_from_query_failure(requests_mock): + inexistant_url = 'https://ftp2.gnu.org/tree.unknown.gz' + requests_mock.get(re.compile('https://ftp2.gnu.org'), body=get_response_cb) + + with pytest.raises(ValueError, match='Error during query'): + load_raw_data(inexistant_url) + + +def test_load_raw_data_from_file(): + filepath = path.join(DATADIR, 'ftp.gnu.org', 'tree.json.gz') + actual_json = load_raw_data(filepath) + assert actual_json is not None + assert isinstance(actual_json, list) + assert len(actual_json) == 2 + + +def test_load_raw_data_from_file_failure(): + unknown_path = path.join(DATADIR, 'ftp.gnu.org2', 'tree.json.gz') + with pytest.raises(FileNotFoundError): + load_raw_data(unknown_path) + + +def test_tree_json(requests_mock): + requests_mock.get(re.compile('https://ftp.gnu.org'), body=get_response_cb) + + tree_json = GNUTree('https://ftp.gnu.org/tree.json.gz') + + assert tree_json.projects['https://ftp.gnu.org/gnu/8sync/'] == { + 'name': '8sync', + 'time_modified': '1489817408', + 'url': 'https://ftp.gnu.org/gnu/8sync/' + } + + assert tree_json.projects['https://ftp.gnu.org/gnu/3dldf/'] == { + 'name': '3dldf', + 'time_modified': '1386961236', + 'url': 'https://ftp.gnu.org/gnu/3dldf/' + } + + assert tree_json.projects['https://ftp.gnu.org/gnu/a2ps/'] == { + 'name': 'a2ps', + 'time_modified': '1198900505', + 'url': 'https://ftp.gnu.org/gnu/a2ps/' + } + + assert tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] == { + 'name': 'xshogi', + 'time_modified': '1059822922', + 'url': 'https://ftp.gnu.org/old-gnu/xshogi/' + } + + assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [ + { + 'archive': 
'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa + 'length': 90106, + 'time': 857980800 + }, + { + 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa + 'length': 89625, + 'time': 860396400 + } + ] + + +def test_tree_json_failures(requests_mock): + requests_mock.get(re.compile('https://unknown'), body=get_response_cb) + url = 'https://unknown/tree.json.gz' + tree_json = GNUTree(url) + + with pytest.raises(ValueError, match='Error during query to %s' % url): + tree_json.artifacts['https://ftp.gnu.org/gnu/3dldf/'] + + with pytest.raises(ValueError, match='Error during query to %s' % url): + tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] + + +def test_find_artifacts_small_sample(): + expected_tarballs = [ + { + 'archive': '/root/artanis/artanis-0.2.1.tar.bz2', + 'time': 1495205979, + 'length': 424081, + }, + { + 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa + 'time': 898422900, + 'length': 1514448 + }, + { + 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa + 'time': 869814000, + 'length': 450164, + }, + { + 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa + 'time': 898422900, + 'length': 514951, + }, + ] + + file_structure = json.load(open('swh/lister/gnu/tests/tree.min.json')) + actual_tarballs = find_artifacts(file_structure, '/root/') + assert actual_tarballs == expected_tarballs + + +def test_find_artifacts(): + file_structure = json.load(open('swh/lister/gnu/tests/tree.json')) + actual_tarballs = find_artifacts(file_structure, '/root/') + assert len(actual_tarballs) == 42 + 3 # tar + zip + + +def test_check_filename_is_archive(): + for ext in ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']: + assert check_filename_is_archive(ext) is True + + for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']: + assert check_filename_is_archive(ext) is False diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py new file mode 100644 --- /dev/null +++ b/swh/lister/gnu/tree.py 
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import gzip
import json
import logging

from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse


logger = logging.getLogger(__name__)


def load_raw_data(url: str) -> List[Dict]:
    """Load the raw json from the tree.json.gz

    Args:
        url: Tree.json.gz url (http:// or https://) or local file path

    Returns:
        The raw json list

    Raises:
        ValueError: the http(s) query did not return a successful status
        OSError: the local file cannot be opened (e.g. FileNotFoundError)

    """
    if url.startswith(('http://', 'https://')):
        # Only remote trees need the http client; importing it here keeps
        # loading from a local path free of that dependency.
        import requests

        response = requests.get(url, allow_redirects=True)
        if not response.ok:
            raise ValueError('Error during query to %s' % url)
        raw = gzip.decompress(response.content)
    else:
        with gzip.open(url, 'rb') as f:
            raw = f.read()
    return json.loads(raw.decode('utf-8'))


class GNUTree:
    """Gnu Tree's representation

    The tree.json.gz listing is fetched and parsed lazily, on the first
    access to :attr:`projects` or :attr:`artifacts`, and then kept in memory.
    A failed load caches nothing, so the next access tries again.

    """
    def __init__(self, url: str):
        self.url = url  # filepath or uri
        u = urlparse(url)
        # Prefix of every project url ('scheme://netloc' of ``url``)
        self.base_url = '%s://%s' % (u.scheme, u.netloc)
        # Interesting top level directories
        self.top_level_directories = ['gnu', 'old-gnu']
        # internal state; None means "not loaded yet" (an empty dict is a
        # valid loaded result and must not trigger another load)
        self._artifacts = None  # type: Optional[Dict]
        self._projects = None  # type: Optional[Dict]

    def _ensure_loaded(self) -> None:
        """Load projects and artifacts once, on first use."""
        if self._projects is None or self._artifacts is None:
            self._projects, self._artifacts = self._load()

    @property
    def projects(self) -> Dict:
        """Dict of project information, indexed by project url"""
        self._ensure_loaded()
        return self._projects

    @property
    def artifacts(self) -> Dict:
        """Dict of list of artifacts (tarballs), indexed by project url"""
        self._ensure_loaded()
        return self._artifacts

    def _load(self) -> Tuple[Dict, Dict]:
        """Compute projects and artifacts per project

        Only sub-directories of the interesting top level directories which
        hold at least one artifact are kept.

        Returns:
            Tuple of dict projects (key project url, value the associated
            information) and a dict artifacts (key project url, value the
            info_file list)

        """
        projects = {}  # type: Dict
        artifacts = {}  # type: Dict

        # the first entry of the listing is the root directory
        raw_data = load_raw_data(self.url)[0]
        for directory in raw_data['contents']:
            if directory['name'] not in self.top_level_directories:
                continue
            for info in directory['contents']:
                if info['type'] != 'directory':
                    continue
                package_url = '%s/%s/%s/' % (
                    self.base_url, directory['name'], info['name'])
                package_artifacts = find_artifacts(
                    info['contents'], package_url)
                if not package_artifacts:
                    # nothing to ingest for that project, skip it
                    continue
                artifacts[package_url] = package_artifacts
                projects[package_url] = {
                    'name': info['name'],
                    'url': package_url,
                    'time_modified': info['time'],
                }

        return projects, artifacts


def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
    """Recursively list artifacts present in the folder and subfolders for a
    particular package url.

    Args:

        filesystem: File structure of the package root directory. This is a
            list of Dict representing either file or directory information as
            dict (keys: name, size, time, type).
        url: URL of the corresponding package (with a trailing '/')

    Returns:
        List of tarball urls and their associated metadata (time, length).
        For example:

        .. code-block:: python

            [
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
                 'time': 1071002600,
                 'length': 543},
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
                 'time': 1071078759,
                 'length': 456},
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
                 'time': 1074278633,
                 'length': 251},
                ...
            ]

    """
    artifacts = []  # type: List[Dict]
    for info_file in filesystem:
        filetype = info_file['type']
        filename = info_file['name']
        if filetype == 'file':
            if check_filename_is_archive(filename):
                artifacts.append({
                    'archive': url + filename,
                    'time': int(info_file['time']),
                    'length': int(info_file['size']),
                })
        # It will recursively check for artifacts in all sub-folders
        elif filetype == 'directory':
            artifacts.extend(find_artifacts(
                info_file['contents'], url + filename + '/'))

    return artifacts


def check_filename_is_archive(filename: str) -> bool:
    """
    Check the extension of the file: if the file is of zip format, tar format
    or .tar.x format, where x could be anything, then returns true.

    Args:
        filename: name of the file whose extension needs to be checked.

    Returns:
        Whether filename is an archive or not

    Example:

    >>> check_filename_is_archive('abc.zip')
    True
    >>> check_filename_is_archive('abc.tar.gz')
    True
    >>> check_filename_is_archive('bac.tar')
    True
    >>> check_filename_is_archive('bac-1.0.tar')
    True
    >>> check_filename_is_archive('abc.tar.gz.sig')
    False
    >>> check_filename_is_archive('foobar.tar.')
    False

    """
    path = Path(filename)
    # A name ending with a dot has no usable extension. Make that explicit
    # instead of relying on pathlib, whose handling of a lone trailing dot
    # differs between Python versions.
    file_suffixes = [] if path.name.endswith('.') else path.suffixes
    logger.debug('Path(%s).suffixed: %s', filename, file_suffixes)
    if not file_suffixes:
        return False
    # plain archive, whatever dots (e.g. a version number) come before
    if file_suffixes[-1] in ('.zip', '.tar'):
        return True
    # compressed tarball: .tar.<anything>
    return len(file_suffixes) > 1 and file_suffixes[-2] == '.tar'