diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -2,118 +2,89 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import random
-import gzip
-import json
-import requests
-from pathlib import Path
-from collections import defaultdict
-
-from .models import GNUModel
+import logging
 
 from swh.scheduler import utils
 from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.gnu.models import GNUModel
+from swh.lister.gnu.tree import GNUTree
+
+
+logger = logging.getLogger(__name__)
+
 
 class GNULister(SimpleLister):
     MODEL = GNUModel
     LISTER_NAME = 'gnu'
-    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
-    BASE_URL = 'https://ftp.gnu.org'
     instance = 'gnu'
-    tarballs = defaultdict(dict)  # Dict of key with project name value the
-    # associated is list of tarballs of package to ingest from the gnu mirror
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
 
     def task_dict(self, origin_type, origin_url, **kwargs):
-        """
-        Return task format dict
+        """Return task format dict
 
         This is overridden from the lister_base as more information is
         needed for the ingestion task creation.
+
+        This creates tasks with args and kwargs set, for example:
+
+        .. code-block:: python
+
+            args: ['https://ftp.gnu.org/gnu/3dldf/']
+            kwargs: {
+                'tarballs': [{
+                    'archive': 'https://...',
+                    'time': 1071002600,
+                    'length': 128},
+                    ...
+                ]}
+
         """
+        tarballs = self.gnu_tree.artifacts[origin_url]
         return utils.create_task_dict(
             'load-%s' % origin_type,
             kwargs.get('policy', 'oneshot'),
-            kwargs.get('name'),
             origin_url,
-            tarballs=self.tarballs[kwargs.get('name')])
+            tarballs=tarballs)
 
     def safely_issue_request(self, identifier):
-        '''
-        Download and unzip tree.json.gz file and returns its content
-        in JSON format
-
-        File content in dictionary format
+        """Bypass this implementation; querying the gnu mirror is now
+        delegated to the GNUTree instance.
 
-        Args:
-            identifier: resource identifier (unused)
+        As an implementation detail, we cannot simply change the base
+        SimpleLister as other implementations still use it. This will be
+        part of another refactoring pass.
 
-        Returns:
-            Server response
-
-        '''
-        response = requests.get(self.TREE_URL,
-                                allow_redirects=True)
-        uncompressed_content = gzip.decompress(response.content)
-        return json.loads(uncompressed_content.decode('utf-8'))
+        """
+        return None
 
     def list_packages(self, response):
-        """
-        List the actual gnu origins with their names,url and the list
-        of all the tarball for a package from the response.
+        """List the actual gnu origins (package names) with their name, url
+        and last modification time.
 
         Args:
-            response : File structure of the website
-            in dictionary format
+            response: Unused
 
         Returns:
-            A list of all the packages with their names, url of their root
-            directory and the tarballs present for the particular package.
-            [
-                {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/',
-                 'tarballs':
-                    [
-                        {'archive':
-                            'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
-                         'date': '1071002600'},
-                        {'archive':
-                            'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
-                         'date': '1071078759'}}
-                    ]
-                },
-                {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/',
-                 'tarballs':
-                    [
-                        {'archive':
-                            'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
-                         'date': '1461357336'},
-                        {'archive':
-                            'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
-                         'date': '1480991830'}
-                    ]
-            ]
+            List of packages, each with name, url and last modification time
+
+            .. code-block:: python
+
+                [
+                    {'name': '3dldf',
+                     'url': 'https://ftp.gnu.org/gnu/3dldf/',
+                     'time_modified': 1071002600},
+                    {'name': '8sync',
+                     'url': 'https://ftp.gnu.org/gnu/8sync/',
+                     'time_modified': 1480991830},
+                    ...
+                ]
+
         """
-        response = filter_directories(response)
-        packages = []
-        for directory in response:
-            content = directory['contents']
-            for repo in content:
-                if repo['type'] == 'directory':
-                    package_url = '%s/%s/%s/' % (self.BASE_URL,
-                                                 directory['name'],
-                                                 repo['name'])
-                    package_tarballs = find_tarballs(
-                        repo['contents'], package_url)
-                    if package_tarballs != []:
-                        repo_details = {
-                            'name': repo['name'],
-                            'url': package_url,
-                            'time_modified': repo['time'],
-                        }
-                        self.tarballs[repo['name']] = package_tarballs
-                        packages.append(repo_details)
-        random.shuffle(packages)
-        return packages
+        return list(self.gnu_tree.projects.values())
 
     def get_model_from_repo(self, repo):
         """Transform from repository representation to model
@@ -128,89 +99,3 @@
             'time_last_updated': int(repo['time_modified']),
             'origin_type': 'tar',
         }
-
-
-def find_tarballs(package_file_structure, url):
-    '''Recursively lists tarballs present in the folder and subfolders for a
-    particular package url.
-
-    Args
-        package_file_structure: File structure of the package root directory
-        url: URL of the corresponding package
-
-    Returns
-        List of tarball urls and their associated metadata (time, length).
-        For example:
-
-        [
-            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
-             'time': 1071002600,
-             'length': 543},
-            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
-             'time': 1071078759,
-             'length': 456},
-            {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.1.tar.gz',
-             'time': 1074278633,
-             'length': 251},
-            ...
-        ]
-
-    '''
-    tarballs = []
-    for single_file in package_file_structure:
-        filetype = single_file['type']
-        filename = single_file['name']
-        if filetype == 'file':
-            if file_extension_check(filename):
-                tarballs.append({
-                    'archive': url + filename,
-                    'time': int(single_file['time']),
-                    'length': int(single_file['size']),
-                })
-        # It will recursively check for tarballs in all sub-folders
-        elif filetype == 'directory':
-            tarballs_in_dir = find_tarballs(
-                single_file['contents'],
-                url + filename + '/')
-            tarballs.extend(tarballs_in_dir)
-
-    return tarballs
-
-
-def filter_directories(response):
-    '''
-    Keep only gnu and old-gnu folders from JSON
-    '''
-    final_response = []
-    file_system = response[0]['contents']
-    for directory in file_system:
-        if directory['name'] in ('gnu', 'old-gnu'):
-            final_response.append(directory)
-    return final_response
-
-
-def file_extension_check(file_name):
-    '''
-    Check for the extension of the file, if the file is of zip format of
-    .tar.x format, where x could be anything, then returns true.
-
-    Args:
-        file_name : name of the file for which the extensions is needs to
-        be checked.
-
-    Returns:
-        True or False
-
-    example
-        file_extension_check('abc.zip') will return True
-        file_extension_check('abc.tar.gz') will return True
-        file_extension_check('abc.tar.gz.sig') will return False
-
-    '''
-    file_suffixes = Path(file_name).suffixes
-    if len(file_suffixes) == 1 and file_suffixes[-1] == '.zip':
-        return True
-    elif len(file_suffixes) > 1:
-        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
-            return True
-    return False
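Note on the task format: the envelope wrapping these args/kwargs is built by
swh.scheduler.utils.create_task_dict, which this patch does not show. As a
hedged sketch only, assuming the usual 'type'/'policy'/'arguments' layout and
reusing the docstring's example values, the task built above would look
roughly like:

.. code-block:: python

    # Illustrative sketch -- the exact field layout comes from
    # swh.scheduler.utils.create_task_dict and is assumed here.
    {
        'type': 'load-tar',
        'policy': 'oneshot',
        'arguments': {
            'args': ['https://ftp.gnu.org/gnu/3dldf/'],
            'kwargs': {
                'tarballs': [{'archive': 'https://...',
                              'time': 1071002600,
                              'length': 128}],
            },
        },
    }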
diff --git a/swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz b/swh/lister/gnu/tests/data/ftp.gnu.org/tree.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@
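The fixture above is a gzipped copy of the mirror's tree.json (binary payload
elided). For orientation, a minimal sketch of its decompressed structure,
inferred from the parsing code in tree.py below; names, times and sizes here
are invented:

.. code-block:: python

    # Assumed shape: a one-element list whose head carries the whole
    # filesystem under 'contents' (cf. load_raw_data(...)[0] below).
    [
        {'name': '.', 'type': 'directory',
         'contents': [
             {'name': 'gnu', 'type': 'directory',
              'contents': [
                  {'name': '3dldf', 'type': 'directory',
                   'time': '1071002600',
                   'contents': [
                       {'name': '3DLDF-1.1.3.tar.gz', 'type': 'file',
                        'time': '1071002600', 'size': '543'},
                   ]},
              ]},
             {'name': 'old-gnu', 'type': 'directory', 'contents': []},
         ]},
    ]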
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/gnu/tree.py
+# Copyright (C) 2019 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import gzip
+import json
+import logging
+import requests
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+from urllib.parse import urlparse
+
+
+logger = logging.getLogger(__name__)
+
+
+def load_raw_data(url: str) -> List[Dict]:
+    """Load the raw json from the tree.json.gz
+
+    Args:
+        url: tree.json.gz url or local path
+
+    Returns:
+        The raw json list
+
+    """
+    if url.startswith('http://') or url.startswith('https://'):
+        response = requests.get(url, allow_redirects=True)
+        if not response.ok:
+            raise ValueError('Error during query to %s' % url)
+        raw = gzip.decompress(response.content)
+    else:
+        with gzip.open(url, 'r') as f:
+            raw = f.read()
+    raw_data = json.loads(raw.decode('utf-8'))
+    return raw_data
+
+
+class GNUTree:
+    """Gnu Tree's representation
+
+    """
+    def __init__(self, url: str):
+        self.url = url  # filepath or uri
+        u = urlparse(url)
+        self.base_url = '%s://%s' % (u.scheme, u.netloc)
+        # Interesting top level directories
+        self.top_level_directories = ['gnu', 'old-gnu']
+        # internal state
+        self._artifacts = {}  # type: Dict
+        self._projects = {}  # type: Dict
+
+    @property
+    def projects(self) -> Dict:
+        if not self._projects:
+            self._projects, self._artifacts = self._load()
+        return self._projects
+
+    @property
+    def artifacts(self) -> Dict:
+        if not self._artifacts:
+            self._projects, self._artifacts = self._load()
+        return self._artifacts
+
+    def _load(self) -> Tuple[Dict, Dict]:
+        """Compute projects and artifacts per project
+
+        Returns:
+            Tuple of a projects dict (key: project url, value: the
+            associated information) and an artifacts dict (key: project
+            url, value: the list of artifact info dicts)
+
+        """
+        projects = {}
+        artifacts = {}
+
+        raw_data = load_raw_data(self.url)[0]
+        for directory in raw_data['contents']:
+            if directory['name'] not in self.top_level_directories:
+                continue
+            infos = directory['contents']
+            for info in infos:
+                if info['type'] == 'directory':
+                    package_url = '%s/%s/%s/' % (
+                        self.base_url, directory['name'], info['name'])
+                    package_artifacts = find_artifacts(
+                        info['contents'], package_url)
+                    if package_artifacts != []:
+                        repo_details = {
+                            'name': info['name'],
+                            'url': package_url,
+                            'time_modified': info['time'],
+                        }
+                        artifacts[package_url] = package_artifacts
+                        projects[package_url] = repo_details
+
+        return projects, artifacts
+
+
+def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
+    """Recursively list artifacts present in the folder and subfolders for
+    a particular package url.
+
+    Args:
+        filesystem: File structure of the package root directory, as a
+            list of dicts, each describing a file or a directory
+            (keys: name, size, time, type).
+        url: URL of the corresponding package
+
+    Returns:
+        List of tarball urls and their associated metadata (time, length).
+        For example:
+
+        .. code-block:: python
+
+            [
+                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+                 'time': 1071002600,
+                 'length': 543},
+                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+                 'time': 1071078759,
+                 'length': 456},
+                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
+                 'time': 1074278633,
+                 'length': 251},
+                ...
+            ]
+
+    """
+    artifacts = []
+    for info_file in filesystem:
+        filetype = info_file['type']
+        filename = info_file['name']
+        if filetype == 'file':
+            if check_filename_is_archive(filename):
+                artifacts.append({
+                    'archive': url + filename,
+                    'time': int(info_file['time']),
+                    'length': int(info_file['size']),
+                })
+        # Recursively check for artifacts in all sub-folders
+        elif filetype == 'directory':
+            tarballs_in_dir = find_artifacts(
+                info_file['contents'],
+                url + filename + '/')
+            artifacts.extend(tarballs_in_dir)
+
+    return artifacts
+
+
+def check_filename_is_archive(filename: str) -> bool:
+    """Check whether filename is an archive: either a .zip file or a
+    .tar.x file, where x can be any (compression) extension.
+
+    Args:
+        filename: name of the file whose extension needs to be checked
+
+    Returns:
+        Whether filename is an archive or not
+
+    Example:
+
+    >>> check_filename_is_archive('abc.zip')
+    True
+    >>> check_filename_is_archive('abc.tar.gz')
+    True
+    >>> check_filename_is_archive('bac.tar')
+    True
+    >>> check_filename_is_archive('abc.tar.gz.sig')
+    False
+    >>> check_filename_is_archive('foobar.tar.')
+    False
+
+    """
+    file_suffixes = Path(filename).suffixes
+    logger.debug('Path(%s).suffixes: %s', filename, file_suffixes)
+    if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
+        return True
+    elif len(file_suffixes) > 1:
+        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
+            return True
+    return False
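Taken together, a minimal usage sketch of the new module. The project url is
the docstring's 3dldf example and assumes that package is present in the
tree; per load_raw_data above, a local tree.json.gz path would work as well:

.. code-block:: python

    from swh.lister.gnu.tree import GNUTree

    tree = GNUTree('https://ftp.gnu.org/tree.json.gz')

    # The first access to either property triggers the single _load() pass;
    # both dicts are keyed by the project url.
    project = tree.projects['https://ftp.gnu.org/gnu/3dldf/']
    print(project['name'], project['time_modified'])

    for artifact in tree.artifacts['https://ftp.gnu.org/gnu/3dldf/']:
        print(artifact['archive'], artifact['time'], artifact['length'])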