diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 5179462..f64c4a1 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,102 +1,105 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister from swh.lister.gnu.models import GNUModel from swh.lister.gnu.tree import GNUTree logger = logging.getLogger(__name__) class GNULister(SimpleLister): MODEL = GNUModel LISTER_NAME = 'gnu' instance = 'gnu' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz') def task_dict(self, origin_type, origin_url, **kwargs): """Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. This creates tasks with args and kwargs set, for example: .. code-block:: python args: kwargs: { 'url': 'https://ftp.gnu.org/gnu/3dldf/', 'artifacts': [{ 'url': 'https://...', - 'time': 1071002600, - 'length': 128}, + 'time': '2003-12-09T21:43:20+00:00', + 'length': 128, + 'version': '1.0.1', + 'filename': 'something-1.0.1.tar.gz', + }, ... - ]} + ] + } """ artifacts = self.gnu_tree.artifacts[origin_url] return utils.create_task_dict( 'load-%s' % origin_type, kwargs.get('policy', 'oneshot'), url=origin_url, artifacts=artifacts) def safely_issue_request(self, identifier): """Bypass the implementation. It's now the GNUTree which deals with querying the gnu mirror. As an implementation detail, we cannot change simply the base SimpleLister as other implementation still uses it. This shall be part of another refactoring pass. """ return None def list_packages(self, response): """List the actual gnu origins (package name) with their name, url and associated tarballs. 
Args: response: Unused Returns: List of packages name, url, last modification time .. code-block:: python - [ {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/', - 'time_modified': 1071002600}, + 'time_modified': '2003-12-09T20:43:20+00:00'}, {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/', - 'time_modified': 1480991830}, + 'time_modified': '2016-12-06T02:37:10+00:00'}, ... ] """ return list(self.gnu_tree.projects.values()) def get_model_from_repo(self, repo): """Transform from repository representation to model """ return { 'uid': repo['name'], 'name': repo['name'], 'full_name': repo['name'], 'html_url': repo['url'], 'origin_url': repo['url'], 'time_last_updated': repo['time_modified'], 'origin_type': 'tar', } diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py index bb78f85..38c47ae 100644 --- a/swh/lister/gnu/models.py +++ b/swh/lister/gnu/models.py @@ -1,17 +1,17 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from sqlalchemy import Column, DateTime, Integer, String +from sqlalchemy import Column, DateTime, String from ..core.models import ModelBase class GNUModel(ModelBase): """a GNU repository representation """ __tablename__ = 'gnu_repo' uid = Column(String, primary_key=True) time_last_updated = Column(DateTime) diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py index 0e6193b..ea25515 100644 --- a/swh/lister/gnu/tests/test_tree.py +++ b/swh/lister/gnu/tests/test_tree.py @@ -1,223 +1,226 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import pytest from os import path from swh.lister.gnu.tree import ( GNUTree, find_artifacts, check_filename_is_archive, 
load_raw_data, get_version, format_date ) def test_load_raw_data_from_query(requests_mock_datadir): actual_json = load_raw_data('https://ftp.gnu.org/tree.json.gz') assert actual_json is not None assert isinstance(actual_json, list) assert len(actual_json) == 2 def test_load_raw_data_from_query_failure(requests_mock_datadir): inexistant_url = 'https://ftp2.gnu.org/tree.unknown.gz' with pytest.raises(ValueError, match='Error during query'): load_raw_data(inexistant_url) def test_load_raw_data_from_file(datadir): filepath = path.join(datadir, 'https_ftp.gnu.org', 'tree.json.gz') actual_json = load_raw_data(filepath) assert actual_json is not None assert isinstance(actual_json, list) assert len(actual_json) == 2 def test_load_raw_data_from_file_failure(datadir): unknown_path = path.join(datadir, 'ftp.gnu.org2', 'tree.json.gz') with pytest.raises(FileNotFoundError): load_raw_data(unknown_path) def test_tree_json(requests_mock_datadir): tree_json = GNUTree('https://ftp.gnu.org/tree.json.gz') assert tree_json.projects['https://ftp.gnu.org/gnu/8sync/'] == { 'name': '8sync', 'time_modified': '2017-03-18T06:10:08+00:00', 'url': 'https://ftp.gnu.org/gnu/8sync/' } assert tree_json.projects['https://ftp.gnu.org/gnu/3dldf/'] == { 'name': '3dldf', 'time_modified': '2013-12-13T19:00:36+00:00', 'url': 'https://ftp.gnu.org/gnu/3dldf/' } assert tree_json.projects['https://ftp.gnu.org/gnu/a2ps/'] == { 'name': 'a2ps', 'time_modified': '2007-12-29T03:55:05+00:00', 'url': 'https://ftp.gnu.org/gnu/a2ps/' } assert tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] == { 'name': 'xshogi', 'time_modified': '2003-08-02T11:15:22+00:00', 'url': 'https://ftp.gnu.org/old-gnu/xshogi/' } assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [ { 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa 'length': 90106, 'time': '1997-03-10T08:00:00+00:00', 'filename': 'zlibc-0.9b.tar.gz', 'version': '0.9b', }, { 'url': 
'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa 'length': 89625, 'time': '1997-04-07T07:00:00+00:00', 'filename': 'zlibc-0.9e.tar.gz', 'version': '0.9e', } ] def test_tree_json_failures(requests_mock_datadir): url = 'https://unknown/tree.json.gz' tree_json = GNUTree(url) with pytest.raises(ValueError, match='Error during query to %s' % url): tree_json.artifacts['https://ftp.gnu.org/gnu/3dldf/'] with pytest.raises(ValueError, match='Error during query to %s' % url): tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] def test_find_artifacts_small_sample(datadir): expected_artifacts = [ { 'url': '/root/artanis/artanis-0.2.1.tar.bz2', 'time': '2017-05-19T14:59:39+00:00', 'length': 424081, 'version': '0.2.1', 'filename': 'artanis-0.2.1.tar.bz2', }, { 'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa 'time': '1998-06-21T09:55:00+00:00', 'length': 1514448, 'version': '4_0_0-src', 'filename': 'winboard-4_0_0-src.zip', }, { 'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa 'time': '1997-07-25T07:00:00+00:00', 'length': 450164, 'version': '3.6.2', 'filename': 'xboard-3.6.2.tar.gz', }, { 'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa 'time': '1998-06-21T09:55:00+00:00', 'length': 514951, 'version': '4.0.0', 'filename': 'xboard-4.0.0.tar.gz', }, ] file_structure = json.load(open(path.join(datadir, 'tree.min.json'))) actual_artifacts = find_artifacts(file_structure, '/root/') assert actual_artifacts == expected_artifacts def test_find_artifacts(datadir): file_structure = json.load(open(path.join(datadir, 'tree.json'))) actual_artifacts = find_artifacts(file_structure, '/root/') assert len(actual_artifacts) == 42 + 3 # tar + zip def test_check_filename_is_archive(): for ext in ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']: assert check_filename_is_archive(ext) is True for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']: assert check_filename_is_archive(ext) is False def test_get_version(): - """From url to branch 
name should yield something relevant + """Parsing version from url should yield some form of "sensible" version + + Given the dataset, it's not a simple task to extract correctly the version. """ for url, expected_branchname in [ ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'), ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'), ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'), ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'), ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'), ('https://ftp.org/gnu/aris-w32.zip', 'w32'), ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'), ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'), ('https://ftp.org/gnu/crypto-build-demo.tar.gz', 'crypto-build-demo'), ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz', 'clue+clio+xit.clisp'), ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz', 'clue+clio.for-pcl'), ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz', 'hppa2.0-hp-hpux10.20'), ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'), ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'), ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'), ('clisp-powerpc-unknown-linuxlibc6.tar.gz', 'powerpc-unknown-linuxlibc6'), ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'), ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'), ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'), ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz', 'sparc-sun-sunos4.1.3_U1'), ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz', '2.25.1-powerpc-apple-MacOSX'), ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz', '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'), ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz', '2.27-i686-unknown-Linux-2.2.19'), ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz', '2.28-i386-i386-freebsd-4.3-RELEASE'), ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz', '2.28-i686-unknown-cygwin_me-4.90-1.3.10'), ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz', '2.29-i386-i386-freebsd-4.6-STABLE'), 
('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz', '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'), ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip', '2.5.3-ansi-japi-xdr.20030701_mingw32'), ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'), ('sather-logo_images.tar.gz', 'sather-logo_images'), - ('sather-specification-000328.html.tar.gz', '000328.html') + ('sather-specification-000328.html.tar.gz', '000328.html'), + ('something-10.1.0.7z', '10.1.0'), ]: actual_branchname = get_version(url) assert actual_branchname == expected_branchname def test_format_date(): for timestamp, expected_isoformat_date in [ (1489817408, '2017-03-18T06:10:08+00:00'), (1386961236, '2013-12-13T19:00:36+00:00'), ('1198900505', '2007-12-29T03:55:05+00:00'), (1059822922, '2003-08-02T11:15:22+00:00'), ('1489817408', '2017-03-18T06:10:08+00:00'), ]: actual_date = format_date(timestamp) assert actual_date == expected_isoformat_date with pytest.raises(ValueError): format_date('') with pytest.raises(TypeError): format_date(None) diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py index 5a4991f..8ef6bd6 100644 --- a/swh/lister/gnu/tree.py +++ b/swh/lister/gnu/tree.py @@ -1,309 +1,308 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import gzip import json import logging import requests import re from datetime import datetime from os import path from pathlib import Path from pytz import utc -from typing import Any, Dict, List, Mapping, Tuple +from typing import Any, List, Mapping, Sequence, Tuple from urllib.parse import urlparse logger = logging.getLogger(__name__) class GNUTree: """Gnu Tree's representation """ def __init__(self, url: str): self.url = url # filepath or uri u = urlparse(url) self.base_url = '%s://%s' % (u.scheme, u.netloc) # Interesting top level directories 
self.top_level_directories = ['gnu', 'old-gnu'] # internal state - self._artifacts = {} # type: Dict - self._projects = {} # type: Dict + self._artifacts = {} # type: Mapping[str, Any] + self._projects = {} # type: Mapping[str, Any] @property - def projects(self) -> Dict: + def projects(self) -> Mapping[str, Any]: if not self._projects: self._projects, self._artifacts = self._load() return self._projects @property - def artifacts(self) -> Dict: + def artifacts(self) -> Mapping[str, Any]: if not self._artifacts: self._projects, self._artifacts = self._load() return self._artifacts - def _load(self) -> Tuple[Dict, Dict]: + def _load(self) -> Tuple[Mapping[str, Any], Mapping[str, Any]]: """Compute projects and artifacts per project Returns: Tuple of dict projects (key project url, value the associated information) and a dict artifacts (key project url, value the info_file list) """ projects = {} artifacts = {} raw_data = load_raw_data(self.url)[0] for directory in raw_data['contents']: if directory['name'] not in self.top_level_directories: continue infos = directory['contents'] for info in infos: if info['type'] == 'directory': package_url = '%s/%s/%s/' % ( self.base_url, directory['name'], info['name']) package_artifacts = find_artifacts( info['contents'], package_url) if package_artifacts != []: repo_details = { 'name': info['name'], 'url': package_url, 'time_modified': format_date(info['time']) } artifacts[package_url] = package_artifacts projects[package_url] = repo_details return projects, artifacts -def find_artifacts( - filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]: +def find_artifacts(filesystem: List[Mapping[str, Any]], + url: str) -> List[Mapping[str, Any]]: """Recursively list artifacts present in the folder and subfolders for a particular package url. Args: filesystem: File structure of the package root directory. This is a list of Dict representing either file or directory information as dict (keys: name, size, time, type). 
url: URL of the corresponding package Returns List of tarball urls and their associated metadata (time, length, etc...). For example: .. code-block:: python [ { 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', 'time': 1071002600, 'filename': '3DLDF-1.1.3.tar.gz', 'version': '1.1.3', 'length': 543 }, { 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', 'time': 1071078759, 'filename: '3DLDF-1.1.4.tar.gz', 'version': '1.1.4', 'length': 456 }, { 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz', 'time': 1074278633, 'filename': '3DLDF-1.1.5.tar.gz', 'version': '1.1.5' 'length': 251 }, ... ] """ - artifacts = [] + artifacts = [] # type: List[Mapping[str, Any]] for info_file in filesystem: filetype = info_file['type'] filename = info_file['name'] if filetype == 'file': if check_filename_is_archive(filename): uri = url + filename artifacts.append({ 'url': uri, 'filename': filename, 'time': format_date(info_file['time']), 'length': int(info_file['size']), 'version': get_version(filename), }) # It will recursively check for artifacts in all sub-folders elif filetype == 'directory': tarballs_in_dir = find_artifacts( info_file['contents'], url + filename + '/') artifacts.extend(tarballs_in_dir) return artifacts def check_filename_is_archive(filename: str) -> bool: """ Check for the extension of the file, if the file is of zip format of .tar.x format, where x could be anything, then returns true. Args: filename: name of the file for which the extensions is needs to be checked. 
Returns: Whether filename is an archive or not Example: >>> check_filename_is_archive('abc.zip') True >>> check_filename_is_archive('abc.tar.gz') True >>> check_filename_is_archive('bac.tar') True >>> check_filename_is_archive('abc.tar.gz.sig') False >>> check_filename_is_archive('foobar.tar.') False """ file_suffixes = Path(filename).suffixes - logger.debug('Path(%s).suffixed: %s' % (filename, file_suffixes)) if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'): return True elif len(file_suffixes) > 1: if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar': return True return False # to recognize existing naming pattern -extensions = [ +EXTENSIONS = [ 'zip', 'tar', 'gz', 'tgz', 'bz2', 'bzip2', 'lzma', 'lz', 'xz', - 'Z', + 'Z', '7z', ] -version_keywords = [ +VERSION_KEYWORDS = [ 'cygwin_me', 'w32', 'win32', 'nt', 'cygwin', 'mingw', 'latest', 'alpha', 'beta', 'release', 'stable', 'hppa', 'solaris', 'sunos', 'sun4u', 'sparc', 'sun', 'aix', 'ibm', 'rs6000', 'i386', 'i686', 'linux', 'redhat', 'linuxlibc', 'mips', 'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh', 'unknown', 'netbsd', 'freebsd', 'sgi', 'irix', ] # Match a filename into components. # # We use Debian's release number heuristic: A release number starts # with a digit, and is followed by alphanumeric characters or any of # ., +, :, ~ and - # # We hardcode a list of possible extensions, as this release number # scheme would match them too... We match on any combination of those. # # Greedy matching is done right to left (we only match the extension # greedily with +, software_name and release_number are matched lazily # with +? and *?). -pattern = r''' +PATTERN = r''' ^ (?: # We have a software name and a release number, separated with a # -, _ or dot. (?P.+?[-_.]) - (?P(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+) + (?P({vkeywords}|[0-9][0-9a-zA-Z_.+:~-]*?)+) | # We couldn't match a release number, put everything in the # software name. (?P.+?) 
) -(?P<extension>(?:\.(?:%(extensions)s))+) +(?P<extension>(?:\.(?:{extensions}))+)