diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ xmltodict iso8601 beautifulsoup4 +pytz diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -33,22 +33,27 @@ .. code-block:: python - args: ['https://ftp.gnu.org/gnu/3dldf/'] + args: kwargs: { - 'tarballs': [{ - 'archive': 'https://...', - 'time': 1071002600, - 'length': 128}, + 'url': 'https://ftp.gnu.org/gnu/3dldf/', + 'artifacts': [{ + 'url': 'https://...', + 'time': '2003-12-09T21:43:20+00:00', + 'length': 128, + 'version': '1.0.1', + 'filename': 'something-1.0.1.tar.gz', + }, ... - ]} + ] + } """ - tarballs = self.gnu_tree.artifacts[origin_url] + artifacts = self.gnu_tree.artifacts[origin_url] return utils.create_task_dict( 'load-%s' % origin_type, kwargs.get('policy', 'oneshot'), - origin_url, - tarballs=tarballs) + url=origin_url, + artifacts=artifacts) def safely_issue_request(self, identifier): """Bypass the implementation. It's now the GNUTree which deals with @@ -72,14 +77,13 @@ List of packages name, url, last modification time .. code-block:: python - [ {'name': '3dldf', 'url': 'https://ftp.gnu.org/gnu/3dldf/', - 'time_modified': 1071002600}, + 'time_modified': '2003-12-09T20:43:20+00:00'}, {'name': '8sync', 'url': 'https://ftp.gnu.org/gnu/8sync/', - 'time_modified': 1480991830}, + 'time_modified': '2016-12-06T02:37:10+00:00'}, ... 
] @@ -96,6 +100,6 @@ 'full_name': repo['name'], 'html_url': repo['url'], 'origin_url': repo['url'], - 'time_last_updated': int(repo['time_modified']), + 'time_last_updated': repo['time_modified'], 'origin_type': 'tar', } diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py --- a/swh/lister/gnu/models.py +++ b/swh/lister/gnu/models.py @@ -2,7 +2,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from sqlalchemy import Column, String, Integer +from sqlalchemy import Column, DateTime, String from ..core.models import ModelBase @@ -14,4 +14,4 @@ __tablename__ = 'gnu_repo' uid = Column(String, primary_key=True) - time_last_updated = Column(Integer) + time_last_updated = Column(DateTime) diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py --- a/swh/lister/gnu/tests/test_lister.py +++ b/swh/lister/gnu/tests/test_lister.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) -def test_lister_no_page_check_results(swh_listers, requests_mock_datadir): +def test_gnu_lister(swh_listers, requests_mock_datadir): lister = swh_listers['gnu'] lister.run() @@ -21,21 +21,31 @@ assert row['type'] == 'load-tar' # arguments check args = row['arguments']['args'] - assert len(args) == 1 + assert len(args) == 0 - url = args[0] + # kwargs + kwargs = row['arguments']['kwargs'] + assert set(kwargs.keys()) == {'url', 'artifacts'} + + url = kwargs['url'] assert url.startswith('https://ftp.gnu.org') url_suffix = url.split('https://ftp.gnu.org')[1] assert 'gnu' in url_suffix or 'old-gnu' in url_suffix - # kwargs - kwargs = row['arguments']['kwargs'] - assert list(kwargs.keys()) == ['tarballs'] + artifacts = kwargs['artifacts'] + # check the artifact's structure + artifact = artifacts[0] + assert set(artifact.keys()) == { + 'url', 'length', 'time', 'filename', 'version' + } - tarballs = kwargs['tarballs'] - # check the tarball's structure - tarball = tarballs[0] - assert 
set(tarball.keys()) == set(['archive', 'length', 'time']) + for artifact in artifacts: + logger.debug(artifact) + # 'time' is an isoformat string now + for key in ['url', 'time', 'filename', 'version']: + assert isinstance(artifact[key], str) + assert isinstance(artifact['length'], int) assert row['policy'] == 'oneshot' + assert row['priority'] is None diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py --- a/swh/lister/gnu/tests/test_tree.py +++ b/swh/lister/gnu/tests/test_tree.py @@ -9,7 +9,8 @@ from os import path from swh.lister.gnu.tree import ( - GNUTree, find_artifacts, check_filename_is_archive, load_raw_data + GNUTree, find_artifacts, check_filename_is_archive, load_raw_data, + get_version, format_date ) @@ -45,38 +46,42 @@ assert tree_json.projects['https://ftp.gnu.org/gnu/8sync/'] == { 'name': '8sync', - 'time_modified': '1489817408', + 'time_modified': '2017-03-18T06:10:08+00:00', 'url': 'https://ftp.gnu.org/gnu/8sync/' } assert tree_json.projects['https://ftp.gnu.org/gnu/3dldf/'] == { 'name': '3dldf', - 'time_modified': '1386961236', + 'time_modified': '2013-12-13T19:00:36+00:00', 'url': 'https://ftp.gnu.org/gnu/3dldf/' } assert tree_json.projects['https://ftp.gnu.org/gnu/a2ps/'] == { 'name': 'a2ps', - 'time_modified': '1198900505', + 'time_modified': '2007-12-29T03:55:05+00:00', 'url': 'https://ftp.gnu.org/gnu/a2ps/' } assert tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] == { 'name': 'xshogi', - 'time_modified': '1059822922', + 'time_modified': '2003-08-02T11:15:22+00:00', 'url': 'https://ftp.gnu.org/old-gnu/xshogi/' } assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [ { - 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa + 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa 'length': 90106, - 'time': 857980800 + 'time': '1997-03-10T08:00:00+00:00', + 'filename': 'zlibc-0.9b.tar.gz', + 'version': '0.9b', }, { - 'archive': 
'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa + 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa 'length': 89625, - 'time': 860396400 + 'time': '1997-04-07T07:00:00+00:00', + 'filename': 'zlibc-0.9e.tar.gz', + 'version': '0.9e', } ] @@ -93,38 +98,46 @@ def test_find_artifacts_small_sample(datadir): - expected_tarballs = [ + expected_artifacts = [ { - 'archive': '/root/artanis/artanis-0.2.1.tar.bz2', - 'time': 1495205979, + 'url': '/root/artanis/artanis-0.2.1.tar.bz2', + 'time': '2017-05-19T14:59:39+00:00', 'length': 424081, + 'version': '0.2.1', + 'filename': 'artanis-0.2.1.tar.bz2', }, { - 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa - 'time': 898422900, - 'length': 1514448 + 'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa + 'time': '1998-06-21T09:55:00+00:00', + 'length': 1514448, + 'version': '4_0_0-src', + 'filename': 'winboard-4_0_0-src.zip', }, { - 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa - 'time': 869814000, + 'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa + 'time': '1997-07-25T07:00:00+00:00', 'length': 450164, + 'version': '3.6.2', + 'filename': 'xboard-3.6.2.tar.gz', }, { - 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa - 'time': 898422900, + 'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa + 'time': '1998-06-21T09:55:00+00:00', 'length': 514951, + 'version': '4.0.0', + 'filename': 'xboard-4.0.0.tar.gz', }, ] file_structure = json.load(open(path.join(datadir, 'tree.min.json'))) - actual_tarballs = find_artifacts(file_structure, '/root/') - assert actual_tarballs == expected_tarballs + actual_artifacts = find_artifacts(file_structure, '/root/') + assert actual_artifacts == expected_artifacts def test_find_artifacts(datadir): file_structure = json.load(open(path.join(datadir, 'tree.json'))) - actual_tarballs = find_artifacts(file_structure, '/root/') - assert len(actual_tarballs) == 42 + 3 # tar + zip + actual_artifacts = find_artifacts(file_structure, '/root/') + 
assert len(actual_artifacts) == 42 + 3 # tar + zip def test_check_filename_is_archive(): @@ -133,3 +146,81 @@ for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']: assert check_filename_is_archive(ext) is False + + +def test_get_version(): + """Parsing version from url should yield some form of "sensible" version + + Given the dataset, it's not a simple task to extract correctly the version. + + """ + for url, expected_branchname in [ + ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'), + ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'), + ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'), + ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'), + ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'), + ('https://ftp.org/gnu/aris-w32.zip', 'w32'), + ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'), + ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'), + ('https://ftp.org/gnu/crypto-build-demo.tar.gz', + 'crypto-build-demo'), + ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz', + 'clue+clio+xit.clisp'), + ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz', + 'clue+clio.for-pcl'), + ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz', + 'hppa2.0-hp-hpux10.20'), + ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'), + ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'), + ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'), + ('clisp-powerpc-unknown-linuxlibc6.tar.gz', + 'powerpc-unknown-linuxlibc6'), + + ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'), + ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'), + ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'), + ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz', + 'sparc-sun-sunos4.1.3_U1'), + ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz', + '2.25.1-powerpc-apple-MacOSX'), + ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz', + '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'), + ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz', + 
'2.27-i686-unknown-Linux-2.2.19'), + ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz', + '2.28-i386-i386-freebsd-4.3-RELEASE'), + ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz', + '2.28-i686-unknown-cygwin_me-4.90-1.3.10'), + ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz', + '2.29-i386-i386-freebsd-4.6-STABLE'), + ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz', + '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'), + ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip', + '2.5.3-ansi-japi-xdr.20030701_mingw32'), + ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'), + ('sather-logo_images.tar.gz', 'sather-logo_images'), + ('sather-specification-000328.html.tar.gz', '000328.html'), + ('something-10.1.0.7z', '10.1.0'), + + ]: + actual_branchname = get_version(url) + + assert actual_branchname == expected_branchname + + +def test_format_date(): + for timestamp, expected_isoformat_date in [ + (1489817408, '2017-03-18T06:10:08+00:00'), + (1386961236, '2013-12-13T19:00:36+00:00'), + ('1198900505', '2007-12-29T03:55:05+00:00'), + (1059822922, '2003-08-02T11:15:22+00:00'), + ('1489817408', '2017-03-18T06:10:08+00:00'), + ]: + actual_date = format_date(timestamp) + assert actual_date == expected_isoformat_date + + with pytest.raises(ValueError): + format_date('') + with pytest.raises(TypeError): + format_date(None) diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py --- a/swh/lister/gnu/tree.py +++ b/swh/lister/gnu/tree.py @@ -7,37 +7,19 @@ import json import logging import requests +import re +from datetime import datetime +from os import path from pathlib import Path -from typing import Dict, Tuple, List +from pytz import utc +from typing import Any, Dict, List, Mapping, Tuple from urllib.parse import urlparse logger = logging.getLogger(__name__) -def load_raw_data(url: str) -> List[Dict]: - """Load the raw json from the tree.json.gz - - Args: - url: Tree.json.gz url or path - - Returns: - The raw json list - - """ - if url.startswith('http://') 
or url.startswith('https://'): - response = requests.get(url, allow_redirects=True) - if not response.ok: - raise ValueError('Error during query to %s' % url) - raw = gzip.decompress(response.content) - else: - with gzip.open(url, 'r') as f: - raw = f.read() - raw_data = json.loads(raw.decode('utf-8')) - return raw_data - - class GNUTree: """Gnu Tree's representation @@ -91,7 +73,7 @@ repo_details = { 'name': info['name'], 'url': package_url, - 'time_modified': info['time'], + 'time_modified': format_date(info['time']) } artifacts[package_url] = package_artifacts projects[package_url] = repo_details @@ -99,7 +81,8 @@ return projects, artifacts -def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]: +def find_artifacts( + filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]: """Recursively list artifacts present in the folder and subfolders for a particular package url. @@ -111,21 +94,33 @@ url: URL of the corresponding package Returns - List of tarball urls and their associated metadata (time, length). - For example: + List of tarball urls and their associated metadata (time, length, + etc...). For example: .. code-block:: python [ - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', - 'time': 1071002600, - 'length': 543}, - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', - 'time': 1071078759, - 'length': 456}, - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz', - 'time': 1074278633, - 'length': 251}, + { + 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', + 'time': 1071002600, + 'filename': '3DLDF-1.1.3.tar.gz', + 'version': '1.1.3', + 'length': 543 + }, + { + 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', + 'time': 1071078759, + 'filename': '3DLDF-1.1.4.tar.gz', + 'version': '1.1.4', + 'length': 456 + }, + { + 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz', + 'time': 1074278633, + 'filename': '3DLDF-1.1.5.tar.gz', + 'version': '1.1.5', + 'length': 251 + }, ... 
] @@ -136,10 +131,13 @@ filename = info_file['name'] if filetype == 'file': if check_filename_is_archive(filename): + uri = url + filename artifacts.append({ - 'archive': url + filename, - 'time': int(info_file['time']), + 'url': uri, + 'filename': filename, + 'time': format_date(info_file['time']), 'length': int(info_file['size']), + 'version': get_version(filename), }) # It will recursively check for artifacts in all sub-folders elif filetype == 'directory': @@ -178,10 +176,133 @@ """ file_suffixes = Path(filename).suffixes - logger.debug('Path(%s).suffixed: %s' % (filename, file_suffixes)) if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'): return True elif len(file_suffixes) > 1: if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar': return True return False + + +# to recognize existing naming pattern +EXTENSIONS = [ + 'zip', + 'tar', + 'gz', 'tgz', + 'bz2', 'bzip2', + 'lzma', 'lz', + 'xz', + 'Z', '7z', +] + +VERSION_KEYWORDS = [ + 'cygwin_me', + 'w32', 'win32', 'nt', 'cygwin', 'mingw', + 'latest', 'alpha', 'beta', + 'release', 'stable', + 'hppa', + 'solaris', 'sunos', 'sun4u', 'sparc', 'sun', + 'aix', 'ibm', 'rs6000', + 'i386', 'i686', + 'linux', 'redhat', 'linuxlibc', + 'mips', + 'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh', + 'unknown', + 'netbsd', 'freebsd', + 'sgi', 'irix', +] + +# Match a filename into components. +# +# We use Debian's release number heuristic: A release number starts +# with a digit, and is followed by alphanumeric characters or any of +# ., +, :, ~ and - +# +# We hardcode a list of possible extensions, as this release number +# scheme would match them too... We match on any combination of those. +# +# Greedy matching is done right to left (we only match the extension +# greedily with +, software_name and release_number are matched lazily +# with +? and *?). + +PATTERN = r''' +^ +(?: + # We have a software name and a release number, separated with a + # -, _ or dot. 
+    (?P<software_name1>.+?[-_.]) + (?P<release_number>({vkeywords}|[0-9][0-9a-zA-Z_.+:~-]*?)+) +| + # We couldn't match a release number, put everything in the + # software name. + (?P<software_name2>.+?) +) +(?P<extension>(?:\.(?:{extensions}))+) +$ +'''.format( + extensions='|'.join(EXTENSIONS), + vkeywords='|'.join('%s[-]?' % k for k in VERSION_KEYWORDS), +) + + +def get_version(uri: str) -> str: + """Extract branch name from tarball uri + + Args: + uri (str): Tarball URI + + Returns: + Version detected + + Example: + For uri = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz + + >>> get_version(uri) + '0.2.0' + + For uri = 8sync-0.3.0.tar.gz + + >>> get_version(uri) + '0.3.0' + + """ + filename = path.split(uri)[-1] + m = re.match(PATTERN, filename, + flags=re.VERBOSE | re.IGNORECASE) + if m: + d = m.groupdict() + if d['software_name1'] and d['release_number']: + return d['release_number'] + if d['software_name2']: + return d['software_name2'] + + return '' + + +def load_raw_data(url: str) -> List[Dict]: + """Load the raw json from the tree.json.gz + + Args: + url: Tree.json.gz url or path + + Returns: + The raw json list + + """ + if url.startswith('http://') or url.startswith('https://'): + response = requests.get(url, allow_redirects=True) + if not response.ok: + raise ValueError('Error during query to %s' % url) + raw = gzip.decompress(response.content) + else: + with gzip.open(url, 'r') as f: + raw = f.read() + raw_data = json.loads(raw.decode('utf-8')) + return raw_data + + +def format_date(timestamp: str) -> str: + """Format a string timestamp to an isoformat string + + """ + return datetime.fromtimestamp(int(timestamp), tz=utc).isoformat()