diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -43,12 +43,12 @@ ]} """ - tarballs = self.gnu_tree.artifacts[origin_url] + artifacts = self.gnu_tree.artifacts[origin_url] return utils.create_task_dict( 'load-%s' % origin_type, kwargs.get('policy', 'oneshot'), - origin_url, - tarballs=tarballs) + url=origin_url, + artifacts=artifacts) def safely_issue_request(self, identifier): """Bypass the implementation. It's now the GNUTree which deals with diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py --- a/swh/lister/gnu/tests/test_lister.py +++ b/swh/lister/gnu/tests/test_lister.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) -def test_lister_no_page_check_results(swh_listers, requests_mock_datadir): +def test_gnu_lister(swh_listers, requests_mock_datadir): lister = swh_listers['gnu'] lister.run() @@ -21,21 +21,23 @@ assert row['type'] == 'load-tar' # arguments check args = row['arguments']['args'] - assert len(args) == 1 + assert len(args) == 0 - url = args[0] + # kwargs + kwargs = row['arguments']['kwargs'] + assert set(kwargs.keys()) == {'url', 'artifacts'} + + url = kwargs['url'] assert url.startswith('https://ftp.gnu.org') url_suffix = url.split('https://ftp.gnu.org')[1] assert 'gnu' in url_suffix or 'old-gnu' in url_suffix - # kwargs - kwargs = row['arguments']['kwargs'] - assert list(kwargs.keys()) == ['tarballs'] - - tarballs = kwargs['tarballs'] - # check the tarball's structure - tarball = tarballs[0] - assert set(tarball.keys()) == set(['archive', 'length', 'time']) + artifacts = kwargs['artifacts'] + # check the artifact's structure + artifact = artifacts[0] + assert set(artifact.keys()) == { + 'url', 'length', 'time', 'filename', 'version' + } assert row['policy'] == 'oneshot' diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py --- a/swh/lister/gnu/tests/test_tree.py +++ 
b/swh/lister/gnu/tests/test_tree.py @@ -9,7 +9,8 @@ from os import path from swh.lister.gnu.tree import ( - GNUTree, find_artifacts, check_filename_is_archive, load_raw_data + GNUTree, find_artifacts, check_filename_is_archive, load_raw_data, + get_version ) @@ -69,14 +70,18 @@ assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [ { - 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa + 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa 'length': 90106, - 'time': 857980800 + 'time': 857980800, + 'filename': 'zlibc-0.9b.tar.gz', + 'version': '0.9b', }, { - 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa + 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa 'length': 89625, - 'time': 860396400 + 'time': 860396400, + 'filename': 'zlibc-0.9e.tar.gz', + 'version': '0.9e', } ] @@ -93,38 +98,46 @@ def test_find_artifacts_small_sample(datadir): - expected_tarballs = [ + expected_artifacts = [ { - 'archive': '/root/artanis/artanis-0.2.1.tar.bz2', + 'url': '/root/artanis/artanis-0.2.1.tar.bz2', 'time': 1495205979, 'length': 424081, + 'version': '0.2.1', + 'filename': 'artanis-0.2.1.tar.bz2', }, { - 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa + 'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa 'time': 898422900, - 'length': 1514448 + 'length': 1514448, + 'version': '4_0_0-src', + 'filename': 'winboard-4_0_0-src.zip', }, { - 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa + 'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa 'time': 869814000, 'length': 450164, + 'version': '3.6.2', + 'filename': 'xboard-3.6.2.tar.gz', }, { - 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa + 'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa 'time': 898422900, 'length': 514951, + 'version': '4.0.0', + 'filename': 'xboard-4.0.0.tar.gz', }, ] file_structure = json.load(open(path.join(datadir, 'tree.min.json'))) - actual_tarballs = 
find_artifacts(file_structure, '/root/') - assert actual_tarballs == expected_tarballs + actual_artifacts = find_artifacts(file_structure, '/root/') + assert actual_artifacts == expected_artifacts def test_find_artifacts(datadir): file_structure = json.load(open(path.join(datadir, 'tree.json'))) - actual_tarballs = find_artifacts(file_structure, '/root/') - assert len(actual_tarballs) == 42 + 3 # tar + zip + actual_artifacts = find_artifacts(file_structure, '/root/') + assert len(actual_artifacts) == 42 + 3 # tar + zip def test_check_filename_is_archive(): @@ -133,3 +146,61 @@ for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']: assert check_filename_is_archive(ext) is False + + +def test_get_version(): + """From url to branch name should yield something relevant + + """ + for url, expected_branchname in [ + ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'), + ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'), + ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'), + ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'), + ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'), + ('https://ftp.org/gnu/aris-w32.zip', 'w32'), + ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'), + ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'), + ('https://ftp.org/gnu/crypto-build-demo.tar.gz', + 'crypto-build-demo'), + ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz', + 'clue+clio+xit.clisp'), + ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz', + 'clue+clio.for-pcl'), + ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz', + 'hppa2.0-hp-hpux10.20'), + ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'), + ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'), + ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'), + ('clisp-powerpc-unknown-linuxlibc6.tar.gz', + 'powerpc-unknown-linuxlibc6'), + + ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'), + ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'), + 
import re
from os import path

# to recognize existing naming pattern
extensions = [
    'zip',
    'tar',
    'gz', 'tgz',
    'bz2', 'bzip2',
    'lzma', 'lz',
    'xz',
    'Z',
]

# Non-numeric tokens accepted as (part of) a release number; each one may be
# followed by a single '-' (see how 'vkeywords' is built below).
version_keywords = [
    'cygwin_me',
    'w32', 'win32', 'nt', 'cygwin', 'mingw',
    'latest', 'alpha', 'beta',
    'release', 'stable',
    'hppa',
    'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
    'aix', 'ibm', 'rs6000',
    'i386', 'i686',
    'linux', 'redhat', 'linuxlibc',
    'mips',
    'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
    'unknown',
    'netbsd', 'freebsd',
    'sgi', 'irix',
]

# Match a filename into components.
#
# We use Debian's release number heuristic: A release number starts
# with a digit, and is followed by alphanumeric characters or any of
# ., +, :, ~ and -
#
# We hardcode a list of possible extensions, as this release number
# scheme would match them too... We match on any combination of those.
#
# Greedy matching is done right to left (we only match the extension
# greedily with +, software_name and release_number are matched lazily
# with +? and *?).
#
# The group names below are part of the contract with get_version(), which
# reads 'software_name1', 'release_number' and 'software_name2' from the
# match's groupdict().

pattern = r'''
^
(?:
    # We have a software name and a release number, separated with a
    # -, _ or dot.
    (?P<software_name1>.+?[-_.])
    (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
|
    # We couldn't match a release number, put everything in the
    # software name.
    (?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%(extensions)s))+)
$
''' % {
    'extensions': '|'.join(extensions),
    'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
}

# Compiled once at import time so get_version() does not hand the verbose
# pattern string to the re module on every call.
_version_re = re.compile(pattern, flags=re.VERBOSE | re.IGNORECASE)


def get_version(uri: str) -> str:
    """Extract the version from a tarball uri (or bare filename)

    Only the last path component of ``uri`` is matched against ``pattern``.

    Args:
        uri (str): Tarball URI or filename

    Returns:
        The release number when the filename splits into a software name
        and a release number; otherwise the filename stripped of its
        archive extension(s); the empty string when the filename does not
        end with one of the known ``extensions``.

    Example:
        >>> get_version('https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz')
        '0.2.0'

        >>> get_version('8sync-0.3.0.tar.gz')
        '0.3.0'

        >>> get_version('https://sthg.org/gnu/sthg.tar.gz')
        'sthg'

    """
    filename = path.split(uri)[-1]
    m = _version_re.match(filename)
    if m is None:
        return ''

    d = m.groupdict()
    if d['software_name1'] and d['release_number']:
        return d['release_number']
    if d['software_name2']:
        # No release number could be isolated: fall back to the name itself.
        return d['software_name2']

    return ''
code-block:: python [ - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', - 'time': 1071002600, - 'length': 543}, - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', - 'time': 1071078759, - 'length': 456}, - {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz', - 'time': 1074278633, - 'length': 251}, + { + 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', + 'time': 1071002600, + 'filename': '3DLDF-1.1.3.tar.gz', + 'version': '1.1.3', + 'length': 543 + }, + { + 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', + 'time': 1071078759, + 'filename': '3DLDF-1.1.4.tar.gz', + 'version': '1.1.4', + 'length': 456 + }, + { + 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz', + 'time': 1074278633, + 'filename': '3DLDF-1.1.5.tar.gz', + 'version': '1.1.5', + 'length': 251 + }, ... ] @@ -136,10 +246,13 @@ filename = info_file['name'] if filetype == 'file': if check_filename_is_archive(filename): + uri = url + filename artifacts.append({ - 'archive': url + filename, + 'url': uri, + 'filename': filename, 'time': int(info_file['time']), 'length': int(info_file['size']), + 'version': get_version(filename), }) # It will recursively check for artifacts in all sub-folders elif filetype == 'directory':