diff --git a/swh/loader/package/gnu.py b/swh/loader/package/gnu.py
deleted file mode 100644
--- a/swh/loader/package/gnu.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Copyright (C) 2019 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import logging
-import re
-
-from os import path
-
-from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
-
-from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import release_name
-
-from swh.model.identifiers import normalize_timestamp
-
-
-logger = logging.getLogger(__name__)
-
-
-# to recognize existing naming pattern
-extensions = [
-    'zip',
-    'tar',
-    'gz', 'tgz',
-    'bz2', 'bzip2',
-    'lzma', 'lz',
-    'xz',
-    'Z',
-]
-
-version_keywords = [
-    'cygwin_me',
-    'w32', 'win32', 'nt', 'cygwin', 'mingw',
-    'latest', 'alpha', 'beta',
-    'release', 'stable',
-    'hppa',
-    'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
-    'aix', 'ibm', 'rs6000',
-    'i386', 'i686',
-    'linux', 'redhat', 'linuxlibc',
-    'mips',
-    'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
-    'unknown',
-    'netbsd', 'freebsd',
-    'sgi', 'irix',
-]
-
-# Match a filename into components.
-#
-# We use Debian's release number heuristic: A release number starts
-# with a digit, and is followed by alphanumeric characters or any of
-# ., +, :, ~ and -
-#
-# We hardcode a list of possible extensions, as this release number
-# scheme would match them too... We match on any combination of those.
-#
-# Greedy matching is done right to left (we only match the extension
-# greedily with +, software_name and release_number are matched lazily
-# with +? and *?).
-
-pattern = r'''
-^
-(?:
-    # We have a software name and a release number, separated with a
-    # -, _ or dot.
-    (?P<software_name1>.+?[-_.])
-    (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
-|
-    # We couldn't match a release number, put everything in the
-    # software name.
-    (?P<software_name2>.+?)
-)
-(?P<extension>(?:\.(?:%(extensions)s))+)
-$
-''' % {
-    'extensions': '|'.join(extensions),
-    'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
-}
-
-
-def get_version(url: str) -> str:
-    """Extract branch name from tarball url
-
-    Args:
-        url (str): Tarball URL
-
-    Returns:
-        byte: Branch name
-
-    Example:
-        For url = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz
-
-        >>> get_version(url)
-        '0.2.0'
-
-    """
-    filename = path.split(url)[-1]
-    m = re.match(pattern, filename,
-                 flags=re.VERBOSE | re.IGNORECASE)
-    if m:
-        d = m.groupdict()
-        if d['software_name1'] and d['release_number']:
-            return d['release_number']
-        if d['software_name2']:
-            return d['software_name2']
-
-    return ''
-
-
-class GNULoader(PackageLoader):
-    visit_type = 'gnu'
-    SWH_PERSON = {
-        'name': b'Software Heritage',
-        'fullname': b'Software Heritage',
-        'email': b'robot@softwareheritage.org'
-    }
-    REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
-
-    def __init__(self, package_url: str, tarballs: Sequence):
-        """Loader constructor.
-
-        For now, this is the lister's task output.
-
-        Args:
-            package_url: Origin url
-
-            tarballs: List of dict with keys `date` (date) and `archive` (str)
-            the url to retrieve one versioned archive
-
-        """
-        super().__init__(url=package_url)
-        self.tarballs = list(sorted(tarballs, key=lambda v: v['time']))
-
-    def get_versions(self) -> Sequence[str]:
-        versions = []
-        for archive in self.tarballs:
-            v = get_version(archive['archive'])
-            if v:
-                versions.append(v)
-        return versions
-
-    def get_default_version(self) -> str:
-        # It's the most recent, so for this loader, it's the last one
-        return get_version(self.tarballs[-1]['archive'])
-
-    def get_package_info(self, version: str) -> Generator[
-            Tuple[str, Mapping[str, Any]], None, None]:
-        for a_metadata in self.tarballs:
-            url = a_metadata['archive']
-            package_version = get_version(url)
-            if version == package_version:
-                p_info = {
-                    'url': url,
-                    'filename': path.split(url)[-1],
-                    'raw': a_metadata,
-                }
-                # FIXME: this code assumes we have only 1 artifact per
-                # versioned package
-                yield release_name(version), p_info
-
-    def resolve_revision_from(
-            self, known_artifacts: Dict, artifact_metadata: Dict) \
-            -> Optional[bytes]:
-        def pk(d):
-            return [d.get(k) for k in ['time', 'archive', 'length']]
-
-        artifact_pk = pk(artifact_metadata)
-        for rev_id, known_artifact in known_artifacts.items():
-            logging.debug('known_artifact: %s', known_artifact)
-            known_pk = pk(known_artifact['extrinsic']['raw'])
-            if artifact_pk == known_pk:
-                return rev_id
-
-    def build_revision(
-            self, a_metadata: Mapping[str, Any],
-            uncompressed_path: str) -> Dict:
-        normalized_date = normalize_timestamp(int(a_metadata['time']))
-        return {
-            'type': 'tar',
-            'message': self.REVISION_MESSAGE,
-            'date': normalized_date,
-            'author': self.SWH_PERSON,
-            'committer': self.SWH_PERSON,
-            'committer_date': normalized_date,
-            'parents': [],
-            'metadata': {
-                'intrinsic': {},
-                'extrinsic': {
-                    'provider': self.url,
-                    'when': self.visit_date.isoformat(),
-                    'raw': a_metadata,
-                },
-            },
-        }
diff --git a/swh/loader/package/tar.py b/swh/loader/package/tar.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tar.py
@@ -0,0 +1,132 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import iso8601
+import logging
+
+from os import path
+from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
+
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import release_name
+from swh.model.identifiers import normalize_timestamp
+
+
+logger = logging.getLogger(__name__)
+SWH_PERSON = {
+    'name': b'Software Heritage',
+    'fullname': b'Software Heritage',
+    'email': b'robot@softwareheritage.org'
+}
+REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
+
+
+class ArchiveLoader(PackageLoader):
+    visit_type = 'tar'
+
+    def __init__(self, url: str, artifacts: Sequence[Mapping[str, Any]],
+                 pk_artifact_keys: Optional[Sequence[str]] = None):
+        """Loader constructor.
+
+        For now, this is the lister's task output.
+
+        Args:
+            url: Origin url
+            artifacts: List of artifact information with keys:
+
+               **time**: last modification time (isoformat date or timestamp)
+               **url**: the artifact url to retrieve
+               **filename**: the artifact's filename
+               **version**: the artifact's version
+               **length**: the artifact's size (in bytes)
+
+            pk_artifact_keys: Optional list of keys forming a composite
+                primary key for an artifact
+
+        """
+        super().__init__(url=url)
+        self.artifacts = artifacts  # assume order is enforced in the lister
+        if not pk_artifact_keys:
+            # default keys for gnu
+            pk_artifact_keys = ['time', 'url', 'length', 'version']
+        self.pk_artifact_keys = pk_artifact_keys
+
+    def get_versions(self) -> Sequence[str]:
+        versions = []
+        for archive in self.artifacts:
+            v = archive.get('version')
+            if v:
+                versions.append(v)
+        return versions
+
+    def get_default_version(self) -> str:
+        # It's the most recent, so for this loader, it's the last one
+        return self.artifacts[-1]['version']
+
+    def get_package_info(self, version: str) -> Generator[
+            Tuple[str, Mapping[str, Any]], None, None]:
+        for a_metadata in self.artifacts:
+            url = a_metadata['url']
+            package_version = a_metadata['version']
+            if version == package_version:
+                filename = a_metadata.get('filename')
+                p_info = {
+                    'url': url,
+                    'filename': filename if filename else path.split(url)[-1],
+                    'raw': a_metadata,
+                }
+                # FIXME: this code assumes we have only 1 artifact per
+                # versioned package
+                yield release_name(version), p_info
+
+    def resolve_revision_from(
+            self, known_artifacts: Dict, artifact_metadata: Dict) \
+            -> Optional[bytes]:
+        artifact_pk = pk(artifact_metadata, pk_keys=self.pk_artifact_keys)
+        for rev_id, known_artifact in known_artifacts.items():
+            logging.debug('known_artifact: %s', known_artifact)
+            reference_artifact = known_artifact['extrinsic']['raw']
+            known_pk = pk(reference_artifact, pk_keys=self.pk_artifact_keys)
+            if artifact_pk == known_pk:
+                return rev_id
+
+    def build_revision(self, a_metadata: Mapping[str, Any],
+                       uncompressed_path: str) -> Dict:
+        time = a_metadata['time']  # assume it's a timestamp
+        if isinstance(time, str):  # otherwise, assume it's a parsable date
+            time = iso8601.parse_date(time)
+        normalized_time = normalize_timestamp(time)
+        return {
+            'type': 'tar',
+            'message': REVISION_MESSAGE,
+            'date': normalized_time,
+            'author': SWH_PERSON,
+            'committer': SWH_PERSON,
+            'committer_date': normalized_time,
+            'parents': [],
+            'metadata': {
+                'intrinsic': {},
+                'extrinsic': {
+                    'provider': self.url,
+                    'when': self.visit_date.isoformat(),
+                    'raw': a_metadata,
+                },
+            },
+        }
+
+
+def pk(d: Mapping[str, Any], pk_keys: Sequence[str]) -> Sequence[Any]:
+    """Compute the primary key for a dict using the pk_keys as primary key
+       composite.
+
+    Args:
+        d: A dict entry to compute the primary key on
+        pk_keys: Sequence of keys to use as primary key
+
+    Returns:
+        The primary key for that dict entry
+
+    """
+    return [d.get(k) for k in pk_keys]
diff --git a/swh/loader/package/tasks.py b/swh/loader/package/tasks.py
--- a/swh/loader/package/tasks.py
+++ b/swh/loader/package/tasks.py
@@ -4,9 +4,9 @@
 # See top-level LICENSE file for more information
 
 from celery import current_app as app
 
-from swh.loader.package.gnu import GNULoader
+from swh.loader.package.tar import ArchiveLoader
 
 
-@app.task(name=__name__ + '.LoadGNU')
-def load_gnu(name, origin_url=None, tarballs=None):
-    return GNULoader(origin_url, tarballs).load()
+@app.task(name=__name__ + '.LoadTar')
+def load_tar(url=None, artifacts=None, pk_keys=None):
+    return ArchiveLoader(url, artifacts, pk_artifact_keys=pk_keys).load()
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_tar.py
rename from swh/loader/package/tests/test_gnu.py
rename to swh/loader/package/tests/test_tar.py
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_tar.py
@@ -3,74 +3,24 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import os
-import re
-
 from swh.model.hashutil import hash_to_bytes
 
-from swh.loader.package.gnu import GNULoader, get_version
+from swh.loader.package.tar import ArchiveLoader, pk
 
 from swh.loader.package.tests.common import (
     check_snapshot, check_metadata_paths, get_stats
 )
 
 
-def test_get_version():
-    """From url to branch name should yield something relevant
-
-    """
-    for url, expected_branchname in [
-        ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
-        ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
-        ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
-        ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
-        ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
-        ('https://ftp.org/gnu/aris-w32.zip', 'w32'),
-        ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
-        ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
-        ('https://ftp.org/gnu/crypto-build-demo.tar.gz',
-         'crypto-build-demo'),
-        ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
-         'clue+clio+xit.clisp'),
-        ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
-         'clue+clio.for-pcl'),
-        ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
-         'hppa2.0-hp-hpux10.20'),
-        ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
-        ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
-        ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
-        ('clisp-powerpc-unknown-linuxlibc6.tar.gz',
-         'powerpc-unknown-linuxlibc6'),
-
-        ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
-        ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
-        ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
-        ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
-         'sparc-sun-sunos4.1.3_U1'),
-        ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
-         '2.25.1-powerpc-apple-MacOSX'),
-        ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
-         '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
-        ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
-         '2.27-i686-unknown-Linux-2.2.19'),
-        ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
-         '2.28-i386-i386-freebsd-4.3-RELEASE'),
-        ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
-         '2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
-        ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
-         '2.29-i386-i386-freebsd-4.6-STABLE'),
-        ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
-         '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
-        ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
-         '2.5.3-ansi-japi-xdr.20030701_mingw32'),
-        ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
-        ('sather-logo_images.tar.gz', 'sather-logo_images'),
-        ('sather-specification-000328.html.tar.gz', '000328.html')
-
-    ]:
-        actual_branchname = get_version(url)
-
-        assert actual_branchname == expected_branchname
-
+URL = 'https://ftp.gnu.org/gnu/8sync/'
+GNU_ARTIFACTS = [
+    {
+        'time': 944729610,
+        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+        'length': 221837,
+        'filename': '8sync-0.1.0.tar.gz',
+        'version': '0.1.0',
+    }
+]
 _expected_new_contents_first_visit = [
     'e9258d81faf5881a2f96a77ba609396f82cb97ad',
@@ -134,16 +84,18 @@
 _expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5'  # noqa
 
 
-def test_visit_with_no_artifact_found(swh_config, requests_mock):
-    package_url = 'https://ftp.gnu.org/gnu/8sync/'
-    tarballs = [{
-        'time': '944729610',
-        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
-        'length': 221837,
-    }]
-
-    loader = GNULoader(package_url, tarballs)
-    requests_mock.get(re.compile('https://'), status_code=404)
+def visit_with_no_artifact_found(swh_config, requests_mock_datadir):
+    url = URL
+    unknown_artifact_url = 'https://ftp.g.o/unknown/8sync-0.1.0.tar.gz'
+    loader = ArchiveLoader(url, artifacts=[
+        {
+            'time': 944729610,
+            'url': unknown_artifact_url,  # unknown artifact
+            'length': 221837,
+            'filename': '8sync-0.1.0.tar.gz',
+            'version': '0.1.0',
+        }
+    ])
 
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'uneventful'
@@ -161,19 +113,12 @@
         'snapshot': 1,
     } == stats
 
-    origin_visit = next(loader.storage.origin_visit_get(package_url))
+    origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'partial'
 
 
 def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
-    package_url = 'https://ftp.gnu.org/gnu/8sync/'
-    tarballs = [{
-        'time': '944729610',
-        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
-        'length': 221837,
-    }]
-
-    loader = GNULoader(package_url, tarballs)
+    loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
 
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
@@ -205,15 +150,7 @@
     """With no prior visit, load a gnu project ends up with 1 snapshot
 
     """
-    assert 'SWH_CONFIG_FILENAME' in os.environ  # cf. tox.ini
-    package_url = 'https://ftp.gnu.org/gnu/8sync/'
-    tarballs = [{
-        'time': 944729610,
-        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
-        'length': 221837,
-    }]
-
-    loader = GNULoader(package_url, tarballs)
+    loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
 
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
@@ -253,15 +190,9 @@
     """With no prior visit, load a gnu project ends up with 1 snapshot
 
    """
-    assert 'SWH_CONFIG_FILENAME' in os.environ  # cf. tox.ini
-    url = 'https://ftp.gnu.org/gnu/8sync/'
-    tarballs = [{
-        'time': 944729610,
-        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
-        'length': 221837,
-    }]
+    url = URL
+    loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS)
 
-    loader = GNULoader(url, tarballs)
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
@@ -283,15 +214,10 @@
     """With no prior visit, load a gnu project ends up with 1 snapshot
 
     """
-    assert 'SWH_CONFIG_FILENAME' in os.environ  # cf. tox.ini
-    url = 'https://ftp.gnu.org/gnu/8sync/'
-    tarball1 = {
-        'time': 944729610,
-        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
-        'length': 221837,
-    }
+    url = URL
+    artifact1 = GNU_ARTIFACTS[0]
+    loader = ArchiveLoader(url, [artifact1])
 
-    loader = GNULoader(url, [tarball1])
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
@@ -316,12 +242,15 @@
     ]
     assert len(urls) == 1
 
-    tarball2 = {
+    artifact2 = {
         'time': 1480991830,
-        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
         'length': 238466,
+        'filename': '8sync-0.2.0.tar.gz',
+        'version': '0.2.0',
     }
-    loader2 = GNULoader(url, [tarball1, tarball2])
+
+    loader2 = ArchiveLoader(url, [artifact1, artifact2])
     # implementation detail: share the storage in between visits
     loader2.storage = loader.storage
     stats2 = get_stats(loader2.storage)
@@ -352,3 +281,63 @@
     ]
     # 1 artifact (2nd time no modification) + 1 new artifact
     assert len(urls) == 2
+
+
+def test_pk():
+    """Computing a primary key should return the right pk
+
+    """
+    data = {
+        'a': 1,
+        'b': 2,
+        'length': 221837,
+        'filename': '8sync-0.1.0.tar.gz',
+        'version': '0.1.0',
+    }
+
+    for pk_keys, expected_pk in [
+        (['a', 'b'], [1, 2]),
+        ([], []),
+        (['a', 'key-that-does-not-exist'], [1, None])
+    ]:
+        actual_pk = pk(data, pk_keys=pk_keys)
+        assert actual_pk == expected_pk
+
+
+def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir):
+    """Loading a project archive (not gnu) ends up with 1 snapshot
+
+    """
+    url = 'https://something.else.org/8sync/'
+    artifacts = [  # this is not a gnu artifact
+        {
+            'time': '1999-12-09T09:53:30+00:00',  # it's also not a timestamp
+            'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
+            # keep a gnu artifact reference to avoid adding other test files
+            'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+            'length': 238466,
+            'filename': '8sync-0.2.0.tar.gz',
+            'version': '0.2.0',
+        }
+    ]
+
+    # Here the loader defines the pk_keys to use for existence in the snapshot
+    # (not the default gnu keys: 'time', 'url', 'length' and 'version')
+    loader = ArchiveLoader(
+        url, artifacts=artifacts, pk_artifact_keys=['sha256', 'length', 'url'])
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit['status'] == 'full'
+
+    actual_load_status2 = loader.load()
+    assert actual_load_status2['status'] == 'uneventful'
+    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit2['status'] == 'full'
+
+    urls = [
+        m.url for m in requests_mock_datadir.request_history
+        if m.url.startswith('https://ftp.gnu.org')
+    ]
+    assert len(urls) == 1
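
Usage sketch (not part of the patch): a minimal, hypothetical example of driving the new ArchiveLoader directly. The origin URL and artifact values mirror the 8sync fixture (GNU_ARTIFACTS) used in the tests above; actually running it assumes a loader configuration (SWH_CONFIG_FILENAME pointing at a working storage), as in the test setup.

    from swh.loader.package.tar import ArchiveLoader

    # Illustrative values only; they mirror the GNU_ARTIFACTS fixture above.
    loader = ArchiveLoader(
        url='https://ftp.gnu.org/gnu/8sync/',
        artifacts=[{
            'time': 944729610,  # last modification time (timestamp or isoformat)
            'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
            'filename': '8sync-0.1.0.tar.gz',
            'version': '0.1.0',  # used to build the release branch name
            'length': 221837,
        }],
        # Omitting pk_artifact_keys falls back to the gnu default:
        # ['time', 'url', 'length', 'version']
    )
    status = loader.load()  # e.g. {'status': 'eventful', ...} on a first visit

Passing pk_artifact_keys explicitly (for instance ['sha256', 'length', 'url'], as in test_2_visits_without_change_not_gnu) lets non-gnu listers choose which artifact fields identify an already-loaded artifact across visits.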