diff --git a/swh/loader/package/gnu.py b/swh/loader/package/archive.py
similarity index 51%
rename from swh/loader/package/gnu.py
rename to swh/loader/package/archive.py
index 8b7f4a1..6c3557f 100644
--- a/swh/loader/package/gnu.py
+++ b/swh/loader/package/archive.py
@@ -1,107 +1,135 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import iso8601
 import logging
+from os import path
 
 from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
 
 from swh.loader.package.loader import PackageLoader
 from swh.loader.package.utils import release_name
-
 from swh.model.identifiers import normalize_timestamp
 
 logger = logging.getLogger(__name__)
 
 SWH_PERSON = {
     'name': b'Software Heritage',
     'fullname': b'Software Heritage',
     'email': b'robot@softwareheritage.org'
 }
 REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
 
 
-class GNULoader(PackageLoader):
+class ArchiveLoader(PackageLoader):
     visit_type = 'tar'
 
-    def __init__(self, url: str, artifacts: Sequence):
+    def __init__(self, url: str, artifacts: Sequence[Mapping[str, Any]],
+                 identity_artifact_keys: Optional[Sequence[str]] = None):
         """Loader constructor.
 
         For now, this is the lister's task output.
 
         Args:
             url: Origin url
-            artifacts: List of dict with keys:
+            artifacts: List of artifact information with keys:
+
+               **time**: last modification time as either isoformat date
+                 string or timestamp
+               **url**: the artifact url to retrieve
+               **filename**: artifact's filename
+               **version**: artifact's version
+               **length**: artifact's length
 
-               **time**: last modification time
-               **url**: the artifact url to retrieve
-               **filename**: artifact's filename
-               **version**: artifact's version
-               **length**: artifact's size
+            identity_artifact_keys: Optional list of keys forming the
+                "identity" of an artifact
 
         """
         super().__init__(url=url)
-        self.artifacts = list(sorted(artifacts, key=lambda v: v['time']))
+        self.artifacts = artifacts  # assume order is enforced in the lister
+        if not identity_artifact_keys:
+            # default keys for gnu
+            identity_artifact_keys = ['time', 'url', 'length', 'version']
+        self.identity_artifact_keys = identity_artifact_keys
 
     def get_versions(self) -> Sequence[str]:
         versions = []
         for archive in self.artifacts:
             v = archive.get('version')
             if v:
                 versions.append(v)
         return versions
 
     def get_default_version(self) -> str:
         # It's the most recent, so for this loader, it's the last one
         return self.artifacts[-1]['version']
 
     def get_package_info(self, version: str) -> Generator[
             Tuple[str, Mapping[str, Any]], None, None]:
         for a_metadata in self.artifacts:
-            url = a_metadata['archive']
-            package_version = get_version(url)
+            url = a_metadata['url']
+            package_version = a_metadata['version']
             if version == package_version:
+                filename = a_metadata.get('filename')
                 p_info = {
                     'url': url,
-                    'filename': path.split(url)[-1],
+                    'filename': filename if filename else path.split(url)[-1],
                     'raw': a_metadata,
                 }
                 # FIXME: this code assumes we have only 1 artifact per
                 # versioned package
                 yield release_name(version), p_info
 
     def resolve_revision_from(
             self, known_artifacts: Dict, artifact_metadata: Dict) \
             -> Optional[bytes]:
-        def pk(d):
-            return [d.get(k) for k in ['time', 'url', 'length', 'version']]
-
-        artifact_pk = pk(artifact_metadata)
+        identity = artifact_identity(
+            artifact_metadata, id_keys=self.identity_artifact_keys)
         for rev_id, known_artifact in known_artifacts.items():
             logging.debug('known_artifact: %s', known_artifact)
-            known_pk = pk(known_artifact['extrinsic']['raw'])
-            if artifact_pk == known_pk:
+            reference_artifact = known_artifact['extrinsic']['raw']
+            known_identity = artifact_identity(
+                reference_artifact, id_keys=self.identity_artifact_keys)
+            if identity == known_identity:
                 return rev_id
 
-    def build_revision(
-            self, a_metadata: Mapping[str, Any],
-            uncompressed_path: str) -> Dict:
-        normalized_date = normalize_timestamp(int(a_metadata['time']))
+    def build_revision(self, a_metadata: Mapping[str, Any],
+                       uncompressed_path: str) -> Dict:
+        time = a_metadata['time']  # assume it's a timestamp
+        if isinstance(time, str):  # otherwise, assume it's a parsable date
+            time = iso8601.parse_date(time)
+        normalized_time = normalize_timestamp(time)
         return {
             'type': 'tar',
             'message': REVISION_MESSAGE,
-            'date': normalized_date,
+            'date': normalized_time,
             'author': SWH_PERSON,
             'committer': SWH_PERSON,
-            'committer_date': normalized_date,
+            'committer_date': normalized_time,
             'parents': [],
             'metadata': {
                 'intrinsic': {},
                 'extrinsic': {
                     'provider': self.url,
                     'when': self.visit_date.isoformat(),
                     'raw': a_metadata,
                 },
             },
         }
+
+
+def artifact_identity(d: Mapping[str, Any],
+                      id_keys: Sequence[str]) -> Sequence[Any]:
+    """Compute the primary key for a dict using the id_keys as primary key
+       composite.
+
+    Args:
+        d: A dict entry to compute the primary key on
+        id_keys: Sequence of keys to use as primary key
+
+    Returns:
+        The identity for that dict entry
+
+    """
+    return [d.get(k) for k in id_keys]
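A quick illustration of the identity computation introduced above (not part
of the patch; the artifact values are borrowed from the GNU test fixture
further down):

    from swh.loader.package.archive import artifact_identity

    artifact = {
        'time': 944729610,
        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
        'length': 221837,
        'version': '0.1.0',
    }
    # the default gnu identity keys set in ArchiveLoader.__init__
    assert artifact_identity(
        artifact, id_keys=['time', 'url', 'length', 'version']) == [
            944729610, 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
            221837, '0.1.0']
    # missing keys map to None, so two artifacts lacking the same key
    # still compare equal on that component
    assert artifact_identity(artifact, id_keys=['sha256']) == [None]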
diff --git a/swh/loader/package/tasks.py b/swh/loader/package/tasks.py
index c39ec40..eeeb5ea 100644
--- a/swh/loader/package/tasks.py
+++ b/swh/loader/package/tasks.py
@@ -1,37 +1,38 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from celery import shared_task
 
 from swh.loader.package.debian import DebianLoader
 from swh.loader.package.deposit import DepositLoader
-from swh.loader.package.gnu import GNULoader
 from swh.loader.package.npm import NpmLoader
 from swh.loader.package.pypi import PyPILoader
+from swh.loader.package.archive import ArchiveLoader
+
+
+@shared_task(name=__name__ + '.LoadArchive')
+def load_archive(url=None, artifacts=None, identity_artifact_keys=None):
+    return ArchiveLoader(url, artifacts,
+                         identity_artifact_keys=identity_artifact_keys).load()
 
 
 @shared_task(name=__name__ + '.LoadDebian')
 def load_debian(*, url, date, packages):
     return DebianLoader(url, date, packages).load()
 
 
 @shared_task(name=__name__ + '.LoadDeposit')
 def load_deposit(*, url, deposit_id):
     return DepositLoader(url, deposit_id).load()
 
 
-@shared_task(name=__name__ + '.LoadGNU')
-def load_gnu(*, url, tarballs):
-    return GNULoader(url, tarballs).load()
-
-
 @shared_task(name=__name__ + '.LoadNpm')
 def load_npm(*, package_name, package_url, package_metadata_url):
     return NpmLoader(package_name, package_url,
                      package_metadata_url).load()
 
 
 @shared_task(name=__name__ + '.LoadPyPI')
 def load_pypi(*, url=None):
     return PyPILoader(url).load()
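For context, a minimal sketch of how the new task might be scheduled through
Celery once this lands (the broker/app configuration is assumed to exist
elsewhere; the origin and artifact values are illustrative, taken from the
GNU test fixture):

    from celery import current_app

    res = current_app.send_task(
        'swh.loader.package.tasks.LoadArchive',
        kwargs={
            'url': 'https://ftp.gnu.org/gnu/8sync/',
            'artifacts': [{
                'time': 944729610,
                'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
                'length': 221837,
                'filename': '8sync-0.1.0.tar.gz',
                'version': '0.1.0',
            }],
            # identity_artifact_keys omitted: it defaults to the gnu
            # identity (time, url, length, version)
        })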
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_archive.py
similarity index 73%
rename from swh/loader/package/tests/test_gnu.py
rename to swh/loader/package/tests/test_archive.py
index fd7ba80..d128a44 100644
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_archive.py
@@ -1,276 +1,344 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import re
-
 from swh.model.hashutil import hash_to_bytes
 
-from swh.loader.package.gnu import GNULoader
+from swh.loader.package.archive import ArchiveLoader, artifact_identity
 from swh.loader.package.tests.common import (
     check_snapshot, check_metadata_paths, get_stats
 )
 
+
 URL = 'https://ftp.gnu.org/gnu/8sync/'
 
-ARTIFACTS = [
+GNU_ARTIFACTS = [
     {
-        'time': '944729610',
-        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+        'time': 944729610,
+        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
         'length': 221837,
         'filename': '8sync-0.1.0.tar.gz',
         'version': '0.1.0',
     }
 ]
 
 _expected_new_contents_first_visit = [
     'e9258d81faf5881a2f96a77ba609396f82cb97ad',
     '1170cf105b04b7e2822a0e09d2acf71da7b9a130',
     'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac',
     '0057bec9b5422aff9256af240b177ac0e3ac2608',
     '2b8d0d0b43a1078fc708930c8ddc2956a86c566e',
     '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55',
     '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b',
     'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62',
     'edeb33282b2bffa0e608e9d2fd960fd08093c0ea',
     'd64e64d4c73679323f8d4cde2643331ba6c20af9',
     '7a756602914be889c0a2d3952c710144b3e64cb0',
     '84fb589b554fcb7f32b806951dcf19518d67b08f',
     '8624bcdae55baeef00cd11d5dfcfa60f68710a02',
     'e08441aeab02704cfbd435d6445f7c072f8f524e',
     'f67935bc3a83a67259cda4b2d43373bd56703844',
     '809788434b433eb2e3cfabd5d591c9a659d5e3d8',
     '7d7c6c8c5ebaeff879f61f37083a3854184f6c41',
     'b99fec102eb24bffd53ab61fc30d59e810f116a2',
     '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
     'f0c97052e567948adf03e641301e9983c478ccff',
     '7fb724242e2b62b85ca64190c31dcae5303e19b3',
     '4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
     '7350628ccf194c2c3afba4ac588c33e3f3ac778d',
     '0bb892d9391aa706dc2c3b1906567df43cbe06a2',
     '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
     '6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
     '3046e5d1f70297e2a507b98224b6222c9688d610',
     '1572607d456d7f633bc6065a2b3048496d679a31',
 ]
 
 _expected_new_directories_first_visit = [
     'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
     '263be23b4a8101d3ad0d9831319a3e0f2b065f36',
     '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
     '4db0a3ecbc976083e2dac01a62f93729698429a3',
     'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
     'eca971d346ea54d95a6e19d5051f900237fafdaa',
     '3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
 ]
 
 _expected_new_revisions_first_visit = {
     '44183488c0774ce3c957fa19ba695cf18a4a42b3':
     '3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
 }
 
 _expected_branches_first_visit = {
     'HEAD': {
         'target_type': 'alias',
         'target': 'releases/0.1.0',
     },
     'releases/0.1.0': {
         'target_type': 'revision',
         'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
     },
 }
 
 # hash is different than before as we changed the snapshot
 # gnu used to use `release/` (singular) instead of plural
 _expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5'  # noqa
 
 
-def test_visit_with_no_artifact_found(swh_config, requests_mock):
+def visit_with_no_artifact_found(swh_config, requests_mock_datadir):
     url = URL
-    loader = GNULoader(url, artifacts=ARTIFACTS)
-    requests_mock.get(re.compile('https://'), status_code=404)
+    unknown_artifact_url = 'https://ftp.g.o/unknown/8sync-0.1.0.tar.gz'
+    loader = ArchiveLoader(url, artifacts=[
+        {
+            'time': 944729610,
+            'url': unknown_artifact_url,  # unknown artifact
+            'length': 221837,
+            'filename': '8sync-0.1.0.tar.gz',
+            'version': '0.1.0',
+        }
+    ])
 
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'uneventful'
 
     stats = get_stats(loader.storage)
     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats
 
     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'partial'
 
 
 def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
-    loader = GNULoader(url=URL, artifacts=ARTIFACTS)
+    loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
 
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
 
     expected_revision_id = hash_to_bytes(
         '44183488c0774ce3c957fa19ba695cf18a4a42b3')
     revision = list(loader.storage.revision_get([expected_revision_id]))[0]
     assert revision is not None
 
     check_metadata_paths(revision['metadata'], paths=[
         ('intrinsic', dict),
         ('extrinsic.provider', str),
         ('extrinsic.when', str),
         ('extrinsic.raw', dict),
         ('original_artifact', list),
     ])
 
     for original_artifact in revision['metadata']['original_artifact']:
         check_metadata_paths(original_artifact, paths=[
             ('filename', str),
             ('length', int),
             ('checksums', dict),
         ])
 
 
 def test_visit_with_release_artifact_no_prior_visit(
         swh_config, requests_mock_datadir):
     """With no prior visit, loading a gnu project ends up with 1 snapshot
 
     """
-    loader = GNULoader(url=URL, artifacts=ARTIFACTS)
+    loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
 
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
 
     stats = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats
 
     expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
     assert list(loader.storage.content_missing_per_sha1(expected_contents)) \
         == []
 
     expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
     assert list(loader.storage.directory_missing(expected_dirs)) == []
 
     expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
     assert list(loader.storage.revision_missing(expected_revs)) == []
 
     expected_snapshot = {
         'id': _expected_new_snapshot_first_visit_id,
         'branches': _expected_branches_first_visit,
     }
 
     check_snapshot(expected_snapshot, loader.storage)
 
 
 def test_2_visits_without_change(swh_config, requests_mock_datadir):
     """Visiting a gnu project twice without change yields an uneventful
     second visit
 
     """
     url = URL
-    loader = GNULoader(url, artifacts=ARTIFACTS)
+    loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS)
 
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit['status'] == 'full'
 
     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'
     origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit2['status'] == 'full'
 
     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1
 
 
 def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir):
     """Visiting a gnu project again with a new artifact yields a new
     snapshot
 
     """
     url = URL
-    artifact1 = ARTIFACTS[0]
-    loader = GNULoader(url, [artifact1])
+    artifact1 = GNU_ARTIFACTS[0]
+    loader = ArchiveLoader(url, [artifact1])
 
     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit['status'] == 'full'
 
     stats = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats
 
     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1
 
     artifact2 = {
         'time': 1480991830,
-        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
         'length': 238466,
         'filename': '8sync-0.2.0.tar.gz',
         'version': '0.2.0',
     }
 
-    loader2 = GNULoader(url, [artifact1, artifact2])
+    loader2 = ArchiveLoader(url, [artifact1, artifact2])
     # implementation detail: share the storage in between visits
     loader2.storage = loader.storage
     stats2 = get_stats(loader2.storage)
     assert stats == stats2  # ensure we share the storage
 
     actual_load_status2 = loader2.load()
     assert actual_load_status2['status'] == 'eventful'
 
     stats2 = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit) + 14,
         'directory': len(_expected_new_directories_first_visit) + 8,
         'origin': 1,
         'origin_visit': 1 + 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit) + 1,
         'skipped_content': 0,
         'snapshot': 1 + 1,
     } == stats2
 
     origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit2['status'] == 'full'
 
     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     # 1 artifact (2nd time no modification) + 1 new artifact
     assert len(urls) == 2
+
+
+def test_artifact_identity():
+    """Computing an artifact identity should return the expected primary key
+
+    """
+    data = {
+        'a': 1,
+        'b': 2,
+        'length': 221837,
+        'filename': '8sync-0.1.0.tar.gz',
+        'version': '0.1.0',
+    }
+
+    for id_keys, expected_id in [
+            (['a', 'b'], [1, 2]),
+            ([], []),
+            (['a', 'key-that-does-not-exist'], [1, None])
+    ]:
+        actual_id = artifact_identity(data, id_keys=id_keys)
+        assert actual_id == expected_id
+
+
+def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir):
+    """Loading a project archive (not gnu) ends up with 1 snapshot
+
+    """
+    url = 'https://something.else.org/8sync/'
+    artifacts = [  # this is not a gnu artifact
+        {
+            'time': '1999-12-09T09:53:30+00:00',  # it's also not a timestamp
+            'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
+            # keep a gnu artifact reference to avoid adding other test files
+            'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+            'length': 238466,
+            'filename': '8sync-0.2.0.tar.gz',
+            'version': '0.2.0',
+        }
+    ]
+
+    # Here the loader defines the id_keys to use for existence checks in the
+    # snapshot; it's not the default gnu identity (time, url, length,
+    # version) but a checksum-based one
+    loader = ArchiveLoader(
+        url, artifacts=artifacts, identity_artifact_keys=[
+            'sha256', 'length', 'url'])
+
+    actual_load_status = loader.load()
+    assert actual_load_status['status'] == 'eventful'
+    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit['status'] == 'full'
+
+    actual_load_status2 = loader.load()
+    assert actual_load_status2['status'] == 'uneventful'
+    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+    assert origin_visit2['status'] == 'full'
+
+    urls = [
+        m.url for m in requests_mock_datadir.request_history
+        if m.url.startswith('https://ftp.gnu.org')
+    ]
+    assert len(urls) == 1
diff --git a/swh/loader/package/tests/test_tasks.py b/swh/loader/package/tests/test_tasks.py
index 4c0cca3..57acbc4 100644
--- a/swh/loader/package/tests/test_tasks.py
+++ b/swh/loader/package/tests/test_tasks.py
@@ -1,83 +1,83 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from unittest.mock import patch
 
 
-@patch('swh.loader.package.debian.DebianLoader.load')
-def test_debian_loader(
+@patch('swh.loader.package.archive.ArchiveLoader.load')
+def test_archive_loader(
         mock_loader, swh_app, celery_session_worker, swh_config):
     mock_loader.return_value = {'status': 'eventful'}
 
     res = swh_app.send_task(
-        'swh.loader.package.tasks.LoadDebian',
-        (), dict(url='some-url', date='some-date', packages={}))
+        'swh.loader.package.tasks.LoadArchive',
+        (), dict(url='some-url', artifacts=[]))
     assert res
     res.wait()
     assert res.successful()
 
     assert res.result == {'status': 'eventful'}
 
 
-@patch('swh.loader.package.deposit.DepositLoader.load')
-def test_deposit_loader(
+@patch('swh.loader.package.debian.DebianLoader.load')
+def test_debian_loader(
        mock_loader, swh_app, celery_session_worker, swh_config):
     mock_loader.return_value = {'status': 'eventful'}
 
     res = swh_app.send_task(
-        'swh.loader.package.tasks.LoadDeposit',
-        (), dict(url='some-url', deposit_id='some-d-id'))
+        'swh.loader.package.tasks.LoadDebian',
+        (), dict(url='some-url', date='some-date', packages={}))
     assert res
     res.wait()
     assert res.successful()
 
     assert res.result == {'status': 'eventful'}
 
 
-@patch('swh.loader.package.gnu.GNULoader.load')
-def test_gnu_loader(
+@patch('swh.loader.package.deposit.DepositLoader.load')
+def test_deposit_loader(
         mock_loader, swh_app, celery_session_worker, swh_config):
     mock_loader.return_value = {'status': 'eventful'}
 
     res = swh_app.send_task(
-        'swh.loader.package.tasks.LoadGNU',
-        (), dict(url='some-url', tarballs=[]))
+        'swh.loader.package.tasks.LoadDeposit',
+        (), dict(url='some-url', deposit_id='some-d-id'))
    assert res
     res.wait()
     assert res.successful()
 
     assert res.result == {'status': 'eventful'}
 
 
 @patch('swh.loader.package.npm.NpmLoader.load')
 def test_npm_loader(
         mock_loader, swh_app, celery_session_worker, swh_config):
     mock_loader.return_value = {'status': 'eventful'}
 
     res = swh_app.send_task(
         'swh.loader.package.tasks.LoadNpm',
         (), dict(package_name='some-package',
                  package_url='some',
                  package_metadata_url='something'))
     assert res
     res.wait()
     assert res.successful()
 
     assert res.result == {'status': 'eventful'}
 
 
 @patch('swh.loader.package.pypi.PyPILoader.load')
 def test_pypi_loader(
         mock_loader, swh_app, celery_session_worker, swh_config):
     mock_loader.return_value = {'status': 'eventful'}
 
     res = swh_app.send_task(
         'swh.loader.package.tasks.LoadPyPI', (), dict(url='some-url'))
     assert res
     res.wait()
     assert res.successful()
 
     assert res.result == {'status': 'eventful'}
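Finally, a direct (non-Celery) invocation sketch with custom identity keys,
mirroring test_2_visits_without_change_not_gnu above. It assumes a storage
configuration equivalent to the tests' swh_config fixture; the origin url,
sha256 and length values are the ones from that test:

    from swh.loader.package.archive import ArchiveLoader

    loader = ArchiveLoader(
        url='https://something.else.org/8sync/',
        artifacts=[{
            'time': '1999-12-09T09:53:30+00:00',  # isoformat, not a timestamp
            'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
            'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
            'length': 238466,
            'filename': '8sync-0.2.0.tar.gz',
            'version': '0.2.0',
        }],
        identity_artifact_keys=['sha256', 'length', 'url'])
    status = loader.load()  # expected: {'status': 'eventful'} on a first visit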