# Copyright (C) 2019  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import iso8601
import logging

from os import path
from typing import Dict, Generator, Optional, Mapping, Sequence, Tuple

from swh.loader.package.loader import PackageLoader
from swh.model.identifiers import normalize_timestamp


logger = logging.getLogger(__name__)


# Synthetic author/committer used for every revision built by this loader.
SWH_PERSON = {
    'name': b'Software Heritage',
    'fullname': b'Software Heritage',
    'email': b'robot@softwareheritage.org'
}


REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'


class TarLoader(PackageLoader):
    """Load versioned tarball artifacts into the archive, building one
    synthetic revision per artifact.

    """
    visit_type = 'tar'

    def __init__(self, url: str, packages: Sequence[Mapping[str, str]]):
        """Loader constructor.

        Args:
            url: Origin url

            packages: List of dict with keys:

                - url: the url to retrieve one versioned archive
                  (the code reads ``url``, not ``uri``)
                - version: version string of that archive
                - date: isoformat date string
                - sha256: integrity hash

        """
        super().__init__(url=url)
        # Sort chronologically so the latest artifact is the default
        # release (see get_default_release).  `sorted` already returns a
        # list, no need to wrap it in list().
        self.packages = sorted(packages, key=lambda v: v['date'])

    def get_versions(self) -> Sequence[str]:
        """Return the versions of all artifacts which declare one."""
        return [p['version'] for p in self.packages if p.get('version')]

    def get_default_release(self) -> str:
        """Return the most recent version.

        Packages are sorted by date in the constructor, so it's the last
        one.

        """
        return self.packages[-1]['version']

    def get_artifacts(self, version: str) -> Generator[
            Tuple[str, str, Dict], None, None]:
        """Yield (filename, url, metadata) for each artifact matching
        `version`.

        """
        for a_metadata in self.packages:
            url = a_metadata['url']
            if version == a_metadata['version']:
                filename = path.split(url)[-1]
                yield filename, url, a_metadata

    def resolve_revision_from(
            self, known_artifacts: Dict, artifact_metadata: Dict) \
            -> Optional[bytes]:
        """Return the id of an already-loaded revision whose
        (date, sha256, url) triple matches `artifact_metadata`, or None
        when the artifact is unknown.

        """
        def pk(d):
            # Identity key of an artifact: the fields that make it unique.
            return [d.get(k) for k in ['date', 'sha256', 'url']]

        artifact_pk = pk(artifact_metadata)
        for rev_id, known_artifact in known_artifacts.items():
            # Use the module logger (the original called the root logger
            # via logging.debug, inconsistent with the rest of the module).
            logger.debug('known_artifact: %s', known_artifact)
            known_pk = pk(known_artifact['extrinsic']['raw'])
            if artifact_pk == known_pk:
                logger.debug('Revision %s found!', rev_id)
                return rev_id
        return None

    def build_revision(
            self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
        """Build the synthetic revision dict for one artifact.

        Args:
            a_metadata: one entry of self.packages
            a_uncompressed_path: path to the uncompressed artifact
                (unused here; the directory is computed by the caller)

        """
        normalized_date = normalize_timestamp(
            iso8601.parse_date(a_metadata['date']))
        return {
            'message': REVISION_MESSAGE,
            'date': normalized_date,
            'author': SWH_PERSON,
            'committer': SWH_PERSON,
            'committer_date': normalized_date,
            'parents': [],
            'metadata': {
                'intrinsic': {},
                'extrinsic': {
                    'provider': self.url,
                    'when': self.visit_date.isoformat(),
                    'raw': a_metadata,
                },
            },
        }
# Copyright (C) 2019  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# Imports hoisted to module level (the originals were buried inside
# integrity_to_hash), grouped stdlib / local per convention.
from base64 import b64decode
from binascii import hexlify
from typing import Tuple

from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.loader.package.tar import TarLoader

from swh.loader.package.tests.common import check_snapshot


URL = 'https://deposit.softwareheritage.org/hello/2.10.orig.tar.gz'


PACKAGES = [
    {
        'version': '2.10',
        'url': URL,
        'date': '2014-10-19T16:52:35+02:00',
        'sha256': '31e066137a962676e89f69d1b65382de95a7ef7d914b8cb956f41ea72e0f516b',  # noqa
    }
]


def integrity_to_hash(integrity_value: str) -> Tuple[str, str]:
    """Decode an SRI-style ``<hashname>-<base64>`` integrity value into a
    ``(hash_name, hex_digest)`` pair.

    Split at the first dash only: the base64 payload itself could contain
    a dash (e.g. base64url alphabet), and a plain split would then raise.

    """
    hash_name, base64_value = integrity_value.split('-', 1)
    hash_hex = hexlify(b64decode(base64_value)).decode('utf-8')
    return hash_name, hash_hex


def test_integrity_to_hash():
    hash_name, hash_hex = integrity_to_hash(
        'sha256-MeBmE3qWJnbon2nRtlOC3pWn732RS4y5VvQepy4PUWs=')

    assert hash_name == 'sha256'
    assert hash_hex == '31e066137a962676e89f69d1b65382de95a7ef7d914b8cb956f41ea72e0f516b'  # noqa


# Mock fixture which answers 404 for the artifact url, to simulate a
# missing artifact.
requests_mock_datadir_missing = requests_mock_datadir_factory(ignore_urls=[
    URL
])


def test_tar_visit_with_no_artifact_found(
        swh_config, requests_mock_datadir_missing):
    """A visit whose artifact cannot be retrieved stores nothing but ends
    with a partial visit and an (empty) snapshot.

    """
    loader = TarLoader(url=URL, packages=PACKAGES)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'uneventful'
    stats = loader.storage.stat_counters()

    assert {
        'content': 0,
        'directory': 0,
        'origin': 1,
        'origin_visit': 1,
        'person': 0,
        'release': 0,
        'revision': 0,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    origin_visit = next(loader.storage.origin_visit_get(URL))
    assert origin_visit['status'] == 'partial'


def test_tar_visit_with_artifact_found(swh_config, requests_mock_datadir):
    """A nominal visit ingests the artifact and builds the expected
    snapshot.

    """
    loader = TarLoader(url=URL, packages=PACKAGES)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    stats = loader.storage.stat_counters()

    assert {
        'content': 303,
        'directory': 12,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 1,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    origin_visit = next(loader.storage.origin_visit_get(URL))
    assert origin_visit['status'] == 'full'

    expected_snapshot = {
        'id': 'e759f554b660f03ebaf2e5bf62c34e0fe0ee5748',
        'branches': {
            'HEAD': {
                'target_type': 'alias',
                'target': 'releases/2.10'
            },
            'releases/2.10': {
                'target_type': 'revision',
                'target': '326260b671e595403f03b9e673af519e23011c52',
            }
        },
    }

    check_snapshot(expected_snapshot, loader.storage)


def test_tar_2_visits_without_change(swh_config, requests_mock_datadir):
    """2 visits on the same origins ends up with 1 same snapshot across visit

    """
    url = URL
    loader = TarLoader(url=url, packages=PACKAGES)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'full'
    stats = loader.storage.stat_counters()

    assert {
        'content': 303,
        'directory': 12,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 1,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    actual_load_status2 = loader.load()
    assert actual_load_status2['status'] == 'uneventful'
    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit2['status'] == 'full'
    stats = loader.storage.stat_counters()

    assert {
        'content': 303,
        'directory': 12,
        'origin': 1,
        'origin_visit': 2,
        'person': 1,
        'release': 0,
        'revision': 1,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    # The artifact must have been fetched from the network only once: the
    # second visit resolves the known revision instead of re-downloading.
    urls = [
        m.url for m in requests_mock_datadir.request_history
        if m.url.startswith('https://deposit.softwareheritage.org')
    ]
    assert len(urls) == 1