diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
index e02467f..aa23794 100644
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -1,139 +1,123 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import iso8601
 import logging

 from os import path
 from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple

 from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import release_name
+from swh.loader.package.utils import release_name, artifact_identity
 from swh.model.identifiers import normalize_timestamp

 logger = logging.getLogger(__name__)

 SWH_PERSON = {
     'name': b'Software Heritage',
     'fullname': b'Software Heritage',
     'email': b'robot@softwareheritage.org'
 }
 REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'


 class ArchiveLoader(PackageLoader):
     """Load archive origin's artifact files into swh archive

     """
     visit_type = 'tar'

     def __init__(self, url: str, artifacts: Sequence[Mapping[str, Any]],
                  identity_artifact_keys: Optional[Sequence[str]] = None):
         """Loader constructor.

         For now, this is the lister's task output.

         Args:
             url: Origin url
             artifacts: List of artifact information with keys:

                **time**: last modification time as either isoformat date
                string or timestamp

                **url**: the artifact url to retrieve

                **filename**: artifact's filename

                **version**: artifact's version

                **length**: artifact's length

             identity_artifact_keys: Optional List of keys forming the
                 "identity" of an artifact

         """
         super().__init__(url=url)
         self.artifacts = artifacts  # assume order is enforced in the lister
         if not identity_artifact_keys:
             # default keys for gnu
             identity_artifact_keys = ['time', 'url', 'length', 'version']
         self.identity_artifact_keys = identity_artifact_keys

     def get_versions(self) -> Sequence[str]:
         versions = []
         for archive in self.artifacts:
             v = archive.get('version')
             if v:
                 versions.append(v)
         return versions

     def get_default_version(self) -> str:
         # It's the most recent, so for this loader, it's the last one
         return self.artifacts[-1]['version']

     def get_package_info(self, version: str) -> Generator[
             Tuple[str, Mapping[str, Any]], None, None]:
         for a_metadata in self.artifacts:
             url = a_metadata['url']
             package_version = a_metadata['version']
             if version == package_version:
                 filename = a_metadata.get('filename')
                 p_info = {
                     'url': url,
                     'filename': filename if filename else path.split(url)[-1],
                     'raw': a_metadata,
                 }
                 # FIXME: this code assumes we have only 1 artifact per
                 # versioned package
                 yield release_name(version), p_info

     def resolve_revision_from(
             self, known_artifacts: Dict, artifact_metadata: Dict) \
             -> Optional[bytes]:
         identity = artifact_identity(
             artifact_metadata, id_keys=self.identity_artifact_keys)
         for rev_id, known_artifact in known_artifacts.items():
             logging.debug('known_artifact: %s', known_artifact)
             reference_artifact = known_artifact['extrinsic']['raw']
             known_identity = artifact_identity(
                 reference_artifact, id_keys=self.identity_artifact_keys)
             if identity == known_identity:
                 return rev_id
         return None

     def build_revision(self, a_metadata: Mapping[str, Any],
                        uncompressed_path: str) -> Dict:
         time = a_metadata['time']  # assume it's a timestamp
         if isinstance(time, str):  # otherwise, assume it's a parsable date
             time = iso8601.parse_date(time)
         normalized_time = normalize_timestamp(time)
         return {
             'type': 'tar',
             'message': REVISION_MESSAGE,
             'date': normalized_time,
             'author': SWH_PERSON,
             'committer': SWH_PERSON,
             'committer_date': normalized_time,
             'parents': [],
             'metadata': {
                 'intrinsic': {},
                 'extrinsic': {
                     'provider': self.url,
                     'when': self.visit_date.isoformat(),
                     'raw': a_metadata,
                 },
             },
         }
-
-
-def artifact_identity(d: Mapping[str, Any],
-                      id_keys: Sequence[str]) -> Sequence[Any]:
-    """Compute the primary key for a dict using the id_keys as primary key
-       composite.
-
-    Args:
-        d: A dict entry to compute the primary key on
-        id_keys: Sequence of keys to use as primary key
-
-    Returns:
-        The identity for that dict entry
-
-    """
-    return [d.get(k) for k in id_keys]
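For review context: with artifact_identity now shared from utils, resolve_revision_from above reduces to comparing composite keys. A minimal sketch of that behaviour (not part of this diff; the values mirror the 8sync fixture, and extra fields such as filename are ignored because they are not identity keys)::

    from swh.loader.package.utils import artifact_identity

    id_keys = ['time', 'url', 'length', 'version']  # gnu defaults

    # what a previous visit stored under metadata['extrinsic']['raw']
    known_raw = {
        'time': 944729610,
        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
        'length': 221837,
        'version': '0.1.0',
    }
    # the same artifact as re-listed, with one extra non-identity field
    incoming = dict(known_raw, filename='8sync-0.1.0.tar.gz')

    # identities match, so resolve_revision_from returns the known
    # revision id instead of rebuilding the revision
    assert artifact_identity(incoming, id_keys=id_keys) == \
        artifact_identity(known_raw, id_keys=id_keys)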
diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py
index 75b45aa..de3a4d0 100644
--- a/swh/loader/package/archive/tests/test_archive.py
+++ b/swh/loader/package/archive/tests/test_archive.py
@@ -1,366 +1,345 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from swh.model.hashutil import hash_to_bytes

-from swh.loader.package.archive.loader import ArchiveLoader, artifact_identity
+from swh.loader.package.archive.loader import ArchiveLoader
 from swh.loader.package.tests.common import (
     check_snapshot, check_metadata_paths, get_stats
 )

 URL = 'https://ftp.gnu.org/gnu/8sync/'

 GNU_ARTIFACTS = [
     {
         'time': 944729610,
         'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
         'length': 221837,
         'filename': '8sync-0.1.0.tar.gz',
         'version': '0.1.0',
     }
 ]

 _expected_new_contents_first_visit = [
     'e9258d81faf5881a2f96a77ba609396f82cb97ad',
     '1170cf105b04b7e2822a0e09d2acf71da7b9a130',
     'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac',
     '0057bec9b5422aff9256af240b177ac0e3ac2608',
     '2b8d0d0b43a1078fc708930c8ddc2956a86c566e',
     '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55',
     '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b',
     'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62',
     'edeb33282b2bffa0e608e9d2fd960fd08093c0ea',
     'd64e64d4c73679323f8d4cde2643331ba6c20af9',
     '7a756602914be889c0a2d3952c710144b3e64cb0',
     '84fb589b554fcb7f32b806951dcf19518d67b08f',
     '8624bcdae55baeef00cd11d5dfcfa60f68710a02',
     'e08441aeab02704cfbd435d6445f7c072f8f524e',
     'f67935bc3a83a67259cda4b2d43373bd56703844',
     '809788434b433eb2e3cfabd5d591c9a659d5e3d8',
     '7d7c6c8c5ebaeff879f61f37083a3854184f6c41',
     'b99fec102eb24bffd53ab61fc30d59e810f116a2',
     '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
     'f0c97052e567948adf03e641301e9983c478ccff',
     '7fb724242e2b62b85ca64190c31dcae5303e19b3',
     '4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
     '7350628ccf194c2c3afba4ac588c33e3f3ac778d',
     '0bb892d9391aa706dc2c3b1906567df43cbe06a2',
     '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
     '6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
     '3046e5d1f70297e2a507b98224b6222c9688d610',
     '1572607d456d7f633bc6065a2b3048496d679a31',
 ]

 _expected_new_directories_first_visit = [
     'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
     '263be23b4a8101d3ad0d9831319a3e0f2b065f36',
     '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
     '4db0a3ecbc976083e2dac01a62f93729698429a3',
     'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
     'eca971d346ea54d95a6e19d5051f900237fafdaa',
     '3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
 ]

 _expected_new_revisions_first_visit = {
     '44183488c0774ce3c957fa19ba695cf18a4a42b3':
         '3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
 }

 _expected_branches_first_visit = {
     'HEAD': {
         'target_type': 'alias',
         'target': 'releases/0.1.0',
     },
     'releases/0.1.0': {
         'target_type': 'revision',
         'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
     },
 }

 # hash is different than before as we changed the snapshot
 # gnu used to use `release/` (singular) instead of plural
 _expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5'  # noqa


 def visit_with_no_artifact_found(swh_config, requests_mock_datadir):
     url = URL
     unknown_artifact_url = 'https://ftp.g.o/unknown/8sync-0.1.0.tar.gz'
     loader = ArchiveLoader(url, artifacts=[
         {
             'time': 944729610,
             'url': unknown_artifact_url,  # unknown artifact
             'length': 221837,
             'filename': '8sync-0.1.0.tar.gz',
             'version': '0.1.0',
         }
     ])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'uneventful'
     assert actual_load_status['snapshot_id'] is not None
     stats = get_stats(loader.storage)

     assert {
         'content': 0,
         'directory': 0,
         'origin': 1,
         'origin_visit': 1,
         'person': 0,
         'release': 0,
         'revision': 0,
         'skipped_content': 0,
         'snapshot': 1,
     } == stats

     origin_visit = next(loader.storage.origin_visit_get(url))
     assert origin_visit['status'] == 'partial'
     assert origin_visit['type'] == 'tar'


 def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
     loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     expected_revision_id = hash_to_bytes(
         '44183488c0774ce3c957fa19ba695cf18a4a42b3')
     revision = list(loader.storage.revision_get([expected_revision_id]))[0]
     assert revision is not None

     check_metadata_paths(revision['metadata'], paths=[
         ('intrinsic', dict),
         ('extrinsic.provider', str),
         ('extrinsic.when', str),
         ('extrinsic.raw', dict),
         ('original_artifact', list),
     ])

     for original_artifact in revision['metadata']['original_artifact']:
         check_metadata_paths(original_artifact, paths=[
             ('filename', str),
             ('length', int),
             ('checksums', dict),
         ])


 def test_visit_with_release_artifact_no_prior_visit(
         swh_config, requests_mock_datadir):
     """With no prior visit, loading a gnu project ends up with 1 snapshot

     """
     loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] == _expected_new_snapshot_first_visit_id  # noqa

     stats = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
     assert list(loader.storage.content_missing_per_sha1(expected_contents)) \
         == []

     expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
     assert list(loader.storage.directory_missing(expected_dirs)) == []

     expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
     assert list(loader.storage.revision_missing(expected_revs)) == []

     expected_snapshot = {
         'id': _expected_new_snapshot_first_visit_id,
         'branches': _expected_branches_first_visit,
     }

     check_snapshot(expected_snapshot, loader.storage)


 def test_2_visits_without_change(swh_config, requests_mock_datadir):
     """Visiting a gnu project twice with no change ends up with 1 snapshot

     """
     url = URL
     loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS)

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'
     assert actual_load_status2['snapshot_id'] is not None
     assert actual_load_status['snapshot_id'] == actual_load_status2[
         'snapshot_id']

     origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1


 def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir):
     """A new artifact on the second visit ends up in a new snapshot

     """
     url = URL
     artifact1 = GNU_ARTIFACTS[0]
     loader = ArchiveLoader(url, [artifact1])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     stats = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1

     artifact2 = {
         'time': 1480991830,
         'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
         'length': 238466,
         'filename': '8sync-0.2.0.tar.gz',
         'version': '0.2.0',
     }

     loader2 = ArchiveLoader(url, [artifact1, artifact2])
     # implementation detail: share the storage in between visits
     loader2.storage = loader.storage
     stats2 = get_stats(loader2.storage)
     assert stats == stats2  # ensure we share the storage

     actual_load_status2 = loader2.load()
     assert actual_load_status2['status'] == 'eventful'
     assert actual_load_status2['snapshot_id'] is not None

     stats2 = get_stats(loader.storage)
     assert {
         'content': len(_expected_new_contents_first_visit) + 14,
         'directory': len(_expected_new_directories_first_visit) + 8,
         'origin': 1,
         'origin_visit': 1 + 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit) + 1,
         'skipped_content': 0,
         'snapshot': 1 + 1,
     } == stats2

     origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     # 1 artifact (2nd time no modification) + 1 new artifact
     assert len(urls) == 2


-def test_artifact_identity():
-    """Compute primary key should return the right identity
-
-    """
-    data = {
-        'a': 1,
-        'b': 2,
-        'length': 221837,
-        'filename': '8sync-0.1.0.tar.gz',
-        'version': '0.1.0',
-    }
-
-    for id_keys, expected_id in [
-            (['a', 'b'], [1, 2]),
-            ([], []),
-            (['a', 'key-that-does-not-exist'], [1, None])
-    ]:
-        actual_id = artifact_identity(data, id_keys=id_keys)
-        assert actual_id == expected_id
-
-
 def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir):
     """Loading a project archive (not gnu) twice ends up with 1 snapshot

     """
     url = 'https://something.else.org/8sync/'
     artifacts = [  # this is not a gnu artifact
         {
             'time': '1999-12-09T09:53:30+00:00',  # it's also not a timestamp
             'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
             # keep a gnu artifact reference to avoid adding other test files
             'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
             'length': 238466,
             'filename': '8sync-0.2.0.tar.gz',
             'version': '0.2.0',
         }
     ]

     # Here the loader defines the id_keys to use for existence in the
     # snapshot; it's not the default identity of the archive loader
     # (time, url, length, version) but a checksum-based one
     loader = ArchiveLoader(
         url, artifacts=artifacts, identity_artifact_keys=[
             'sha256', 'length', 'url'])

     actual_load_status = loader.load()
     assert actual_load_status['status'] == 'eventful'
     assert actual_load_status['snapshot_id'] is not None

     origin_visit = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit['status'] == 'full'
     assert origin_visit['type'] == 'tar'

     actual_load_status2 = loader.load()
     assert actual_load_status2['status'] == 'uneventful'
     assert actual_load_status2['snapshot_id'] == actual_load_status[
         'snapshot_id']

     origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
     assert origin_visit2['status'] == 'full'
     assert origin_visit2['type'] == 'tar'

     urls = [
         m.url for m in requests_mock_datadir.request_history
         if m.url.startswith('https://ftp.gnu.org')
     ]
     assert len(urls) == 1
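One point the non-gnu test above makes indirectly: with the default gnu keys, a re-listed artifact whose modification time changed gets a new identity and is loaded again, whereas checksum-based keys keep the identity stable. A quick sketch (the shifted time value is hypothetical; the other values come from the test fixture)::

    from swh.loader.package.utils import artifact_identity

    before = {
        'time': '1999-12-09T09:53:30+00:00',
        'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4',  # noqa
        'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
        'length': 238466,
        'version': '0.2.0',
    }
    # same bytes, hypothetical new mtime after a re-listing
    after = dict(before, time='2000-01-01T00:00:00+00:00')

    gnu_keys = ['time', 'url', 'length', 'version']
    assert artifact_identity(before, id_keys=gnu_keys) \
        != artifact_identity(after, id_keys=gnu_keys)

    checksum_keys = ['sha256', 'length', 'url']
    assert artifact_identity(before, id_keys=checksum_keys) \
        == artifact_identity(after, id_keys=checksum_keys)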
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
index 63024b8..9f40bcb 100644
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -1,307 +1,322 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import pytest

 import swh.loader.package

 from swh.loader.package.utils import (
-    download, api_info, release_name, parse_author
+    download, api_info, release_name, parse_author, artifact_identity
 )


 def test_version_generation():
     assert swh.loader.package.__version__ != 'devel', \
         "Make sure swh.loader.core is installed (e.g. pip install -e .)"


 @pytest.mark.fs
 def test_download_fail_to_download(tmp_path, requests_mock):
     url = 'https://pypi.org/pypi/arrow/json'
     status_code = 404
     requests_mock.get(url, status_code=status_code)

     with pytest.raises(ValueError) as e:
         download(url, tmp_path)

     assert e.value.args[0] == "Fail to query '%s'. Reason: %s" % (
         url, status_code)


 @pytest.mark.fs
 def test_download_ok(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data, headers={
         'content-length': str(len(data))
     })

     actual_filepath, actual_hashes = download(url, dest=str(tmp_path))

     actual_filename = os.path.basename(actual_filepath)
     assert actual_filename == filename
     assert actual_hashes['length'] == len(data)
     assert actual_hashes['checksums']['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2'  # noqa
     assert (actual_hashes['checksums']['sha256'] ==
             '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5')


 @pytest.mark.fs
 def test_download_ok_no_header(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data)  # no header information

     actual_filepath, actual_hashes = download(url, dest=str(tmp_path))

     actual_filename = os.path.basename(actual_filepath)
     assert actual_filename == filename
     assert actual_hashes['length'] == len(data)
     assert actual_hashes['checksums']['sha1'] == 'fdd1ce606a904b08c816ba84f3125f2af44d92b2'  # noqa
     assert (actual_hashes['checksums']['sha256'] ==
             '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5')


 @pytest.mark.fs
 def test_download_ok_with_hashes(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data, headers={
         'content-length': str(len(data))
     })

     # good hashes for such file
     good = {
         'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2',
         'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5',  # noqa
     }

     actual_filepath, actual_hashes = download(url, dest=str(tmp_path),
                                               hashes=good)

     actual_filename = os.path.basename(actual_filepath)
     assert actual_filename == filename
     assert actual_hashes['length'] == len(data)
     assert actual_hashes['checksums']['sha1'] == good['sha1']
     assert actual_hashes['checksums']['sha256'] == good['sha256']


 @pytest.mark.fs
 def test_download_fail_hashes_mismatch(tmp_path, requests_mock):
     """Mismatch hash after download should raise

     """
     filename = 'requests-0.0.1.tar.gz'
     url = 'https://pypi.org/pypi/requests/%s' % filename
     data = 'this is something'
     requests_mock.get(url, text=data, headers={
         'content-length': str(len(data))
     })

     # good hashes for such file
     good = {
         'sha1': 'fdd1ce606a904b08c816ba84f3125f2af44d92b2',
         'sha256': '1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5',  # noqa
     }

     for hash_algo in good.keys():
         wrong_hash = good[hash_algo].replace('1', '0')
         expected_hashes = good.copy()
         expected_hashes[hash_algo] = wrong_hash  # set the wrong hash

         expected_msg = ("Failure when fetching %s. "
                         "Checksum mismatched: %s != %s" % (
                             url, wrong_hash, good[hash_algo]
                         ))

         with pytest.raises(ValueError, match=expected_msg):
             download(url, dest=str(tmp_path), hashes=expected_hashes)


 def test_api_info_failure(requests_mock):
     """Failure to fetch info/release information should raise"""
     url = 'https://pypi.org/pypi/requests/json'
     status_code = 400
     requests_mock.get(url, status_code=status_code)

     with pytest.raises(ValueError) as e0:
         api_info(url)
     assert e0.value.args[0] == "Fail to query '%s'. Reason: %s" % (
         url, status_code
     )


 def test_api_info(requests_mock):
     """Fetching json info from pypi project should be ok"""
     url = 'https://pypi.org/pypi/requests/json'
     requests_mock.get(url, text='{"version": "0.0.1"}')

     actual_info = api_info(url)
     assert actual_info == {
         'version': '0.0.1',
     }


 def test_release_name():
     for version, filename, expected_release in [
             ('0.0.1', None, 'releases/0.0.1'),
             ('0.0.2', 'something', 'releases/0.0.2/something')]:
         assert release_name(version, filename) == expected_release


 def _parse_author_string_test(author_str, expected_result):
     assert parse_author(author_str) == expected_result
     assert parse_author(' %s' % author_str) == expected_result
     assert parse_author('%s ' % author_str) == expected_result


 def test_parse_author():
     _parse_author_string_test(
         'John Doe',
         {
             'name': 'John Doe'
         }
     )
     _parse_author_string_test(
         '<john.doe@foo.bar>',
         {
             'email': 'john.doe@foo.bar'
         }
     )
     _parse_author_string_test(
         '(https://john.doe)',
         {
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe <john.doe@foo.bar>',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar'
         }
     )
     _parse_author_string_test(
         'John Doe<john.doe@foo.bar>',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar'
         }
     )
     _parse_author_string_test(
         'John Doe (https://john.doe)',
         {
             'name': 'John Doe',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe(https://john.doe)',
         {
             'name': 'John Doe',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         '<john.doe@foo.bar> (https://john.doe)',
         {
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         '(https://john.doe) <john.doe@foo.bar>',
         {
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe <john.doe@foo.bar> (https://john.doe)',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe (https://john.doe) <john.doe@foo.bar>',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe<john.doe@foo.bar> (https://john.doe)',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test(
         'John Doe(https://john.doe)<john.doe@foo.bar>',
         {
             'name': 'John Doe',
             'email': 'john.doe@foo.bar',
             'url': 'https://john.doe'
         }
     )
     _parse_author_string_test('', {})
     _parse_author_string_test('<>', {})
     _parse_author_string_test(' <>', {})
     _parse_author_string_test('<>()', {})
     _parse_author_string_test('<> ()', {})
     _parse_author_string_test('()', {})
     _parse_author_string_test(' ()', {})

     _parse_author_string_test(
         'John Doe <> ()',
         {
             'name': 'John Doe'
         }
     )
     _parse_author_string_test(
         'John Doe <>',
         {
             'name': 'John Doe'
         }
     )
     _parse_author_string_test(
         'John Doe ()',
         {
             'name': 'John Doe'
         }
     )


-# def test_swh_author():
-#     for author, expected_author in [
-#         ({}, )
-#     ]:
+def test_artifact_identity():
+    """Compute primary key should return the right identity
+
+    """
+    data = {
+        'a': 1,
+        'b': 2,
+        'length': 221837,
+        'filename': '8sync-0.1.0.tar.gz',
+        'version': '0.1.0',
+    }
+
+    for id_keys, expected_id in [
+            (['a', 'b'], [1, 2]),
+            ([], []),
+            (['a', 'key-that-does-not-exist'], [1, None])
+    ]:
+        actual_id = artifact_identity(data, id_keys=id_keys)
+        assert actual_id == expected_id
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
index 9f78a35..eff017f 100644
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -1,197 +1,213 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 import copy
 import logging
 import os
 import requests
 import re

-from typing import Dict, Optional, Tuple
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple

 from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
 from swh.loader.package import DEFAULT_PARAMS

 logger = logging.getLogger(__name__)


 DOWNLOAD_HASHES = set(['sha1', 'sha256', 'length'])

 # https://github.com/jonschlinkert/author-regex
 _author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'

 _EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}


 def api_info(url: str) -> Dict:
     """Basic api client to retrieve information on project. This deals with
        fetching json metadata about pypi projects.

     Args:
         url (str): The api url (e.g PyPI, npm, etc...)

     Raises:
         ValueError in case of query failures (for some reasons: 404, ...)

     Returns:
         The associated response's information dict

     """
     response = requests.get(url, **DEFAULT_PARAMS)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             url, response.status_code))
     return response.json()


 def download(url: str, dest: str, hashes: Dict = {},
              filename: Optional[str] = None,
              auth: Optional[Tuple[str, str]] = None) -> Tuple[str, Dict]:
     """Download a remote tarball from url, uncompresses and computes swh hashes
        on it.

     Args:
         url: Artifact uri to fetch, uncompress and hash
         dest: Directory to write the archive to
         hashes: Dict of expected hashes (key is the hash algo) for the
             artifact to download (those hashes are expected to be hex string)
         auth: Optional tuple of login/password (for http authentication
             service, e.g. deposit)

     Raises:
         ValueError in case of any error when fetching/computing (length,
         checksums mismatched...)

     Returns:
         Tuple of local (filepath, hashes of filepath)

     """
     params = copy.deepcopy(DEFAULT_PARAMS)
     if auth is not None:
         params['auth'] = auth
     response = requests.get(url, **params, stream=True)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             url, response.status_code))

     filename = filename if filename else os.path.basename(url)
     logger.debug('filename: %s', filename)
     filepath = os.path.join(dest, filename)
     logger.debug('filepath: %s', filepath)

     h = MultiHash(hash_names=DOWNLOAD_HASHES)
     with open(filepath, 'wb') as f:
         for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
             h.update(chunk)
             f.write(chunk)

     # Also check the expected hashes if provided
     if hashes:
         actual_hashes = h.hexdigest()
         for algo_hash in hashes.keys():
             actual_digest = actual_hashes[algo_hash]
             expected_digest = hashes[algo_hash]
             if actual_digest != expected_digest:
                 raise ValueError(
                     'Failure when fetching %s. '
                     'Checksum mismatched: %s != %s' % (
                         url, expected_digest, actual_digest))

     computed_hashes = h.hexdigest()
     length = computed_hashes.pop('length')
     extrinsic_metadata = {
         'length': length,
         'filename': filename,
         'checksums': computed_hashes,
     }

     logger.debug('extrinsic_metadata: %s', extrinsic_metadata)

     return filepath, extrinsic_metadata


 def release_name(version: str, filename: Optional[str] = None) -> str:
     if filename:
         return 'releases/%s/%s' % (version, filename)
     return 'releases/%s' % version


 def parse_author(author_str: str) -> Dict[str, str]:
     """
     Parse npm package author string.
     It works with a flexible range of formats, as detailed below::

         name
         name <email>
         name<email>
         name (url)
         name(url)
         name <email> (url)
         name (url) <email>
         name<email> (url)
         name(url)<email>
         <email>
         <email> (url)
         (url) <email>
         (url)

     Args:
         author_str (str): input author string

     Returns:
         dict: A dict that may contain the following keys:
             * name
             * email
             * url

     """
     author = {}
     matches = re.findall(_author_regexp,
                          author_str.replace('<>', '').replace('()', ''),
                          re.M)
     for match in matches:
         if match[0].strip():
             author['name'] = match[0].strip()
         if match[1].strip():
             author['email'] = match[1].strip()
         if match[2].strip():
             author['url'] = match[2].strip()
     return author


 def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]:
     """Transform an author like dict to an expected swh like dict
        (values are bytes)

     """
     name = author.get('name')
     email = author.get('email')

     fullname = None

     if name and email:
         fullname = '%s <%s>' % (name, email)
     elif name:
         fullname = name

     if not fullname:
         r = _EMPTY_AUTHOR
     else:
         r = {
             'fullname': fullname.encode('utf-8') if fullname else None,
             'name': name.encode('utf-8') if name else None,
             'email': email.encode('utf-8') if email else None
         }
     return r
+
+
+def artifact_identity(d: Mapping[str, Any],
+                      id_keys: Sequence[str]) -> List[Any]:
+    """Compute the primary key for a dict using the id_keys as primary key
+       composite.
+
+    Args:
+        d: A dict entry to compute the primary key on
+        id_keys: Sequence of keys to use as primary key
+
+    Returns:
+        The identity for that dict entry
+
+    """
+    return [d.get(k) for k in id_keys]
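Worth noting about the moved helper: keys missing from the dict contribute None rather than raising, so an empty key list yields an empty (always-equal) identity, and a key absent from both artifacts still compares equal. A short sketch mirroring the relocated test::

    from swh.loader.package.utils import artifact_identity

    artifact = {'time': 944729610, 'length': 221837, 'version': '0.1.0'}

    assert artifact_identity(artifact, id_keys=['time', 'version']) \
        == [944729610, '0.1.0']
    # absent keys degrade to None instead of raising KeyError
    assert artifact_identity(artifact, id_keys=['sha256']) == [None]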