def artifact_to_revision_id(
        known_artifacts: Dict, artifact_metadata: Dict) -> Optional[bytes]:
    """Given metadata artifact, solves the associated revision id.

    Two layouts of the 'original_artifact' entry are handled (column
    metadata in 'revision'):

    - old format sample::

        {
            'original_artifact': {
                'sha256': "<hex digest>",
                ...
            },
            ...
        }

    - new format sample::

        {
            'original_artifact': [{
                'checksums': {
                    'sha256': "<hex digest>",
                    ...
                },
            }],
            ...
        }

    Args:
        known_artifacts: mapping whose keys are the candidate revision ids
            and whose values are dicts holding an 'original_artifact'
            entry in one of the two layouts above.
        artifact_metadata: dict providing the looked-up checksum under
            ``['digests']['sha256']``.

    Returns:
        The key of the first entry of ``known_artifacts`` (in iteration
        order) whose recorded sha256 equals the looked-up one, or None
        when no entry matches.

    Raises:
        KeyError: when 'digests'/'sha256' is missing from
            ``artifact_metadata``, or an inspected known artifact lacks
            the keys of its layout.
        AssertionError: when an 'original_artifact' entry is neither a
            dict nor a list.

    """
    sha256 = artifact_metadata['digests']['sha256']
    for rev_id, known_artifact in known_artifacts.items():
        original_artifact = known_artifact['original_artifact']
        if isinstance(original_artifact, dict):
            # previous loader-pypi version stored metadata as a single dict
            if sha256 == original_artifact['sha256']:
                return rev_id
            continue
        # Raised explicitly (instead of a bare ``assert``) so the check
        # still runs under ``python -O``; the exception type is unchanged.
        if not isinstance(original_artifact, list):
            raise AssertionError(
                "unexpected 'original_artifact' type: %s"
                % type(original_artifact).__name__)
        # current loader-pypi stores metadata as a list of dicts
        for artifact in original_artifact:
            if sha256 == artifact['checksums']['sha256']:
                return rev_id
    return None
def test_pypi_artifact_to_revision_id_none():
    """No revision id is resolved when no known artifact matches

    """
    sha256 = '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec'  # noqa
    looked_up = {'digests': {'sha256': sha256}}

    # nothing known at all
    assert artifact_to_revision_id({}, looked_up) is None

    # a single known artifact (dict layout) carrying another checksum
    unrelated = {
        'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92': {
            'original_artifact': {'sha256': 'something-irrelevant'},
        },
    }
    assert artifact_to_revision_id(unrelated, looked_up) is None


def test_pypi_artifact_to_revision_id_old_loader_version():
    """The dict layout of 'original_artifact' is resolved

    """
    sha256 = '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec'  # noqa
    looked_up = {'digests': {'sha256': sha256}}

    mismatching_id = hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92')
    matching_id = hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116')

    # the mismatching entry comes first, the matching one second
    known = {
        mismatching_id: {
            'original_artifact': {'sha256': "something-wrong"},
        },
        matching_id: {
            'original_artifact': {'sha256': sha256},
        },
    }

    resolved = artifact_to_revision_id(known, looked_up)
    assert resolved == matching_id


def test_pypi_artifact_to_revision_id_current_loader_version():
    """The list-of-dicts layout of 'original_artifact' is resolved

    """
    sha256 = '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec'  # noqa
    looked_up = {'digests': {'sha256': sha256}}

    matching_id = hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92')
    mismatching_id = hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116')

    # the matching entry comes first, the mismatching one second
    known = {
        matching_id: {
            'original_artifact': [{'checksums': {'sha256': sha256}}],
        },
        mismatching_id: {
            'original_artifact': [
                {'checksums': {'sha256': 'something-wrong'}},
            ],
        },
    }

    resolved = artifact_to_revision_id(known, looked_up)
    assert resolved == matching_id


def test_pypi_artifact_to_revision_id_failures():
    """Incomplete artifact metadata raises KeyError on the missing key

    """
    without_sha256 = {'digests': {}}
    with pytest.raises(KeyError, match='sha256'):
        assert artifact_to_revision_id({}, without_sha256)

    without_digests = {'something': 'wrong'}
    with pytest.raises(KeyError, match='digests'):
        assert artifact_to_revision_id({}, without_digests)