diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -13,7 +13,7 @@
 import attr
 import iso8601

-from swh.loader.package.loader import BasePackageInfo, PackageLoader
+from swh.loader.package.loader import BaseManifestPackageInfo, PackageLoader
 from swh.loader.package.utils import release_name
 from swh.model.model import (
     Person,
@@ -34,7 +34,7 @@


 @attr.s
-class ArchivePackageInfo(BasePackageInfo):
+class ArchivePackageInfo(BaseManifestPackageInfo):
     raw_info = attr.ib(type=Dict[str, Any])
     length = attr.ib(type=int)
     """Size of the archive file"""
@@ -139,21 +139,14 @@
             #   versioned package
             yield release_name(version), p_info

-    def extid_from_reference_artifact(self, reference_artifact: Dict) -> bytes:
-        reference_artifact_info = ArchivePackageInfo.from_metadata(reference_artifact)
-        return reference_artifact_info.extid(manifest_format=self.extid_manifest_format)
-
-    def resolve_revision_from(
-        self, known_artifacts: Dict, p_info: ArchivePackageInfo
-    ) -> Optional[bytes]:
-        extid = p_info.extid(manifest_format=self.extid_manifest_format)
-        for rev_id, known_artifact in known_artifacts.items():
-            logging.debug("known_artifact: %s", known_artifact)
-            reference_artifact = known_artifact["extrinsic"]["raw"]
-            known_extid = self.extid_from_reference_artifact(reference_artifact)
-            if extid == known_extid:
-                return rev_id
-        return None
+    def new_packageinfo_to_extid(self, p_info: ArchivePackageInfo) -> Optional[bytes]:
+        return p_info.extid(manifest_format=self.extid_manifest_format)
+
+    def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[bytes]:
+        known_artifact_info = ArchivePackageInfo.from_metadata(
+            known_artifact["extrinsic"]["raw"]
+        )
+        return known_artifact_info.extid(manifest_format=self.extid_manifest_format)

     def build_revision(
         self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git
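For context, a minimal sketch of the manifest hashing that ArchivePackageInfo.extid() performs; the template and field values below are illustrative assumptions, not the loader's actual defaults:

# Hypothetical sketch of the manifest-based extid; template and values assumed.
import hashlib
import string

manifest_format = string.Template("$time $url $length $version")
raw_info = {
    "time": "2014-05-07T22:03:00+00:00",  # hypothetical artifact metadata
    "url": "https://example.org/pkg-1.0.tar.gz",
    "length": 221837,
    "version": "1.0",
}
manifest = manifest_format.substitute({k: str(v) for k, v in raw_info.items()})
extid = hashlib.sha256(manifest.encode()).digest()  # 32 raw bytes, not hex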
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -10,13 +10,13 @@
 from os import path
 import re
 import string
-from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple

 import attr
 import dateutil.parser
 from debian.deb822 import Deb822

-from swh.loader.package.loader import BasePackageInfo, PackageLoader
+from swh.loader.package.loader import BaseManifestPackageInfo, PackageLoader
 from swh.loader.package.utils import release_name
 from swh.model.model import (
     Person,
@@ -34,7 +34,7 @@


 @attr.s
-class CRANPackageInfo(BasePackageInfo):
+class CRANPackageInfo(BaseManifestPackageInfo):
     raw_info = attr.ib(type=Dict[str, Any])
     version = attr.ib(type=str)

@@ -87,24 +87,8 @@
             if version == p_info.version:
                 yield release_name(version), p_info

-    def extid_from_known_artifact(self, known_artifact: Dict) -> bytes:
-        return CRANPackageInfo.from_metadata(known_artifact).extid()
-
-    def resolve_revision_from(
-        self, known_artifacts: Mapping[bytes, Mapping], p_info: CRANPackageInfo,
-    ) -> Optional[bytes]:
-        """Given known_artifacts per revision, try to determine the revision for
-        artifact_metadata
-
-        """
-        new_extid = p_info.extid()
-        for rev_id, known_artifact_meta in known_artifacts.items():
-            logging.debug("known_artifact_meta: %s", known_artifact_meta)
-            known_artifact = known_artifact_meta["extrinsic"]["raw"]
-            known_extid = self.extid_from_known_artifact(known_artifact)
-            if new_extid == known_extid:
-                return rev_id
-        return None
+    def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[bytes]:
+        return CRANPackageInfo.from_metadata(known_artifact["extrinsic"]["raw"]).extid()

     def build_revision(
         self, p_info: CRANPackageInfo, uncompressed_path: str, directory: Sha1Git
- - """ - artifacts_to_fetch = p_info.files - if not artifacts_to_fetch: - return None - - new_dsc_files = [ - file for (name, file) in p_info.files.items() if name.endswith(".dsc") - ] - - if len(new_dsc_files) != 1: - raise ValueError( - f"Expected exactly one new .dsc file for package {p_info.name}, " - f"got {len(new_dsc_files)}" - ) - - new_dsc_sha256 = new_dsc_files[0].sha256 - - for rev_id, known_artifacts in known_package_artifacts.items(): - - if new_dsc_sha256 == _artifact_to_dsc_sha256(known_artifacts, p_info.url): - return rev_id - - return None - - def _artifact_to_dsc_sha256(known_artifacts: Dict, url: str) -> Optional[str]: extrinsic = known_artifacts.get("extrinsic") if not extrinsic: @@ -272,7 +270,7 @@ dsc = [file for (name, file) in known_p_info.files.items() if name.endswith(".dsc")] if len(dsc) != 1: - raise ValueError( + raise DscCountError( f"Expected exactly one known .dsc file for package {known_p_info.name}, " f"got {len(dsc)}" ) @@ -367,7 +365,7 @@ for filename, fileinfo in p_info.files.items(): if filename.endswith(".dsc"): if dsc_name: - raise ValueError( + raise DscCountError( "Package %s_%s references several dsc files." % (p_info.name, p_info.version) ) diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py --- a/swh/loader/package/debian/tests/test_debian.py +++ b/swh/loader/package/debian/tests/test_debian.py @@ -19,7 +19,6 @@ extract_package, get_intrinsic_package_metadata, prepare_person, - resolve_revision_from, uid_to_person, ) from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats @@ -418,17 +417,18 @@ """Solving revision with empty data will result in unknown revision """ + loader = DebianLoader(None, None, None, None) empty_artifact = { "name": PACKAGE_FILES["name"], "version": PACKAGE_FILES["version"], } for package_artifacts in [empty_artifact, PACKAGE_FILES]: p_info = DebianPackageInfo.from_metadata(package_artifacts, url=URL) - actual_revision = resolve_revision_from({}, p_info) + actual_revision = loader.resolve_revision_from({}, p_info) assert actual_revision is None for known_artifacts in [{}, PACKAGE_FILES]: - actual_revision = resolve_revision_from( + actual_revision = loader.resolve_revision_from( known_artifacts, DebianPackageInfo.from_metadata(empty_artifact, url=URL) ) assert actual_revision is None @@ -441,7 +441,7 @@ # ... 
diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py
--- a/swh/loader/package/debian/tests/test_debian.py
+++ b/swh/loader/package/debian/tests/test_debian.py
@@ -19,7 +19,6 @@
     extract_package,
     get_intrinsic_package_metadata,
     prepare_person,
-    resolve_revision_from,
     uid_to_person,
 )
 from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
@@ -418,17 +417,18 @@
     """Solving revision with empty data will result in unknown revision

     """
+    loader = DebianLoader(None, None, None, None)
     empty_artifact = {
         "name": PACKAGE_FILES["name"],
         "version": PACKAGE_FILES["version"],
     }
     for package_artifacts in [empty_artifact, PACKAGE_FILES]:
         p_info = DebianPackageInfo.from_metadata(package_artifacts, url=URL)
-        actual_revision = resolve_revision_from({}, p_info)
+        actual_revision = loader.resolve_revision_from({}, p_info)
         assert actual_revision is None

     for known_artifacts in [{}, PACKAGE_FILES]:
-        actual_revision = resolve_revision_from(
+        actual_revision = loader.resolve_revision_from(
             known_artifacts, DebianPackageInfo.from_metadata(empty_artifact, url=URL)
         )
         assert actual_revision is None
@@ -441,7 +441,7 @@
             # ... removed the unnecessary intermediary data
         }
     }
-    assert not resolve_revision_from(
+    assert not loader.resolve_revision_from(
         known_package_artifacts, DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
     )

@@ -450,6 +450,7 @@
     """Solving revision with inconsistent data will result in unknown revision

     """
+    loader = DebianLoader(None, None, None, None)
     artifact_metadata = PACKAGE_FILES2
     p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
     expected_revision_id = (
@@ -462,7 +463,7 @@
         }
     }

-    actual_revision = resolve_revision_from(known_package_artifacts, p_info)
+    actual_revision = loader.resolve_revision_from(known_package_artifacts, p_info)

     assert actual_revision is None

@@ -471,6 +472,7 @@
     """Solving revision with consistent data will solve the revision

     """
+    loader = DebianLoader(None, None, None, None)
     artifact_metadata = PACKAGE_FILES
     p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
     expected_revision_id = (
@@ -494,13 +496,14 @@
         }
     }

-    actual_revision = resolve_revision_from(known_package_artifacts, p_info)
+    actual_revision = loader.resolve_revision_from(known_package_artifacts, p_info)

     assert actual_revision == expected_revision_id


 def test_debian_resolve_revision_from_corrupt_known_artifact():
     """Too many or not enough .dsc files in the known_artifacts dict"""
+    loader = DebianLoader(None, None, None, None)
     artifact_metadata = PACKAGE_FILES
     p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
     expected_revision_id = (
@@ -523,17 +526,16 @@

     # Too many .dsc
     files["another.dsc"] = files["cicero_0.7.2-3.dsc"]
-    with pytest.raises(ValueError, match="exactly one known .dsc"):
-        resolve_revision_from(known_package_artifacts, p_info)
+    assert loader.resolve_revision_from(known_package_artifacts, p_info) is None

     # Not enough .dsc
     del files["another.dsc"]
     del files["cicero_0.7.2-3.dsc"]
-    with pytest.raises(ValueError, match="exactly one known .dsc"):
-        resolve_revision_from(known_package_artifacts, p_info)
+    assert loader.resolve_revision_from(known_package_artifacts, p_info) is None


 def test_debian_resolve_revision_from_corrupt_new_artifact():
+    loader = DebianLoader(None, None, None, None)
     artifact_metadata = PACKAGE_FILES

     files = PACKAGE_FILES["files"]
@@ -543,12 +545,10 @@
     # Too many .dsc
     files["another.dsc"] = files["cicero_0.7.2-3.dsc"]
     p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
-    with pytest.raises(ValueError, match="exactly one new .dsc"):
-        resolve_revision_from(PACKAGE_FILES, p_info)
+    assert loader.resolve_revision_from(PACKAGE_FILES, p_info) is None

     # Not enough .dsc
     del files["another.dsc"]
     del files["cicero_0.7.2-3.dsc"]
     p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
-    with pytest.raises(ValueError, match="exactly one new .dsc"):
-        resolve_revision_from(PACKAGE_FILES, p_info)
+    assert loader.resolve_revision_from(PACKAGE_FILES, p_info) is None
diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -102,6 +102,11 @@
         ],
     )

+    def extid(self) -> None:
+        # For now, we don't try to deduplicate deposits. There is little point anyway,
+        # as it only happens when the exact same tarball was deposited twice.
+        return None
+

 class DepositLoader(PackageLoader[DepositPackageInfo]):
     """Load a deposited artifact into swh archive.
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -112,6 +112,18 @@

     # TODO: add support for metadata for directories and contents

+    def extid(self) -> Optional[bytes]:
+        """Returns a unique intrinsic identifier of this package info,
+        or None if this package info is not 'deduplicatable' (meaning that
+        we will always load it, instead of checking the ExtID storage
+        to see if we already did)"""
+        return None
+
+
+class BaseManifestPackageInfo(BasePackageInfo):
+    """Base class for PackageInfo classes that provide an extid based on
+    the hash of a manifest made of some of their attributes."""
+
     @property
     def MANIFEST_FORMAT(self) -> string.Template:
         """A string.Template object used to format a manifest, which is hashed
@@ -121,8 +133,7 @@
             f"or an override of extid()"
         )

-    def extid(self) -> bytes:
-        """Returns a unique intrinsic identifier of this package info"""
+    def extid(self) -> Optional[bytes]:
         manifest = self.MANIFEST_FORMAT.substitute(
             {k: str(v) for (k, v) in attr.asdict(self).items()}
         )
@@ -238,6 +249,14 @@
             revision.id: revision.metadata for revision in known_revisions if revision
         }

+    def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[bytes]:
+        return p_info.extid()
+
+    def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[bytes]:
+        """Returns a unique intrinsic identifier of a downloaded artifact,
+        used to check if a new artifact is the same."""
+        return None
+
     def resolve_revision_from(
         self, known_artifacts: Dict, p_info: TPackageInfo,
     ) -> Optional[bytes]:
@@ -255,6 +274,21 @@
             None or revision identifier

         """
+        if not known_artifacts:
+            # No known artifact, no need to compute the artifact's extid
+            return None
+
+        new_extid = self.new_packageinfo_to_extid(p_info)
+        if new_extid is None:
+            # This loader does not support deduplication, at least not for this
+            # artifact.
+            return None
+
+        for rev_id, known_artifact in known_artifacts.items():
+            known_extid = self.known_artifact_to_extid(known_artifact)
+            if new_extid == known_extid:
+                return rev_id
+        return None

     def download_package(
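To make the new dispatch concrete, here is a sketch of how a subclass could plug into it; the ChecksumLoader name and the "checksum" metadata field are assumptions for illustration only:

# Hypothetical loader that deduplicates on a "checksum" field stored in the
# known artifact's extrinsic metadata; the inherited resolve_revision_from()
# then compares this against the new package's extid().
from typing import Dict, Optional

from swh.loader.package.loader import PackageLoader


class ChecksumLoader(PackageLoader):
    def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[bytes]:
        checksum = known_artifact.get("extrinsic", {}).get("raw", {}).get("checksum")
        return checksum.encode("ascii") if checksum else None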
diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -50,6 +50,9 @@
             raw_info=metadata,
         )

+    def extid(self) -> bytes:
+        return self.integrity.encode("ascii")
+

 class NixGuixLoader(PackageLoader[NixGuixPackageInfo]):
     """Load sources from a sources.json file. This loader is used to load
@@ -147,36 +150,19 @@
             ret[revision.id] = revision.metadata
         return ret

-    def _get_integrity_from_artifact(
-        self, known_artifact: Dict, rev_id: bytes
-    ) -> Optional[str]:
+    def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[bytes]:
         try:
-            return known_artifact["extrinsic"]["raw"]["integrity"]
+            return known_artifact["extrinsic"]["raw"]["integrity"].encode("ascii")
         except KeyError as e:
             logger.exception(
                 "Unexpected metadata revision structure detected: %(context)s",
-                {
-                    "context": {
-                        "revision": hashutil.hash_to_hex(rev_id),
-                        "reason": str(e),
-                        "known_artifact": known_artifact,
-                    }
-                },
+                {"context": {"reason": str(e), "known_artifact": known_artifact,}},
             )
             # metadata field for the revision is not as expected by the loader
             # nixguix. We consider this not the right revision and continue
             # checking the other revisions
             return None

-    def resolve_revision_from(
-        self, known_artifacts: Dict, p_info: NixGuixPackageInfo,
-    ) -> Optional[bytes]:
-        for rev_id, known_artifact in known_artifacts.items():
-            known_integrity = self._get_integrity_from_artifact(known_artifact, rev_id)
-            if p_info.integrity == known_integrity:
-                return rev_id
-        return None
-
     def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
         """We add a branch to the snapshot called 'evaluation' pointing to the
         revision used to generate the sources.json file. This revision
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -26,7 +26,7 @@
 from swh.loader.tests import assert_last_visit_matches
 from swh.loader.tests import check_snapshot as check_snapshot_full
 from swh.loader.tests import get_stats
-from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import ExtendedObjectType, ExtendedSWHID
 from swh.model.model import (
     MetadataAuthority,
@@ -696,16 +696,8 @@
         actual_detections.append(record.args["context"])

     expected_detections = [
-        {
-            "revision": hash_to_hex(old_revision.id),
-            "reason": "'integrity'",
-            "known_artifact": old_revision.metadata,
-        },
-        {
-            "revision": hash_to_hex(old_revision.id),
-            "reason": "'integrity'",
-            "known_artifact": old_revision.metadata,
-        },
+        {"reason": "'integrity'", "known_artifact": old_revision.metadata,},
+        {"reason": "'integrity'", "known_artifact": old_revision.metadata,},
     ]

     # as many calls as there are sources listed in the sources.json
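For reference, the integrity value used as the nixguix extid above is the SRI string from a sources.json entry, used verbatim; a tiny sketch with a made-up value:

# Hypothetical SRI integrity string, as listed in a sources.json entry.
integrity = "sha256-23q8c2v5wBwcgljv2wpmAqGis4UHetiGBPB+nwAHmHs="
extid = integrity.encode("ascii")  # the string itself is the identifier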
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -19,6 +19,7 @@
     RawExtrinsicMetadataCore,
 )
 from swh.loader.package.utils import api_info, cached_method, release_name
+from swh.model.hashutil import hash_to_bytes
 from swh.model.model import (
     MetadataAuthority,
     MetadataAuthorityType,
@@ -79,6 +80,9 @@
         ],
     )

+    def extid(self) -> bytes:
+        return hash_to_bytes(self.shasum)
+

 class NpmLoader(PackageLoader[NpmPackageInfo]):
     """Load npm origin's artifact releases into swh archive.
@@ -133,10 +137,15 @@
             )
             yield release_name(version), p_info

-    def resolve_revision_from(
-        self, known_artifacts: Dict, p_info: NpmPackageInfo
-    ) -> Optional[bytes]:
-        return artifact_to_revision_id(known_artifacts, p_info)
+    @staticmethod
+    def known_artifact_to_extid(known_artifact: Dict) -> Optional[bytes]:
+        extid_str = _artifact_to_sha1(known_artifact)
+        if extid_str is None:
+            return None
+        try:
+            return hash_to_bytes(extid_str)
+        except ValueError:
+            return None

     def build_revision(
         self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git
@@ -182,10 +191,8 @@
     return r


-def artifact_to_revision_id(
-    known_artifacts: Dict, p_info: NpmPackageInfo
-) -> Optional[bytes]:
-    """Given metadata artifact, solves the associated revision id.
+def _artifact_to_sha1(known_artifact: Dict) -> Optional[str]:
+    """Returns the sha1 from an NPM 'original_artifact' dict

     The following code allows dealing with 2 metadata formats:

@@ -210,17 +217,6 @@
     }

     """
-    shasum = p_info.shasum
-    for rev_id, known_artifact in known_artifacts.items():
-        original_hash = _artifact_to_sha1(known_artifact)
-        if shasum == original_hash:
-            return rev_id
-
-    return None
-
-
-def _artifact_to_sha1(known_artifact: Dict) -> Optional[str]:
-    """Returns the sha1 from an NPM 'original_artifact' dict"""
     known_original_artifact = known_artifact.get("original_artifact")
     if not known_original_artifact:
         # previous loader-npm version kept original artifact elsewhere
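A quick sketch of the two known-artifact layouts _artifact_to_sha1 must accept (the digest reuses the value from the tests below) and the extid they both map to:

# Both historical metadata layouts yield the same 20-byte extid.
sha1_hex = "05181c12cd8c22035dd31155656826b85745da37"

old_format = {"package_source": {"sha1": sha1_hex}}  # older loader-npm versions
new_format = {"original_artifact": [{"checksums": {"sha1": sha1_hex}}]}

extid = bytes.fromhex(sha1_hex)  # what hash_to_bytes(sha1_hex) returns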
"b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92" + assert ( + NpmLoader.known_artifact_to_extid( + {"original_artifact": [{"checksums": {"sha1": "something-wrong"},}],}, + ) + is None ) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -18,6 +18,7 @@ RawExtrinsicMetadataCore, ) from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method, release_name +from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, @@ -56,6 +57,9 @@ ], ) + def extid(self) -> bytes: + return hash_to_bytes(self.sha256) + class PyPILoader(PackageLoader[PyPIPackageInfo]): """Load pypi origin's artifact releases into swh archive. @@ -113,10 +117,15 @@ for version, p_info in res: yield release_name(version, p_info.filename), p_info - def resolve_revision_from( - self, known_artifacts: Dict, p_info: PyPIPackageInfo - ) -> Optional[bytes]: - return artifact_to_revision_id(known_artifacts, p_info) + @staticmethod + def known_artifact_to_extid(known_artifact: Dict) -> Optional[bytes]: + extid_str = _artifact_to_sha256(known_artifact) + if extid_str is None: + return None + try: + return hash_to_bytes(extid_str) if extid_str else None + except ValueError: + return None def build_revision( self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git @@ -155,10 +164,8 @@ ) -def artifact_to_revision_id( - known_artifacts: Dict, p_info: PyPIPackageInfo -) -> Optional[bytes]: - """Given metadata artifact, solves the associated revision id. +def _artifact_to_sha256(known_artifact: Dict) -> Optional[str]: + """Returns the sha256 from a PyPI 'original_artifact' dict The following code allows to deal with 2 metadata formats (column metadata in 'revision') @@ -186,17 +193,6 @@ } """ - sha256 = p_info.sha256 - for rev_id, known_artifact in known_artifacts.items(): - original_sha256 = _artifact_to_sha256(known_artifact) - if sha256 == original_sha256: - return rev_id - - return None - - -def _artifact_to_sha256(known_artifact: Dict) -> Optional[str]: - """Returns the sha256 from a PyPI 'original_artifact' dict""" original_artifact = known_artifact["original_artifact"] if isinstance(original_artifact, dict): # previous loader-pypi version stored metadata as dict diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -15,7 +15,6 @@ from swh.loader.package import __version__ from swh.loader.package.pypi.loader import ( PyPILoader, - artifact_to_revision_id, author, extract_intrinsic_metadata, pypi_api_url, @@ -797,74 +796,38 @@ ) -def test_pypi_artifact_to_revision_id_none(): - """Current loader version should stop soon if nothing can be found - - """ - - class artifact_metadata: - sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" - - assert artifact_to_revision_id({}, artifact_metadata) is None - - known_artifacts = { - "b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92": { - "original_artifact": {"sha256": "something-irrelevant",}, - }, - } - - assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None - - -def test_pypi_artifact_to_revision_id_old_loader_version(): +def test_pypi__known_artifact_to_extid__old_loader_version(): """Current loader version should solve old metadata scheme """ - - class artifact_metadata: - sha256 = 
"6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" - - known_artifacts = { - hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): { - "original_artifact": {"sha256": "something-wrong",}, - }, - hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): { - "original_artifact": { - "sha256": "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec", # noqa - }, - }, - } - - assert artifact_to_revision_id(known_artifacts, artifact_metadata) == hash_to_bytes( - "845673bfe8cbd31b1eaf757745a964137e6f9116" + assert ( + PyPILoader.known_artifact_to_extid( + {"original_artifact": {"sha256": "something-wrong",},} + ) + is None ) + sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" + assert PyPILoader.known_artifact_to_extid( + {"original_artifact": {"sha256": sha256}} + ) == hash_to_bytes(sha256) -def test_pypi_artifact_to_revision_id_current_loader_version(): + +def test_pypi__known_artifact_to_extid__current_loader_version(): """Current loader version should be able to solve current metadata scheme """ + sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" - class artifact_metadata: - sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" - - known_artifacts = { - hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): { - "original_artifact": [ - { - "checksums": { - "sha256": "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec", # noqa - }, - } - ], - }, - hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): { - "original_artifact": [{"checksums": {"sha256": "something-wrong"},}], - }, - } + assert PyPILoader.known_artifact_to_extid( + {"original_artifact": [{"checksums": {"sha256": sha256,},}],} + ) == hash_to_bytes(sha256) - assert artifact_to_revision_id(known_artifacts, artifact_metadata) == hash_to_bytes( - "b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92" + assert ( + PyPILoader.known_artifact_to_extid( + {"original_artifact": [{"checksums": {"sha256": "something-wrong"},}],}, + ) + is None ) # there should not be more than one artifact diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py --- a/swh/loader/package/tests/test_loader.py +++ b/swh/loader/package/tests/test_loader.py @@ -5,11 +5,12 @@ import hashlib import string +from unittest.mock import MagicMock import attr import pytest -from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.loader import BaseManifestPackageInfo, PackageLoader class FakeStorage: @@ -44,13 +45,58 @@ assert actual_load_status2 == {"status": "failed"} -def test_extid(): +def test_resolve_revision_from(): + loader = PackageLoader(None, None) + loader.known_artifact_to_extid = MagicMock( + wraps=lambda known_artifact: known_artifact["key"].encode() + ) + + known_artifacts = { + b"a" * 40: {"key": "extid-of-aaaa"}, + b"b" * 40: {"key": "extid-of-bbbb"}, + } + + p_info = MagicMock() + + # No known artifact -> it would be useless to compute the extid + assert loader.resolve_revision_from({}, p_info) is None + p_info.extid.assert_not_called() + loader.known_artifact_to_extid.assert_not_called() + + p_info.extid.reset_mock() + + # Some artifacts, but the PackageInfo does not support extids + p_info.extid.return_value = None + assert loader.resolve_revision_from(known_artifacts, p_info) is None + p_info.extid.assert_called_once() + loader.known_artifact_to_extid.assert_not_called() + + p_info.extid.reset_mock() + + # Some artifacts, and the PackageInfo is not one of them 
diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py
--- a/swh/loader/package/tests/test_loader.py
+++ b/swh/loader/package/tests/test_loader.py
@@ -5,11 +5,12 @@

 import hashlib
 import string
+from unittest.mock import MagicMock

 import attr
 import pytest

-from swh.loader.package.loader import BasePackageInfo, PackageLoader
+from swh.loader.package.loader import BaseManifestPackageInfo, PackageLoader


 class FakeStorage:
@@ -44,13 +45,58 @@
     assert actual_load_status2 == {"status": "failed"}


-def test_extid():
+def test_resolve_revision_from():
+    loader = PackageLoader(None, None)
+    loader.known_artifact_to_extid = MagicMock(
+        wraps=lambda known_artifact: known_artifact["key"].encode()
+    )
+
+    known_artifacts = {
+        b"a" * 40: {"key": "extid-of-aaaa"},
+        b"b" * 40: {"key": "extid-of-bbbb"},
+    }
+
+    p_info = MagicMock()
+
+    # No known artifact -> it would be useless to compute the extid
+    assert loader.resolve_revision_from({}, p_info) is None
+    p_info.extid.assert_not_called()
+    loader.known_artifact_to_extid.assert_not_called()
+
+    p_info.extid.reset_mock()
+
+    # Some artifacts, but the PackageInfo does not support extids
+    p_info.extid.return_value = None
+    assert loader.resolve_revision_from(known_artifacts, p_info) is None
+    p_info.extid.assert_called_once()
+    loader.known_artifact_to_extid.assert_not_called()
+
+    p_info.extid.reset_mock()
+
+    # Some artifacts, and the PackageInfo is not one of them (ie. cache miss)
+    p_info.extid.return_value = b"extid-of-cccc"
+    assert loader.resolve_revision_from(known_artifacts, p_info) is None
+    p_info.extid.assert_called_once()
+    loader.known_artifact_to_extid.assert_any_call({"key": "extid-of-aaaa"})
+    loader.known_artifact_to_extid.assert_any_call({"key": "extid-of-bbbb"})
+
+    p_info.extid.reset_mock()
+    loader.known_artifact_to_extid.reset_mock()
+
+    # Some artifacts, and the PackageInfo is one of them (ie. cache hit)
+    p_info.extid.return_value = b"extid-of-aaaa"
+    assert loader.resolve_revision_from(known_artifacts, p_info) == b"a" * 40
+    p_info.extid.assert_called_once()
+    loader.known_artifact_to_extid.assert_called_once_with({"key": "extid-of-aaaa"})
+
+
+def test_manifest_extid():
     """Compute primary key should return the right identity

     """

     @attr.s
-    class TestPackageInfo(BasePackageInfo):
+    class TestPackageInfo(BaseManifestPackageInfo):
         a = attr.ib()
         b = attr.ib()
         length = attr.ib()