diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -212,21 +212,27 @@ """ shasum = p_info.shasum for rev_id, known_artifact in known_artifacts.items(): - known_original_artifact = known_artifact.get("original_artifact") - if not known_original_artifact: - # previous loader-npm version kept original artifact elsewhere - known_original_artifact = known_artifact.get("package_source") - if not known_original_artifact: - continue - original_hash = known_original_artifact["sha1"] - else: - assert isinstance(known_original_artifact, list) - original_hash = known_original_artifact[0]["checksums"]["sha1"] + original_hash = _artifact_to_sha1(known_artifact) if shasum == original_hash: return rev_id + return None +def _artifact_to_sha1(known_artifact: Dict) -> Optional[str]: + """Returns the sha1 from an NPM 'original_artifact' dict""" + known_original_artifact = known_artifact.get("original_artifact") + if not known_original_artifact: + # previous loader-npm version kept original artifact elsewhere + known_original_artifact = known_artifact.get("package_source") + if not known_original_artifact: + return None + return known_original_artifact["sha1"] + else: + assert isinstance(known_original_artifact, list) + return known_original_artifact[0]["checksums"]["sha1"] + + def _author_str(author_data: Union[Dict, List, str]) -> str: """Parse author from package.json author fields diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -188,22 +188,33 @@ """ sha256 = p_info.sha256 for rev_id, known_artifact in known_artifacts.items(): - original_artifact = known_artifact["original_artifact"] - if isinstance(original_artifact, dict): - # previous loader-pypi version stored metadata as dict - original_sha256 = original_artifact["sha256"] - if sha256 == original_sha256: - return rev_id - continue - # new pypi loader actually store metadata dict differently... - assert isinstance(original_artifact, list) - # current loader-pypi stores metadata as list of dict - for original_artifact in known_artifact["original_artifact"]: - if sha256 == original_artifact["checksums"]["sha256"]: - return rev_id + original_sha256 = _artifact_to_sha256(known_artifact) + if sha256 == original_sha256: + return rev_id + return None +def _artifact_to_sha256(known_artifact: Dict) -> Optional[str]: + """Returns the sha256 from a PyPI 'original_artifact' dict""" + original_artifact = known_artifact["original_artifact"] + if isinstance(original_artifact, dict): + # previous loader-pypi version stored metadata as dict + return original_artifact["sha256"] + # new pypi loader actually store metadata dict differently... + assert isinstance(original_artifact, list) + # current loader-pypi stores metadata as list of dict + if len(known_artifact["original_artifact"]) == 0: + return None + elif len(known_artifact["original_artifact"]) == 1: + return original_artifact[0]["checksums"]["sha256"] + else: + raise ValueError( + f"Expected exactly one PyPI original_artifact, got " + f"{len(known_artifact['original_artifact'])}" + ) + + def pypi_api_url(url: str) -> str: """Compute api url from a project url diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -867,6 +867,20 @@ "b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92" ) + # there should not be more than one artifact + with pytest.raises(ValueError): + artifact_to_revision_id( + { + hash_to_bytes("845673bfe8cbd31b1eaf757745a964137e6f9116"): { + "original_artifact": [ + {"checksums": {"sha256": artifact_metadata.sha256,},}, + {"checksums": {"sha256": artifact_metadata.sha256,},}, + ], + }, + }, + artifact_metadata, + ) + def test_pypi_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion