D5423.diff

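Summary: this diff removes the legacy, revision-metadata-based deduplication from the package loaders. The `known_artifact_to_extid` hook and the `resolve_revision_from_artifacts` method are dropped from `PackageLoader` and from their per-loader overrides (archive, CRAN, Debian, NixGuix, NPM, PyPI); loaders stop writing `intrinsic`/`extrinsic`/`original_artifact` dicts into `Revision.metadata`; and the now-unused test helpers `check_metadata` and `check_metadata_paths` are deleted together with the tests that exercised them. Deduplication now relies solely on ExtIDs: the removed fallback in `swh/loader/package/loader.py` was explicitly marked "TODO: remove this after we finished migrating to ExtIDs".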

diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -144,12 +144,6 @@
) -> Optional[PartialExtID]:
return p_info.extid(manifest_format=self.extid_manifest_format)
- def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]:
- known_artifact_info = ArchivePackageInfo.from_metadata(
- known_artifact["extrinsic"]["raw"]
- )
- return known_artifact_info.extid(manifest_format=self.extid_manifest_format)
-
def build_revision(
self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
@@ -169,12 +163,4 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {},
- "extrinsic": {
- "provider": self.url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py
--- a/swh/loader/package/archive/tests/test_archive.py
+++ b/swh/loader/package/archive/tests/test_archive.py
@@ -10,7 +10,6 @@
import pytest
from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo
-from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Snapshot, SnapshotBranch, TargetType
@@ -110,37 +109,6 @@
assert_last_visit_matches(swh_storage, url, status="partial", type="tar")
-def test_archive_check_revision_metadata_structure(swh_storage, requests_mock_datadir):
- loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS)
-
- actual_load_status = loader.load()
- assert actual_load_status["status"] == "eventful"
- assert actual_load_status["snapshot_id"] is not None
-
- assert_last_visit_matches(swh_storage, URL, status="full", type="tar")
-
- expected_revision_id = hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3")
- revision = swh_storage.revision_get([expected_revision_id])[0]
- assert revision is not None
-
- check_metadata_paths(
- revision.metadata,
- paths=[
- ("intrinsic", dict),
- ("extrinsic.provider", str),
- ("extrinsic.when", str),
- ("extrinsic.raw", dict),
- ("original_artifact", list),
- ],
- )
-
- for original_artifact in revision.metadata["original_artifact"]:
- check_metadata_paths(
- original_artifact,
- paths=[("filename", str), ("length", int), ("checksums", dict),],
- )
-
-
def test_archive_visit_with_release_artifact_no_prior_visit(
swh_storage, requests_mock_datadir
):
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -16,7 +16,7 @@
import dateutil.parser
from debian.deb822 import Deb822
-from swh.loader.package.loader import BasePackageInfo, PackageLoader, PartialExtID
+from swh.loader.package.loader import BasePackageInfo, PackageLoader
from swh.loader.package.utils import release_name
from swh.model.model import (
Person,
@@ -88,10 +88,6 @@
if version == p_info.version:
yield release_name(version), p_info
- @staticmethod
- def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]:
- return CRANPackageInfo.from_metadata(known_artifact["extrinsic"]["raw"]).extid()
-
def build_revision(
self, p_info: CRANPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
@@ -110,14 +106,6 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {"tool": "DESCRIPTION", "raw": metadata,},
- "extrinsic": {
- "provider": self.url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py
--- a/swh/loader/package/debian/loader.py
+++ b/swh/loader/package/debian/loader.py
@@ -178,21 +178,6 @@
p_info = DebianPackageInfo.from_metadata(meta, url=self.url)
yield release_name(version), p_info
- def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]:
- sha256 = _artifact_to_dsc_sha256(known_artifact, url=self.url)
- if sha256 is None:
- return None
- return (EXTID_TYPE, hash_to_bytes(sha256))
-
- def resolve_revision_from_artifacts(
- self, known_artifacts: Dict, p_info: DebianPackageInfo,
- ) -> Optional[bytes]:
- try:
- return super().resolve_revision_from_artifacts(known_artifacts, p_info)
- except DscCountError:
- # known_artifacts are corrupted, ignore them instead of crashing
- return None
-
def download_package(
self, p_info: DebianPackageInfo, tmpdir: str
) -> List[Tuple[str, Mapping]]:
@@ -252,14 +237,6 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {"tool": "dsc", "raw": attr.asdict(intrinsic_metadata),},
- "extrinsic": {
- "provider": dsc_url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
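Note: the removed Debian override of `resolve_revision_from_artifacts` also swallowed `DscCountError` so that corrupt `known_artifacts` entries were ignored instead of crashing the visit; with artifact-based resolution gone, stored revision metadata is no longer parsed on this path, so the guard has nothing left to protect.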
diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py
--- a/swh/loader/package/debian/tests/test_debian.py
+++ b/swh/loader/package/debian/tests/test_debian.py
@@ -5,7 +5,6 @@
import logging
from os import path
-import random
import pytest
@@ -411,152 +410,3 @@
)
check_snapshot(expected_snapshot, swh_storage)
-
-
-def test_debian_resolve_revision_from_artifacts_edge_cases():
- """Solving revision with empty data will result in unknown revision
-
- """
- loader = DebianLoader(None, None, None, None)
- empty_artifact = {
- "name": PACKAGE_FILES["name"],
- "version": PACKAGE_FILES["version"],
- }
- for package_artifacts in [empty_artifact, PACKAGE_FILES]:
- p_info = DebianPackageInfo.from_metadata(package_artifacts, url=URL)
- actual_revision = loader.resolve_revision_from_artifacts({}, p_info)
- assert actual_revision is None
-
- for known_artifacts in [{}, PACKAGE_FILES]:
- actual_revision = loader.resolve_revision_from_artifacts(
- known_artifacts, DebianPackageInfo.from_metadata(empty_artifact, url=URL)
- )
- assert actual_revision is None
-
- known_package_artifacts = {
- b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07": {
- "extrinsic": {
- # empty
- },
- # ... removed the unnecessary intermediary data
- }
- }
- assert not loader.resolve_revision_from_artifacts(
- known_package_artifacts, DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
- )
-
-
-def test_debian_resolve_revision_from_artifacts_edge_cases_hit_and_miss():
- """Solving revision with inconsistent data will result in unknown revision
-
- """
- loader = DebianLoader(None, None, None, None)
- artifact_metadata = PACKAGE_FILES2
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- expected_revision_id = (
- b"(\x08\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xff\x85\x85O\xfe\xcf\x07" # noqa
- )
- known_package_artifacts = {
- expected_revision_id: {
- "extrinsic": {"raw": PACKAGE_FILES,},
- # ... removed the unnecessary intermediary data
- }
- }
-
- actual_revision = loader.resolve_revision_from_artifacts(
- known_package_artifacts, p_info
- )
-
- assert actual_revision is None
-
-
-def test_debian_resolve_revision_from_artifacts():
- """Solving revision with consistent data will solve the revision
-
- """
- loader = DebianLoader(None, None, None, None)
- artifact_metadata = PACKAGE_FILES
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- expected_revision_id = (
- b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07" # noqa
- )
-
- files = artifact_metadata["files"]
- # shuffling dict's keys
- keys = list(files.keys())
- random.shuffle(keys)
- package_files = {
- "name": PACKAGE_FILES["name"],
- "version": PACKAGE_FILES["version"],
- "files": {k: files[k] for k in keys},
- }
-
- known_package_artifacts = {
- expected_revision_id: {
- "extrinsic": {"raw": package_files,},
- # ... removed the unnecessary intermediary data
- }
- }
-
- actual_revision = loader.resolve_revision_from_artifacts(
- known_package_artifacts, p_info
- )
-
- assert actual_revision == expected_revision_id
-
-
-def test_debian_resolve_revision_from_artifacts_corrupt_known_artifact():
- """To many or not enough .dsc files in the known_artifacts dict"""
- loader = DebianLoader(None, None, None, None)
- artifact_metadata = PACKAGE_FILES
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- expected_revision_id = (
- b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07"
- )
-
- files = dict(artifact_metadata["files"])
- package_files = {
- "name": PACKAGE_FILES["name"],
- "version": PACKAGE_FILES["version"],
- "files": files,
- }
-
- known_package_artifacts = {
- expected_revision_id: {
- "extrinsic": {"raw": package_files,},
- # ... removed the unnecessary intermediary data
- }
- }
-
- # Too many .dsc
- files["another.dsc"] = files["cicero_0.7.2-3.dsc"]
- assert (
- loader.resolve_revision_from_artifacts(known_package_artifacts, p_info) is None
- )
-
- # Not enough .dsc
- del files["another.dsc"]
- del files["cicero_0.7.2-3.dsc"]
- assert (
- loader.resolve_revision_from_artifacts(known_package_artifacts, p_info) is None
- )
-
-
-def test_debian_resolve_revision_from_artifacts_corrupt_new_artifact():
- loader = DebianLoader(None, None, None, None)
- artifact_metadata = PACKAGE_FILES
-
- files = PACKAGE_FILES["files"]
- files = {**files, "another.dsc": files["cicero_0.7.2-3.dsc"]}
- artifact_metadata = {**PACKAGE_FILES, "files": files}
-
- # Too many .dsc
- files["another.dsc"] = files["cicero_0.7.2-3.dsc"]
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- assert loader.resolve_revision_from_artifacts(PACKAGE_FILES, p_info) is None
-
- # Not enough .dsc
- del files["another.dsc"]
- del files["cicero_0.7.2-3.dsc"]
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- assert loader.resolve_revision_from_artifacts(PACKAGE_FILES, p_info) is None
diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -209,13 +209,6 @@
parents=p_info.revision_parents,
directory=directory,
synthetic=True,
- metadata={
- "extrinsic": {
- "provider": self.client.metadata_url(self.deposit_id),
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py
--- a/swh/loader/package/deposit/tests/test_deposit.py
+++ b/swh/loader/package/deposit/tests/test_deposit.py
@@ -12,7 +12,6 @@
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.loader.package.deposit.loader import ApiClient, DepositLoader
from swh.loader.package.loader import now
-from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model.identifiers import (
@@ -158,42 +157,6 @@
assert body == expected_body
-def test_deposit_revision_metadata_structure(
- swh_storage, deposit_client, requests_mock_datadir
-):
- url = "https://hal-test.archives-ouvertes.fr/some-external-id"
- deposit_id = 666
- loader = DepositLoader(
- swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip"
- )
-
- actual_load_status = loader.load()
- assert actual_load_status["status"] == "eventful"
- assert actual_load_status["snapshot_id"] is not None
- expected_revision_id = hash_to_bytes("637318680351f5d78856d13264faebbd91efe9bb")
- revision = loader.storage.revision_get([expected_revision_id])[0]
- assert revision is not None
-
- check_metadata_paths(
- revision.metadata,
- paths=[
- ("extrinsic.provider", str),
- ("extrinsic.when", str),
- ("extrinsic.raw", dict),
- ("original_artifact", list),
- ],
- )
-
- # Only 2 top-level keys now
- assert set(revision.metadata.keys()) == {"extrinsic", "original_artifact"}
-
- for original_artifact in revision.metadata["original_artifact"]:
- check_metadata_paths(
- original_artifact,
- paths=[("filename", str), ("length", int), ("checksums", dict),],
- )
-
-
def test_deposit_loading_ok(swh_storage, deposit_client, requests_mock_datadir):
url = "https://hal-test.archives-ouvertes.fr/some-external-id"
deposit_id = 666
@@ -336,8 +299,7 @@
assert revision
assert revision.date.to_dict() == raw_meta["deposit"]["author_date"]
assert revision.committer_date.to_dict() == raw_meta["deposit"]["committer_date"]
-
- read_api = f"{DEPOSIT_URL}/{deposit_id}/meta/"
+ assert not revision.metadata
provider = {
"provider_name": "hal",
@@ -350,31 +312,6 @@
"version": "0.0.1",
"configuration": {"sword_version": "2"},
}
- assert revision.metadata == {
- "extrinsic": {
- "provider": read_api,
- "raw": {
- "origin": {"type": "deposit", "url": url,},
- "origin_metadata": {
- "metadata": raw_meta["metadata_dict"],
- "provider": provider,
- "tool": tool,
- },
- },
- "when": revision.metadata["extrinsic"]["when"], # dynamic
- },
- "original_artifact": [
- {
- "checksums": {
- "sha1": "f8c63d7c890a7453498e6cf9fef215d85ec6801d",
- "sha256": "474bf646aeeff6d945eb752b1a9f8a40f3d81a88909ee7bd2d08cc822aa361e6", # noqa
- },
- "filename": "archive.zip",
- "length": 956830,
- "url": "https://deposit.softwareheritage.org/1/private/777/raw/",
- }
- ],
- }
fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",)
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -252,45 +252,6 @@
def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[PartialExtID]:
return p_info.extid()
- def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]:
- """Returns a unique intrinsic identifier of a downloaded artifact,
- used to check if a new artifact is the same."""
- return None
-
- def resolve_revision_from_artifacts(
- self, known_artifacts: Dict[Sha1Git, Any], p_info: TPackageInfo,
- ) -> Optional[Sha1Git]:
- """Resolve the revision from known artifact metadata and a package info object.
-
- If the artifact has already been downloaded, this will return the
- existing revision targeting that uncompressed artifact directory.
- Otherwise, this returns None.
-
- Args:
- known_artifacts: dict from revision ids to revision metadata
- p_info: Package information
-
- Returns:
- None or revision identifier
-
- """
- if not known_artifacts:
- # No known artifact, no need to compute the artifact's extid
- return None
-
- new_extid = self.new_packageinfo_to_extid(p_info)
- if new_extid is None:
- # This loader does not support deduplication, at least not for this
- # artifact.
- return None
-
- for rev_id, known_artifact in known_artifacts.items():
- known_extid = self.known_artifact_to_extid(known_artifact)
- if new_extid == known_extid:
- return rev_id
-
- return None
-
def _get_known_extids(
self, packages_info: List[TPackageInfo]
) -> Dict[PartialExtID, List[CoreSWHID]]:
@@ -609,14 +570,6 @@
known_extids, p_info, last_snapshot_targets
)
- if revision_id is None:
- # No existing revision found from an acceptable ExtID,
- # search in the artifact data instead.
- # TODO: remove this after we finished migrating to ExtIDs.
- revision_id = self.resolve_revision_from_artifacts(
- known_artifacts, p_info
- )
-
if revision_id is None:
# No matching revision found in the last snapshot, load it.
try:
@@ -771,18 +724,6 @@
return None
metadata = [metadata for (filepath, metadata) in dl_artifacts]
- extra_metadata: Tuple[str, Any] = (
- "original_artifact",
- metadata,
- )
-
- if revision.metadata is not None:
- full_metadata = list(revision.metadata.items()) + [extra_metadata]
- else:
- full_metadata = [extra_metadata]
-
- # TODO: don't add these extrinsic metadata to the revision.
- revision = attr.evolve(revision, metadata=ImmutableDict(full_metadata))
original_artifact_metadata = RawExtrinsicMetadata(
target=ExtendedSWHID(
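For orientation, a minimal sketch of the ExtID-based path that remains after these hunks. The names `new_packageinfo_to_extid`, `_get_known_extids` and `resolve_revision_from_extids` all appear in the surrounding context lines, but the body below is an illustrative assumption, not the actual implementation (in particular, the real `_get_known_extids` maps ExtIDs to `CoreSWHID`s, and the real resolver receives the package info rather than a precomputed ExtID):

# Illustrative sketch only: ExtID-based deduplication replacing the
# removed resolve_revision_from_artifacts(). Revision ids are plain
# bytes here for simplicity.
from typing import Dict, List, Optional, Set, Tuple

PartialExtID = Tuple[str, bytes]  # e.g. ("npm-archive-sha1", <20 bytes>)

def resolve_revision_from_extids(
    known_extids: Dict[PartialExtID, List[bytes]],
    new_extid: Optional[PartialExtID],
    last_snapshot_targets: Set[bytes],
) -> Optional[bytes]:
    """Return a previously loaded revision carrying the same ExtID as
    the new artifact, trusting only revisions referenced by the last
    snapshot; None means the artifact has to be (re)loaded."""
    if new_extid is None:
        # This loader, or this particular artifact, does not support
        # deduplication.
        return None
    for rev_id in known_extids.get(new_extid, []):
        if rev_id in last_snapshot_targets:
            return rev_id
    return None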
diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -155,21 +155,6 @@
ret[revision.id] = revision.metadata
return ret
- @staticmethod
- def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]:
- try:
- value = known_artifact["extrinsic"]["raw"]["integrity"].encode("ascii")
- except KeyError as e:
- logger.exception(
- "Unexpected metadata revision structure detected: %(context)s",
- {"context": {"reason": str(e), "known_artifact": known_artifact,}},
- )
- # metadata field for the revision is not as expected by the loader
- # nixguix. We consider this not the right revision and continue checking
- # the other revisions
- return None
- return (EXTID_TYPE, value)
-
def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
"""We add a branch to the snapshot called 'evaluation' pointing to the
revision used to generate the sources.json file. This revision
@@ -210,13 +195,6 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "extrinsic": {
- "provider": self.provider_url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -6,17 +6,14 @@
import json
import logging
import os
-from typing import Dict, List, Optional, Tuple
-from unittest.mock import patch
+from typing import Dict, Optional, Tuple
-import attr
import pytest
from swh.loader.package import __version__
from swh.loader.package.archive.loader import ArchiveLoader
from swh.loader.package.nixguix.loader import (
NixGuixLoader,
- NixGuixPackageInfo,
clean_sources,
make_pattern_unsupported_file_extension,
parse_sources,
@@ -93,10 +90,7 @@
for rev in revisions:
assert rev is not None
metadata = rev.metadata
- assert metadata is not None
- raw = metadata["extrinsic"]["raw"]
- assert "url" in raw
- assert "integrity" in raw
+ assert not metadata
def test_retrieve_sources(swh_storage, requests_mock_datadir):
@@ -469,24 +463,6 @@
} == stats
-def test_resolve_revision_from_artifacts(swh_storage, requests_mock_datadir, datadir):
- loader = NixGuixLoader(swh_storage, sources_url)
-
- known_artifacts = {
- "id1": {"extrinsic": {"raw": {"url": "url1", "integrity": "integrity1"}}},
- "id2": {"extrinsic": {"raw": {"url": "url2", "integrity": "integrity2"}}},
- }
-
- p_info = NixGuixPackageInfo.from_metadata(
- {"url": "url1", "integrity": "integrity1"}
- )
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) == "id1"
- p_info = NixGuixPackageInfo.from_metadata(
- {"url": "url3", "integrity": "integrity3"}
- )
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None
-
-
def test_evaluation_branch(swh_storage, requests_mock_datadir):
loader = NixGuixLoader(swh_storage, sources_url)
res = loader.load()
@@ -601,12 +577,6 @@
archive_loader.storage, gnu_url, status="full", type="tar"
)
- gnu_snapshot: Snapshot = snapshot_get_all_branches(
- archive_loader.storage, hash_to_bytes(expected_snapshot_id)
- )
-
- first_revision = gnu_snapshot.branches[f"releases/{release}".encode("utf-8")]
-
# 2. Then ingest with the nixguix loader which lists the same artifact within its
# sources.json
@@ -634,73 +604,3 @@
snapshot_id = actual_load_status2["snapshot_id"]
snapshot = snapshot_get_all_branches(swh_storage, hash_to_bytes(snapshot_id))
assert snapshot
-
- # 3. Then ingest again with the nixguix loader, with a different snapshot
- # and different source
-
- # simulate a snapshot already seen with a revision with the wrong metadata structure
- # This revision should be skipped, thus making the artifact being ingested again.
- with patch(
- "swh.loader.package.loader.PackageLoader.last_snapshot"
- ) as last_snapshot:
- # mutate the snapshot to target a revision with the wrong metadata structure
- # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision
- old_revision = swh_storage.revision_get([first_revision.target])[0]
- # assert that revision is not in the right format
- assert old_revision.metadata["extrinsic"]["raw"].get("integrity", {}) == {}
-
- # mutate snapshot to create a clash
- snapshot = attr.evolve(
- snapshot,
- branches={
- **snapshot.branches,
- artifact_url.encode("utf-8"): SnapshotBranch(
- target_type=TargetType.REVISION,
- target=hash_to_bytes(old_revision.id),
- ),
- },
- )
-
- # modify snapshot to actually change revision metadata structure so we simulate
- # a revision written by somebody else (structure different)
- last_snapshot.return_value = snapshot
-
- loader = NixGuixLoader(swh_storage, sources_url)
- actual_load_status3 = loader.load()
- assert last_snapshot.called
- assert actual_load_status3["status"] == "eventful"
-
- assert_last_visit_matches(
- swh_storage, sources_url, status="full", type="nixguix"
- )
-
- new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965"
- assert actual_load_status3["snapshot_id"] == new_snapshot_id
-
- last_snapshot = snapshot_get_all_branches(
- swh_storage, hash_to_bytes(new_snapshot_id)
- )
- new_revision_branch = last_snapshot.branches[artifact_url.encode("utf-8")]
- assert new_revision_branch.target_type == TargetType.REVISION
-
- new_revision = swh_storage.revision_get([new_revision_branch.target])[0]
-
- # the new revision has the correct structure, so it got ingested alright by the
- # new run
- assert new_revision.metadata["extrinsic"]["raw"]["integrity"] is not None
-
- actual_detections: List[Dict] = []
- for record in caplog.records:
- logtext = record.getMessage()
- if "Unexpected metadata revision structure detected:" in logtext:
- actual_detections.append(record.args["context"])
-
- expected_detections = [
- {"reason": "'integrity'", "known_artifact": old_revision.metadata,},
- ]
-
- # less calls than there are sources listed in the sources.json;
- # as some of them are skipped using the ExtID from a previous run
- assert len(expected_detections) <= len(all_sources["sources"])
-
- assert actual_detections == expected_detections
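Note: the large removed nixguix test existed only to exercise the "Unexpected metadata revision structure detected" fallback of `known_artifact_to_extid`; with that hook gone, stored revision metadata is no longer inspected, so a malformed old revision can no longer be hit on this path.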
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -141,16 +141,6 @@
)
yield release_name(version), p_info
- @staticmethod
- def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]:
- extid_str = _artifact_to_sha1(known_artifact)
- if extid_str is None:
- return None
- try:
- return (EXTID_TYPE, hash_to_bytes(extid_str))
- except ValueError:
- return None
-
def build_revision(
self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
@@ -183,56 +173,10 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {"tool": "package.json", "raw": i_metadata,},
- "extrinsic": {
- "provider": self.provider_url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
return r
-def _artifact_to_sha1(known_artifact: Dict) -> Optional[str]:
- """Returns the sha1 from an NPM 'original_artifact' dict
-
- The following code allows to deal with 2 metadata formats:
-
- - old format sample::
-
- {
- 'package_source': {
- 'sha1': '05181c12cd8c22035dd31155656826b85745da37',
- }
- }
-
- - new format sample::
-
- {
- 'original_artifact': [{
- 'checksums': {
- 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa
- ...
- },
- }],
- ...
- }
-
- """
- known_original_artifact = known_artifact.get("original_artifact")
- if not known_original_artifact:
- # previous loader-npm version kept original artifact elsewhere
- known_original_artifact = known_artifact.get("package_source")
- if not known_original_artifact:
- return None
- return known_original_artifact["sha1"]
- else:
- assert isinstance(known_original_artifact, list)
- return known_original_artifact[0]["checksums"]["sha1"]
-
-
def _author_str(author_data: Union[Dict, List, str]) -> str:
"""Parse author from package.json author fields
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -14,7 +14,6 @@
_author_str,
extract_npm_package_author,
)
-from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes
from swh.model.identifiers import (
@@ -308,37 +307,6 @@
return "https://replicate.npmjs.com/%s/" % package
-def test_npm_revision_metadata_structure(swh_storage, requests_mock_datadir):
- package = "org"
- loader = NpmLoader(swh_storage, package_url(package))
-
- actual_load_status = loader.load()
- assert actual_load_status["status"] == "eventful"
- assert actual_load_status["snapshot_id"] is not None
-
- expected_revision_id = hash_to_bytes("d8a1c7474d2956ac598a19f0f27d52f7015f117e")
- revision = swh_storage.revision_get([expected_revision_id])[0]
- assert revision is not None
-
- check_metadata_paths(
- revision.metadata,
- paths=[
- ("intrinsic.tool", str),
- ("intrinsic.raw", dict),
- ("extrinsic.provider", str),
- ("extrinsic.when", str),
- ("extrinsic.raw", dict),
- ("original_artifact", list),
- ],
- )
-
- for original_artifact in revision.metadata["original_artifact"]:
- check_metadata_paths(
- original_artifact,
- paths=[("filename", str), ("length", int), ("checksums", dict),],
- )
-
-
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
package = "org"
url = package_url(package)
@@ -544,41 +512,6 @@
check_snapshot(expected_snapshot, swh_storage)
-def test_npm__known_artifact_to_extid__old_loader_version():
- """Current loader version should parse old metadata scheme
-
- """
- assert (
- NpmLoader.known_artifact_to_extid(
- {"package_source": {"sha1": "something-wrong"}}
- )
- is None
- )
-
- sha1 = "05181c12cd8c22035dd31155656826b85745da37"
- assert NpmLoader.known_artifact_to_extid({"package_source": {"sha1": sha1,}}) == (
- "npm-archive-sha1",
- hash_to_bytes(sha1),
- )
-
-
-def test_npm__known_artifact_to_extid__current_loader_version():
- """Current loader version should be able to parse current metadata scheme
-
- """
- sha1 = "05181c12cd8c22035dd31155656826b85745da37"
- assert NpmLoader.known_artifact_to_extid(
- {"original_artifact": [{"checksums": {"sha1": sha1},}],}
- ) == ("npm-archive-sha1", hash_to_bytes(sha1))
-
- assert (
- NpmLoader.known_artifact_to_extid(
- {"original_artifact": [{"checksums": {"sha1": "something-wrong"},}],},
- )
- is None
- )
-
-
def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir):
"""Skip artifact with no intrinsic metadata during ingestion
diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py
--- a/swh/loader/package/pypi/loader.py
+++ b/swh/loader/package/pypi/loader.py
@@ -121,16 +121,6 @@
for version, p_info in res:
yield release_name(version, p_info.filename), p_info
- @staticmethod
- def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]:
- extid_str = _artifact_to_sha256(known_artifact)
- if extid_str is None:
- return None
- try:
- return (EXTID_TYPE, hash_to_bytes(extid_str)) if extid_str else None
- except ValueError:
- return None
-
def build_revision(
self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
@@ -157,61 +147,6 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {"tool": "PKG-INFO", "raw": i_metadata,},
- "extrinsic": {
- "provider": self.provider_url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
- )
-
-
-def _artifact_to_sha256(known_artifact: Dict) -> Optional[str]:
- """Returns the sha256 from a PyPI 'original_artifact' dict
-
- The following code allows to deal with 2 metadata formats (column metadata
- in 'revision')
-
- - old format sample::
-
- {
- 'original_artifact': {
- 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa
- ...
- },
- ...
- }
-
- - new format sample::
-
- {
- 'original_artifact': [{
- 'checksums': {
- 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa
- ...
- },
- }],
- ...
- }
-
- """
- original_artifact = known_artifact["original_artifact"]
- if isinstance(original_artifact, dict):
- # previous loader-pypi version stored metadata as dict
- return original_artifact["sha256"]
- # new pypi loader actually store metadata dict differently...
- assert isinstance(original_artifact, list)
- # current loader-pypi stores metadata as list of dict
- if len(known_artifact["original_artifact"]) == 0:
- return None
- elif len(known_artifact["original_artifact"]) == 1:
- return original_artifact[0]["checksums"]["sha256"]
- else:
- raise ValueError(
- f"Expected exactly one PyPI original_artifact, got "
- f"{len(known_artifact['original_artifact'])}"
)
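Both removed helpers, `_artifact_to_sha1` (NPM) and `_artifact_to_sha256` (PyPI), existed only to recover a checksum from the two historical `Revision.metadata` layouts documented in their docstrings. After this diff the deduplication key is computed from the incoming package info instead. A hedged sketch of that forward direction follows; the EXTID_TYPE values are the ones visible in the removed tests, while the helper itself is hypothetical:

# Hypothetical helper: derive the ExtID from the upstream-provided hex
# digest (dist.shasum for npm, the release sha256 for PyPI) instead of
# digging it out of stored revision metadata.
from typing import Optional, Tuple

from swh.model.hashutil import hash_to_bytes

NPM_EXTID_TYPE = "npm-archive-sha1"      # seen in the removed npm tests
PYPI_EXTID_TYPE = "pypi-archive-sha256"  # seen in the removed pypi tests

def extid_from_digest(
    extid_type: str, digest: Optional[str]
) -> Optional[Tuple[str, bytes]]:
    """Build a (type, value) ExtID pair from a hex digest, returning
    None when the digest is missing or malformed (hash_to_bytes raises
    ValueError on non-hex input such as "something-wrong")."""
    if not digest:
        return None
    try:
        return (extid_type, hash_to_bytes(digest))
    except ValueError:
        return None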
diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py
--- a/swh/loader/package/pypi/tests/test_pypi.py
+++ b/swh/loader/package/pypi/tests/test_pypi.py
@@ -19,7 +19,6 @@
extract_intrinsic_metadata,
pypi_api_url,
)
-from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes
from swh.model.identifiers import (
@@ -334,24 +333,6 @@
revision = swh_storage.revision_get([expected_revision_id])[0]
assert revision is not None
- check_metadata_paths(
- revision.metadata,
- paths=[
- ("intrinsic.tool", str),
- ("intrinsic.raw", dict),
- ("extrinsic.provider", str),
- ("extrinsic.when", str),
- ("extrinsic.raw", dict),
- ("original_artifact", list),
- ],
- )
-
- for original_artifact in revision.metadata["original_artifact"]:
- check_metadata_paths(
- original_artifact,
- paths=[("filename", str), ("length", int), ("checksums", dict),],
- )
-
revision_swhid = CoreSWHID(
object_type=ObjectType.REVISION, object_id=expected_revision_id
)
@@ -796,52 +777,6 @@
)
-def test_pypi__known_artifact_to_extid__old_loader_version():
- """Current loader version should solve old metadata scheme
-
- """
- assert (
- PyPILoader.known_artifact_to_extid(
- {"original_artifact": {"sha256": "something-wrong",},}
- )
- is None
- )
-
- sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec"
- assert PyPILoader.known_artifact_to_extid(
- {"original_artifact": {"sha256": sha256}}
- ) == ("pypi-archive-sha256", hash_to_bytes(sha256))
-
-
-def test_pypi__known_artifact_to_extid__current_loader_version():
- """Current loader version should be able to solve current metadata scheme
-
- """
- sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec"
-
- assert PyPILoader.known_artifact_to_extid(
- {"original_artifact": [{"checksums": {"sha256": sha256,},}],}
- ) == ("pypi-archive-sha256", hash_to_bytes(sha256))
-
- assert (
- PyPILoader.known_artifact_to_extid(
- {"original_artifact": [{"checksums": {"sha256": "something-wrong"},}],},
- )
- is None
- )
-
- # there should not be more than one artifact
- with pytest.raises(ValueError):
- PyPILoader.known_artifact_to_extid(
- {
- "original_artifact": [
- {"checksums": {"sha256": sha256,},},
- {"checksums": {"sha256": sha256,},},
- ],
- }
- )
-
-
def test_pypi_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir):
"""Skip artifact with no intrinsic metadata during ingestion
diff --git a/swh/loader/package/tests/common.py b/swh/loader/package/tests/common.py
--- a/swh/loader/package/tests/common.py
+++ b/swh/loader/package/tests/common.py
@@ -5,50 +5,8 @@
import logging
from os import path
-from typing import Dict, List, Tuple
logger = logging.getLogger(__file__)
DATADIR = path.join(path.abspath(path.dirname(__file__)), "resources")
-
-
-def check_metadata(metadata: Dict, key_path: str, raw_type: str):
- """Given a metadata dict, ensure the associated key_path value is of type
- raw_type.
-
- Args:
- metadata: Dict to check
- key_path: Path to check
- raw_type: Type to check the path with
-
- Raises:
- Assertion error in case of mismatch
-
- """
- data = metadata
- keys = key_path.split(".")
- for k in keys:
- try:
- data = data[k]
- except (TypeError, KeyError) as e:
- # KeyError: because path too long
- # TypeError: data is not a dict
- raise AssertionError(e)
- assert isinstance(data, raw_type) # type: ignore
-
-
-def check_metadata_paths(metadata: Dict, paths: List[Tuple[str, str]]):
- """Given a metadata dict, ensure the keys are of expected types
-
- Args:
- metadata: Dict to check
- key_path: Path to check
- raw_type: Type to check the path with
-
- Raises:
- Assertion error in case of mismatch
-
- """
- for key_path, raw_type in paths:
- check_metadata(metadata, key_path, raw_type)
diff --git a/swh/loader/package/tests/test_common.py b/swh/loader/package/tests/test_common.py
deleted file mode 100644
--- a/swh/loader/package/tests/test_common.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import pytest
-
-from swh.loader.package.tests.common import check_metadata, check_metadata_paths
-
-
-def test_check_metadata():
- metadata = {
- "a": {"raw": {"time": "something",},},
- "b": [],
- "c": 1,
- }
-
- for raw_path, raw_type in [
- ("a.raw", dict),
- ("a.raw.time", str),
- ("b", list),
- ("c", int),
- ]:
- check_metadata(metadata, raw_path, raw_type)
-
-
-def test_check_metadata_ko():
- metadata = {
- "a": {"raw": "hello",},
- "b": [],
- "c": 1,
- }
-
- for raw_path, raw_type in [
- ("a.b", dict),
- ("a.raw.time", str),
- ]:
- with pytest.raises(AssertionError):
- check_metadata(metadata, raw_path, raw_type)
-
-
-def test_check_metadata_paths():
- metadata = {
- "a": {"raw": {"time": "something",},},
- "b": [],
- "c": 1,
- }
-
- check_metadata_paths(
- metadata, [("a.raw", dict), ("a.raw.time", str), ("b", list), ("c", int),]
- )
-
-
-def test_check_metadata_paths_ko():
- metadata = {
- "a": {"raw": "hello",},
- "b": [],
- "c": 1,
- }
-
- with pytest.raises(AssertionError):
- check_metadata_paths(metadata, [("a.b", dict), ("a.raw.time", str),])
diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py
--- a/swh/loader/package/tests/test_loader.py
+++ b/swh/loader/package/tests/test_loader.py
@@ -83,51 +83,6 @@
assert actual_load_status2 == {"status": "failed"}
-def test_resolve_revision_from_artifacts() -> None:
- loader = PackageLoader(None, None) # type: ignore
- loader.known_artifact_to_extid = Mock( # type: ignore
- wraps=lambda known_artifact: ("extid-type", known_artifact["key"].encode())
- )
-
- known_artifacts = {
- b"a" * 20: {"key": "extid-of-aaaa"},
- b"b" * 20: {"key": "extid-of-bbbb"},
- }
-
- p_info = Mock(wraps=BasePackageInfo(None, None)) # type: ignore
-
- # No known artifact -> it would be useless to compute the extid
- assert loader.resolve_revision_from_artifacts({}, p_info) is None
- p_info.extid.assert_not_called()
- loader.known_artifact_to_extid.assert_not_called()
-
- p_info.extid.reset_mock()
-
- # Some artifacts, but the PackageInfo does not support extids
- p_info.extid.return_value = None
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None
- p_info.extid.assert_called_once()
- loader.known_artifact_to_extid.assert_not_called()
-
- p_info.extid.reset_mock()
-
- # Some artifacts, and the PackageInfo is not one of them (ie. cache miss)
- p_info.extid.return_value = ("extid-type", b"extid-of-cccc")
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None
- p_info.extid.assert_called_once()
- loader.known_artifact_to_extid.assert_any_call({"key": "extid-of-aaaa"})
- loader.known_artifact_to_extid.assert_any_call({"key": "extid-of-bbbb"})
-
- p_info.extid.reset_mock()
- loader.known_artifact_to_extid.reset_mock()
-
- # Some artifacts, and the PackageInfo is one of them (ie. cache hit)
- p_info.extid.return_value = ("extid-type", b"extid-of-aaaa")
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) == b"a" * 20
- p_info.extid.assert_called_once()
- loader.known_artifact_to_extid.assert_called_once_with({"key": "extid-of-aaaa"})
-
-
def test_resolve_revision_from_extids() -> None:
loader = PackageLoader(None, None) # type: ignore
