D5423.diff
D5423: package loaders: Stop reading/writing Revision.metadata
diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -144,12 +144,6 @@
) -> Optional[PartialExtID]:
return p_info.extid(manifest_format=self.extid_manifest_format)
- def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]:
- known_artifact_info = ArchivePackageInfo.from_metadata(
- known_artifact["extrinsic"]["raw"]
- )
- return known_artifact_info.extid(manifest_format=self.extid_manifest_format)
-
def build_revision(
self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
@@ -169,12 +163,4 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {},
- "extrinsic": {
- "provider": self.url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
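Note: with known_artifact_to_extid removed, the archive loader derives its ExtID only from the incoming ArchivePackageInfo, via p_info.extid(manifest_format=self.extid_manifest_format); deduplication is handled by the shared ExtID machinery in the base loader. A hypothetical, self-contained sketch of the manifest-hash idea (the field names and the extid label are illustrative, not the real swh implementation):

import hashlib
from string import Template
from typing import Tuple

def archive_extid(
    manifest_format: Template, url: str, time: str, length: int
) -> Tuple[str, bytes]:
    # Render a canonical manifest of the artifact's descriptive fields,
    # then hash it; equal manifests yield equal ExtIDs.
    manifest = manifest_format.substitute(url=url, time=time, length=length)
    return ("package-manifest-sha256", hashlib.sha256(manifest.encode()).digest())

extid = archive_extid(
    Template("$url $time $length"),
    "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz",
    "2017-01-15T00:00:00+00:00",
    196067,
)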
diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py
--- a/swh/loader/package/archive/tests/test_archive.py
+++ b/swh/loader/package/archive/tests/test_archive.py
@@ -10,7 +10,6 @@
import pytest
from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo
-from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Snapshot, SnapshotBranch, TargetType
@@ -110,37 +109,6 @@
assert_last_visit_matches(swh_storage, url, status="partial", type="tar")
-def test_archive_check_revision_metadata_structure(swh_storage, requests_mock_datadir):
- loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS)
-
- actual_load_status = loader.load()
- assert actual_load_status["status"] == "eventful"
- assert actual_load_status["snapshot_id"] is not None
-
- assert_last_visit_matches(swh_storage, URL, status="full", type="tar")
-
- expected_revision_id = hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3")
- revision = swh_storage.revision_get([expected_revision_id])[0]
- assert revision is not None
-
- check_metadata_paths(
- revision.metadata,
- paths=[
- ("intrinsic", dict),
- ("extrinsic.provider", str),
- ("extrinsic.when", str),
- ("extrinsic.raw", dict),
- ("original_artifact", list),
- ],
- )
-
- for original_artifact in revision.metadata["original_artifact"]:
- check_metadata_paths(
- original_artifact,
- paths=[("filename", str), ("length", int), ("checksums", dict),],
- )
-
-
def test_archive_visit_with_release_artifact_no_prior_visit(
swh_storage, requests_mock_datadir
):
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -16,7 +16,7 @@
import dateutil.parser
from debian.deb822 import Deb822
-from swh.loader.package.loader import BasePackageInfo, PackageLoader, PartialExtID
+from swh.loader.package.loader import BasePackageInfo, PackageLoader
from swh.loader.package.utils import release_name
from swh.model.model import (
Person,
@@ -88,10 +88,6 @@
if version == p_info.version:
yield release_name(version), p_info
- @staticmethod
- def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]:
- return CRANPackageInfo.from_metadata(known_artifact["extrinsic"]["raw"]).extid()
-
def build_revision(
self, p_info: CRANPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
@@ -110,14 +106,6 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {"tool": "DESCRIPTION", "raw": metadata,},
- "extrinsic": {
- "provider": self.url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py
--- a/swh/loader/package/debian/loader.py
+++ b/swh/loader/package/debian/loader.py
@@ -178,21 +178,6 @@
p_info = DebianPackageInfo.from_metadata(meta, url=self.url)
yield release_name(version), p_info
- def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]:
- sha256 = _artifact_to_dsc_sha256(known_artifact, url=self.url)
- if sha256 is None:
- return None
- return (EXTID_TYPE, hash_to_bytes(sha256))
-
- def resolve_revision_from_artifacts(
- self, known_artifacts: Dict, p_info: DebianPackageInfo,
- ) -> Optional[bytes]:
- try:
- return super().resolve_revision_from_artifacts(known_artifacts, p_info)
- except DscCountError:
- # known_artifacts are corrupted, ignore them instead of crashing
- return None
-
def download_package(
self, p_info: DebianPackageInfo, tmpdir: str
) -> List[Tuple[str, Mapping]]:
@@ -252,14 +237,6 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {"tool": "dsc", "raw": attr.asdict(intrinsic_metadata),},
- "extrinsic": {
- "provider": dsc_url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
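Note: the Debian override deleted above existed only to downgrade DscCountError to a cache miss, so corrupt known_artifacts entries were re-ingested instead of crashing the visit. A self-contained sketch of that defensive pattern (DscCountError here is a stand-in class, not imported from swh):

from typing import Any, Callable, Dict, Optional

class DscCountError(ValueError):
    """Stand-in for the real exception raised on a bad .dsc file count."""

def resolve_defensively(
    resolve: Callable[[Dict, Any], Optional[bytes]],
    known_artifacts: Dict,
    p_info: Any,
) -> Optional[bytes]:
    try:
        return resolve(known_artifacts, p_info)
    except DscCountError:
        # Corrupt known artifacts: treat them as "no match" and re-ingest
        # the package rather than aborting the whole loading run.
        return None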
diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py
--- a/swh/loader/package/debian/tests/test_debian.py
+++ b/swh/loader/package/debian/tests/test_debian.py
@@ -5,7 +5,6 @@
import logging
from os import path
-import random
import pytest
@@ -411,152 +410,3 @@
)
check_snapshot(expected_snapshot, swh_storage)
-
-
-def test_debian_resolve_revision_from_artifacts_edge_cases():
- """Solving revision with empty data will result in unknown revision
-
- """
- loader = DebianLoader(None, None, None, None)
- empty_artifact = {
- "name": PACKAGE_FILES["name"],
- "version": PACKAGE_FILES["version"],
- }
- for package_artifacts in [empty_artifact, PACKAGE_FILES]:
- p_info = DebianPackageInfo.from_metadata(package_artifacts, url=URL)
- actual_revision = loader.resolve_revision_from_artifacts({}, p_info)
- assert actual_revision is None
-
- for known_artifacts in [{}, PACKAGE_FILES]:
- actual_revision = loader.resolve_revision_from_artifacts(
- known_artifacts, DebianPackageInfo.from_metadata(empty_artifact, url=URL)
- )
- assert actual_revision is None
-
- known_package_artifacts = {
- b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07": {
- "extrinsic": {
- # empty
- },
- # ... removed the unnecessary intermediary data
- }
- }
- assert not loader.resolve_revision_from_artifacts(
- known_package_artifacts, DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
- )
-
-
-def test_debian_resolve_revision_from_artifacts_edge_cases_hit_and_miss():
- """Solving revision with inconsistent data will result in unknown revision
-
- """
- loader = DebianLoader(None, None, None, None)
- artifact_metadata = PACKAGE_FILES2
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- expected_revision_id = (
- b"(\x08\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xff\x85\x85O\xfe\xcf\x07" # noqa
- )
- known_package_artifacts = {
- expected_revision_id: {
- "extrinsic": {"raw": PACKAGE_FILES,},
- # ... removed the unnecessary intermediary data
- }
- }
-
- actual_revision = loader.resolve_revision_from_artifacts(
- known_package_artifacts, p_info
- )
-
- assert actual_revision is None
-
-
-def test_debian_resolve_revision_from_artifacts():
- """Solving revision with consistent data will solve the revision
-
- """
- loader = DebianLoader(None, None, None, None)
- artifact_metadata = PACKAGE_FILES
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- expected_revision_id = (
- b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07" # noqa
- )
-
- files = artifact_metadata["files"]
- # shuffling dict's keys
- keys = list(files.keys())
- random.shuffle(keys)
- package_files = {
- "name": PACKAGE_FILES["name"],
- "version": PACKAGE_FILES["version"],
- "files": {k: files[k] for k in keys},
- }
-
- known_package_artifacts = {
- expected_revision_id: {
- "extrinsic": {"raw": package_files,},
- # ... removed the unnecessary intermediary data
- }
- }
-
- actual_revision = loader.resolve_revision_from_artifacts(
- known_package_artifacts, p_info
- )
-
- assert actual_revision == expected_revision_id
-
-
-def test_debian_resolve_revision_from_artifacts_corrupt_known_artifact():
- """To many or not enough .dsc files in the known_artifacts dict"""
- loader = DebianLoader(None, None, None, None)
- artifact_metadata = PACKAGE_FILES
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- expected_revision_id = (
- b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07"
- )
-
- files = dict(artifact_metadata["files"])
- package_files = {
- "name": PACKAGE_FILES["name"],
- "version": PACKAGE_FILES["version"],
- "files": files,
- }
-
- known_package_artifacts = {
- expected_revision_id: {
- "extrinsic": {"raw": package_files,},
- # ... removed the unnecessary intermediary data
- }
- }
-
- # Too many .dsc
- files["another.dsc"] = files["cicero_0.7.2-3.dsc"]
- assert (
- loader.resolve_revision_from_artifacts(known_package_artifacts, p_info) is None
- )
-
- # Not enough .dsc
- del files["another.dsc"]
- del files["cicero_0.7.2-3.dsc"]
- assert (
- loader.resolve_revision_from_artifacts(known_package_artifacts, p_info) is None
- )
-
-
-def test_debian_resolve_revision_from_artifacts_corrupt_new_artifact():
- loader = DebianLoader(None, None, None, None)
- artifact_metadata = PACKAGE_FILES
-
- files = PACKAGE_FILES["files"]
- files = {**files, "another.dsc": files["cicero_0.7.2-3.dsc"]}
- artifact_metadata = {**PACKAGE_FILES, "files": files}
-
- # Too many .dsc
- files["another.dsc"] = files["cicero_0.7.2-3.dsc"]
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- assert loader.resolve_revision_from_artifacts(PACKAGE_FILES, p_info) is None
-
- # Not enough .dsc
- del files["another.dsc"]
- del files["cicero_0.7.2-3.dsc"]
- p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
- assert loader.resolve_revision_from_artifacts(PACKAGE_FILES, p_info) is None
diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -209,13 +209,6 @@
parents=p_info.revision_parents,
directory=directory,
synthetic=True,
- metadata={
- "extrinsic": {
- "provider": self.client.metadata_url(self.deposit_id),
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py
--- a/swh/loader/package/deposit/tests/test_deposit.py
+++ b/swh/loader/package/deposit/tests/test_deposit.py
@@ -12,7 +12,6 @@
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.loader.package.deposit.loader import ApiClient, DepositLoader
from swh.loader.package.loader import now
-from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model.identifiers import (
@@ -158,42 +157,6 @@
assert body == expected_body
-def test_deposit_revision_metadata_structure(
- swh_storage, deposit_client, requests_mock_datadir
-):
- url = "https://hal-test.archives-ouvertes.fr/some-external-id"
- deposit_id = 666
- loader = DepositLoader(
- swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip"
- )
-
- actual_load_status = loader.load()
- assert actual_load_status["status"] == "eventful"
- assert actual_load_status["snapshot_id"] is not None
- expected_revision_id = hash_to_bytes("637318680351f5d78856d13264faebbd91efe9bb")
- revision = loader.storage.revision_get([expected_revision_id])[0]
- assert revision is not None
-
- check_metadata_paths(
- revision.metadata,
- paths=[
- ("extrinsic.provider", str),
- ("extrinsic.when", str),
- ("extrinsic.raw", dict),
- ("original_artifact", list),
- ],
- )
-
- # Only 2 top-level keys now
- assert set(revision.metadata.keys()) == {"extrinsic", "original_artifact"}
-
- for original_artifact in revision.metadata["original_artifact"]:
- check_metadata_paths(
- original_artifact,
- paths=[("filename", str), ("length", int), ("checksums", dict),],
- )
-
-
def test_deposit_loading_ok(swh_storage, deposit_client, requests_mock_datadir):
url = "https://hal-test.archives-ouvertes.fr/some-external-id"
deposit_id = 666
@@ -336,8 +299,7 @@
assert revision
assert revision.date.to_dict() == raw_meta["deposit"]["author_date"]
assert revision.committer_date.to_dict() == raw_meta["deposit"]["committer_date"]
-
- read_api = f"{DEPOSIT_URL}/{deposit_id}/meta/"
+ assert not revision.metadata
provider = {
"provider_name": "hal",
@@ -350,31 +312,6 @@
"version": "0.0.1",
"configuration": {"sword_version": "2"},
}
- assert revision.metadata == {
- "extrinsic": {
- "provider": read_api,
- "raw": {
- "origin": {"type": "deposit", "url": url,},
- "origin_metadata": {
- "metadata": raw_meta["metadata_dict"],
- "provider": provider,
- "tool": tool,
- },
- },
- "when": revision.metadata["extrinsic"]["when"], # dynamic
- },
- "original_artifact": [
- {
- "checksums": {
- "sha1": "f8c63d7c890a7453498e6cf9fef215d85ec6801d",
- "sha256": "474bf646aeeff6d945eb752b1a9f8a40f3d81a88909ee7bd2d08cc822aa361e6", # noqa
- },
- "filename": "archive.zip",
- "length": 956830,
- "url": "https://deposit.softwareheritage.org/1/private/777/raw/",
- }
- ],
- }
fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",)
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -252,45 +252,6 @@
def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[PartialExtID]:
return p_info.extid()
- def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]:
- """Returns a unique intrinsic identifier of a downloaded artifact,
- used to check if a new artifact is the same."""
- return None
-
- def resolve_revision_from_artifacts(
- self, known_artifacts: Dict[Sha1Git, Any], p_info: TPackageInfo,
- ) -> Optional[Sha1Git]:
- """Resolve the revision from known artifact metadata and a package info object.
-
- If the artifact has already been downloaded, this will return the
- existing revision targeting that uncompressed artifact directory.
- Otherwise, this returns None.
-
- Args:
- known_artifacts: dict from revision ids to revision metadata
- p_info: Package information
-
- Returns:
- None or revision identifier
-
- """
- if not known_artifacts:
- # No known artifact, no need to compute the artifact's extid
- return None
-
- new_extid = self.new_packageinfo_to_extid(p_info)
- if new_extid is None:
- # This loader does not support deduplication, at least not for this
- # artifact.
- return None
-
- for rev_id, known_artifact in known_artifacts.items():
- known_extid = self.known_artifact_to_extid(known_artifact)
- if new_extid == known_extid:
- return rev_id
-
- return None
-
def _get_known_extids(
self, packages_info: List[TPackageInfo]
) -> Dict[PartialExtID, List[CoreSWHID]]:
@@ -609,14 +570,6 @@
known_extids, p_info, last_snapshot_targets
)
- if revision_id is None:
- # No existing revision found from an acceptable ExtID,
- # search in the artifact data instead.
- # TODO: remove this after we finished migrating to ExtIDs.
- revision_id = self.resolve_revision_from_artifacts(
- known_artifacts, p_info
- )
-
if revision_id is None:
# No matching revision found in the last snapshot, load it.
try:
@@ -771,18 +724,6 @@
return None
metadata = [metadata for (filepath, metadata) in dl_artifacts]
- extra_metadata: Tuple[str, Any] = (
- "original_artifact",
- metadata,
- )
-
- if revision.metadata is not None:
- full_metadata = list(revision.metadata.items()) + [extra_metadata]
- else:
- full_metadata = [extra_metadata]
-
- # TODO: don't add these extrinsic metadata to the revision.
- revision = attr.evolve(revision, metadata=ImmutableDict(full_metadata))
original_artifact_metadata = RawExtrinsicMetadata(
target=ExtendedSWHID(
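Note: the deleted core resolve_revision_from_artifacts recomputed an ExtID from every known revision's metadata for each candidate; the surviving path batches lookups through _get_known_extids (a Dict[PartialExtID, List[CoreSWHID]]) and resolve_revision_from_extids. A rough, self-contained sketch of the lookup side (shapes assumed from the call site above, not the real swh API):

from typing import Dict, List, Optional, Set, Tuple

PartialExtID = Tuple[str, bytes]  # (extid type, extid value)

def resolve_from_extids(
    known_extids: Dict[PartialExtID, List[bytes]],
    new_extid: Optional[PartialExtID],
    last_snapshot_targets: Set[bytes],
) -> Optional[bytes]:
    if new_extid is None:
        return None  # this package cannot be deduplicated
    for target in known_extids.get(new_extid, []):
        # Only reuse revisions this origin already references, so a bogus
        # ExtID written elsewhere cannot hijack the branch.
        if target in last_snapshot_targets:
            return target
    return None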
diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -155,21 +155,6 @@
ret[revision.id] = revision.metadata
return ret
- @staticmethod
- def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]:
- try:
- value = known_artifact["extrinsic"]["raw"]["integrity"].encode("ascii")
- except KeyError as e:
- logger.exception(
- "Unexpected metadata revision structure detected: %(context)s",
- {"context": {"reason": str(e), "known_artifact": known_artifact,}},
- )
- # The revision's metadata field does not match what the nixguix loader
- # expects; consider this not the right revision and keep checking the
- # other revisions.
- return None
- return (EXTID_TYPE, value)
-
def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
"""We add a branch to the snapshot called 'evaluation' pointing to the
revision used to generate the sources.json file. This revision
@@ -210,13 +195,6 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "extrinsic": {
- "provider": self.provider_url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
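Note: the extra_branches docstring quoted above describes the one nixguix-specific snapshot addition that survives this change. A sketch of the idea (the branch dict shape is assumed from the PackageLoader API, not verified):

from typing import Any, Dict, Mapping

def extra_branches(evaluation_revision: bytes) -> Dict[bytes, Mapping[str, Any]]:
    # Alias the revision that generated sources.json under a dedicated
    # "evaluation" branch, so the snapshot records its provenance.
    return {
        b"evaluation": {
            "target_type": "revision",
            "target": evaluation_revision,
        }
    }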
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -6,17 +6,14 @@
import json
import logging
import os
-from typing import Dict, List, Optional, Tuple
-from unittest.mock import patch
+from typing import Dict, Optional, Tuple
-import attr
import pytest
from swh.loader.package import __version__
from swh.loader.package.archive.loader import ArchiveLoader
from swh.loader.package.nixguix.loader import (
NixGuixLoader,
- NixGuixPackageInfo,
clean_sources,
make_pattern_unsupported_file_extension,
parse_sources,
@@ -93,10 +90,7 @@
for rev in revisions:
assert rev is not None
metadata = rev.metadata
- assert metadata is not None
- raw = metadata["extrinsic"]["raw"]
- assert "url" in raw
- assert "integrity" in raw
+ assert not metadata
def test_retrieve_sources(swh_storage, requests_mock_datadir):
@@ -469,24 +463,6 @@
} == stats
-def test_resolve_revision_from_artifacts(swh_storage, requests_mock_datadir, datadir):
- loader = NixGuixLoader(swh_storage, sources_url)
-
- known_artifacts = {
- "id1": {"extrinsic": {"raw": {"url": "url1", "integrity": "integrity1"}}},
- "id2": {"extrinsic": {"raw": {"url": "url2", "integrity": "integrity2"}}},
- }
-
- p_info = NixGuixPackageInfo.from_metadata(
- {"url": "url1", "integrity": "integrity1"}
- )
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) == "id1"
- p_info = NixGuixPackageInfo.from_metadata(
- {"url": "url3", "integrity": "integrity3"}
- )
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None
-
-
def test_evaluation_branch(swh_storage, requests_mock_datadir):
loader = NixGuixLoader(swh_storage, sources_url)
res = loader.load()
@@ -601,12 +577,6 @@
archive_loader.storage, gnu_url, status="full", type="tar"
)
- gnu_snapshot: Snapshot = snapshot_get_all_branches(
- archive_loader.storage, hash_to_bytes(expected_snapshot_id)
- )
-
- first_revision = gnu_snapshot.branches[f"releases/{release}".encode("utf-8")]
-
# 2. Then ingest with the nixguix loader which lists the same artifact within its
# sources.json
@@ -634,73 +604,3 @@
snapshot_id = actual_load_status2["snapshot_id"]
snapshot = snapshot_get_all_branches(swh_storage, hash_to_bytes(snapshot_id))
assert snapshot
-
- # 3. Then ingest again with the nixguix loader, with a different snapshot
- # and different source
-
- # simulate a snapshot already seen with a revision with the wrong metadata structure
- # This revision should be skipped, thus making the artifact being ingested again.
- with patch(
- "swh.loader.package.loader.PackageLoader.last_snapshot"
- ) as last_snapshot:
- # mutate the snapshot to target a revision with the wrong metadata structure
- # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision
- old_revision = swh_storage.revision_get([first_revision.target])[0]
- # assert that revision is not in the right format
- assert old_revision.metadata["extrinsic"]["raw"].get("integrity", {}) == {}
-
- # mutate snapshot to create a clash
- snapshot = attr.evolve(
- snapshot,
- branches={
- **snapshot.branches,
- artifact_url.encode("utf-8"): SnapshotBranch(
- target_type=TargetType.REVISION,
- target=hash_to_bytes(old_revision.id),
- ),
- },
- )
-
- # modify snapshot to actually change revision metadata structure so we simulate
- # a revision written by somebody else (structure different)
- last_snapshot.return_value = snapshot
-
- loader = NixGuixLoader(swh_storage, sources_url)
- actual_load_status3 = loader.load()
- assert last_snapshot.called
- assert actual_load_status3["status"] == "eventful"
-
- assert_last_visit_matches(
- swh_storage, sources_url, status="full", type="nixguix"
- )
-
- new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965"
- assert actual_load_status3["snapshot_id"] == new_snapshot_id
-
- last_snapshot = snapshot_get_all_branches(
- swh_storage, hash_to_bytes(new_snapshot_id)
- )
- new_revision_branch = last_snapshot.branches[artifact_url.encode("utf-8")]
- assert new_revision_branch.target_type == TargetType.REVISION
-
- new_revision = swh_storage.revision_get([new_revision_branch.target])[0]
-
- # the new revision has the correct structure, so it got ingested alright by the
- # new run
- assert new_revision.metadata["extrinsic"]["raw"]["integrity"] is not None
-
- actual_detections: List[Dict] = []
- for record in caplog.records:
- logtext = record.getMessage()
- if "Unexpected metadata revision structure detected:" in logtext:
- actual_detections.append(record.args["context"])
-
- expected_detections = [
- {"reason": "'integrity'", "known_artifact": old_revision.metadata,},
- ]
-
- # fewer calls than there are sources listed in the sources.json,
- # as some of them are skipped using the ExtID from a previous run
- assert len(expected_detections) <= len(all_sources["sources"])
-
- assert actual_detections == expected_detections
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -141,16 +141,6 @@
)
yield release_name(version), p_info
- @staticmethod
- def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]:
- extid_str = _artifact_to_sha1(known_artifact)
- if extid_str is None:
- return None
- try:
- return (EXTID_TYPE, hash_to_bytes(extid_str))
- except ValueError:
- return None
-
def build_revision(
self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
@@ -183,56 +173,10 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {"tool": "package.json", "raw": i_metadata,},
- "extrinsic": {
- "provider": self.provider_url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
)
return r
-def _artifact_to_sha1(known_artifact: Dict) -> Optional[str]:
- """Returns the sha1 from an NPM 'original_artifact' dict
-
- The following code handles two metadata formats:
-
- - old format sample::
-
- {
- 'package_source': {
- 'sha1': '05181c12cd8c22035dd31155656826b85745da37',
- }
- }
-
- - new format sample::
-
- {
- 'original_artifact': [{
- 'checksums': {
- 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa
- ...
- },
- }],
- ...
- }
-
- """
- known_original_artifact = known_artifact.get("original_artifact")
- if not known_original_artifact:
- # previous loader-npm version kept original artifact elsewhere
- known_original_artifact = known_artifact.get("package_source")
- if not known_original_artifact:
- return None
- return known_original_artifact["sha1"]
- else:
- assert isinstance(known_original_artifact, list)
- return known_original_artifact[0]["checksums"]["sha1"]
-
-
def _author_str(author_data: Union[Dict, List, str]) -> str:
"""Parse author from package.json author fields
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -14,7 +14,6 @@
_author_str,
extract_npm_package_author,
)
-from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes
from swh.model.identifiers import (
@@ -308,37 +307,6 @@
return "https://replicate.npmjs.com/%s/" % package
-def test_npm_revision_metadata_structure(swh_storage, requests_mock_datadir):
- package = "org"
- loader = NpmLoader(swh_storage, package_url(package))
-
- actual_load_status = loader.load()
- assert actual_load_status["status"] == "eventful"
- assert actual_load_status["snapshot_id"] is not None
-
- expected_revision_id = hash_to_bytes("d8a1c7474d2956ac598a19f0f27d52f7015f117e")
- revision = swh_storage.revision_get([expected_revision_id])[0]
- assert revision is not None
-
- check_metadata_paths(
- revision.metadata,
- paths=[
- ("intrinsic.tool", str),
- ("intrinsic.raw", dict),
- ("extrinsic.provider", str),
- ("extrinsic.when", str),
- ("extrinsic.raw", dict),
- ("original_artifact", list),
- ],
- )
-
- for original_artifact in revision.metadata["original_artifact"]:
- check_metadata_paths(
- original_artifact,
- paths=[("filename", str), ("length", int), ("checksums", dict),],
- )
-
-
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
package = "org"
url = package_url(package)
@@ -544,41 +512,6 @@
check_snapshot(expected_snapshot, swh_storage)
-def test_npm__known_artifact_to_extid__old_loader_version():
- """Current loader version should parse old metadata scheme
-
- """
- assert (
- NpmLoader.known_artifact_to_extid(
- {"package_source": {"sha1": "something-wrong"}}
- )
- is None
- )
-
- sha1 = "05181c12cd8c22035dd31155656826b85745da37"
- assert NpmLoader.known_artifact_to_extid({"package_source": {"sha1": sha1,}}) == (
- "npm-archive-sha1",
- hash_to_bytes(sha1),
- )
-
-
-def test_npm__known_artifact_to_extid__current_loader_version():
- """Current loader version should be able to parse current metadata scheme
-
- """
- sha1 = "05181c12cd8c22035dd31155656826b85745da37"
- assert NpmLoader.known_artifact_to_extid(
- {"original_artifact": [{"checksums": {"sha1": sha1},}],}
- ) == ("npm-archive-sha1", hash_to_bytes(sha1))
-
- assert (
- NpmLoader.known_artifact_to_extid(
- {"original_artifact": [{"checksums": {"sha1": "something-wrong"},}],},
- )
- is None
- )
-
-
def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir):
"""Skip artifact with no intrinsic metadata during ingestion
diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py
--- a/swh/loader/package/pypi/loader.py
+++ b/swh/loader/package/pypi/loader.py
@@ -121,16 +121,6 @@
for version, p_info in res:
yield release_name(version, p_info.filename), p_info
- @staticmethod
- def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]:
- extid_str = _artifact_to_sha256(known_artifact)
- if extid_str is None:
- return None
- try:
- return (EXTID_TYPE, hash_to_bytes(extid_str)) if extid_str else None
- except ValueError:
- return None
-
def build_revision(
self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
@@ -157,61 +147,6 @@
parents=(),
directory=directory,
synthetic=True,
- metadata={
- "intrinsic": {"tool": "PKG-INFO", "raw": i_metadata,},
- "extrinsic": {
- "provider": self.provider_url,
- "when": self.visit_date.isoformat(),
- "raw": p_info.raw_info,
- },
- },
- )
-
-
-def _artifact_to_sha256(known_artifact: Dict) -> Optional[str]:
- """Returns the sha256 from a PyPI 'original_artifact' dict
-
- The following code handles two metadata formats (the metadata column
- in the 'revision' table)
-
- - old format sample::
-
- {
- 'original_artifact': {
- 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa
- ...
- },
- ...
- }
-
- - new format sample::
-
- {
- 'original_artifact': [{
- 'checksums': {
- 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa
- ...
- },
- }],
- ...
- }
-
- """
- original_artifact = known_artifact["original_artifact"]
- if isinstance(original_artifact, dict):
- # previous loader-pypi version stored metadata as dict
- return original_artifact["sha256"]
- # the current pypi loader stores the metadata dict differently...
- assert isinstance(original_artifact, list)
- # current loader-pypi stores metadata as a list of dicts
- if len(known_artifact["original_artifact"]) == 0:
- return None
- elif len(known_artifact["original_artifact"]) == 1:
- return original_artifact[0]["checksums"]["sha256"]
- else:
- raise ValueError(
- f"Expected exactly one PyPI original_artifact, got "
- f"{len(known_artifact['original_artifact'])}"
)
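Note: the removed PyPI helper above is the sha256 analogue of the npm one, with one extra constraint: the current layout must contain exactly one original_artifact entry. A condensed, self-contained restatement:

from typing import Dict, Optional

def artifact_sha256(known_artifact: Dict) -> Optional[str]:
    original = known_artifact["original_artifact"]
    if isinstance(original, dict):  # legacy layout: a single flat dict
        return original["sha256"]
    assert isinstance(original, list)
    if not original:
        return None
    if len(original) > 1:
        raise ValueError(
            f"Expected exactly one PyPI original_artifact, got {len(original)}"
        )
    return original[0]["checksums"]["sha256"]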
diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py
--- a/swh/loader/package/pypi/tests/test_pypi.py
+++ b/swh/loader/package/pypi/tests/test_pypi.py
@@ -19,7 +19,6 @@
extract_intrinsic_metadata,
pypi_api_url,
)
-from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes
from swh.model.identifiers import (
@@ -334,24 +333,6 @@
revision = swh_storage.revision_get([expected_revision_id])[0]
assert revision is not None
- check_metadata_paths(
- revision.metadata,
- paths=[
- ("intrinsic.tool", str),
- ("intrinsic.raw", dict),
- ("extrinsic.provider", str),
- ("extrinsic.when", str),
- ("extrinsic.raw", dict),
- ("original_artifact", list),
- ],
- )
-
- for original_artifact in revision.metadata["original_artifact"]:
- check_metadata_paths(
- original_artifact,
- paths=[("filename", str), ("length", int), ("checksums", dict),],
- )
-
revision_swhid = CoreSWHID(
object_type=ObjectType.REVISION, object_id=expected_revision_id
)
@@ -796,52 +777,6 @@
)
-def test_pypi__known_artifact_to_extid__old_loader_version():
- """Current loader version should solve old metadata scheme
-
- """
- assert (
- PyPILoader.known_artifact_to_extid(
- {"original_artifact": {"sha256": "something-wrong",},}
- )
- is None
- )
-
- sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec"
- assert PyPILoader.known_artifact_to_extid(
- {"original_artifact": {"sha256": sha256}}
- ) == ("pypi-archive-sha256", hash_to_bytes(sha256))
-
-
-def test_pypi__known_artifact_to_extid__current_loader_version():
- """Current loader version should be able to solve current metadata scheme
-
- """
- sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec"
-
- assert PyPILoader.known_artifact_to_extid(
- {"original_artifact": [{"checksums": {"sha256": sha256,},}],}
- ) == ("pypi-archive-sha256", hash_to_bytes(sha256))
-
- assert (
- PyPILoader.known_artifact_to_extid(
- {"original_artifact": [{"checksums": {"sha256": "something-wrong"},}],},
- )
- is None
- )
-
- # there should not be more than one artifact
- with pytest.raises(ValueError):
- PyPILoader.known_artifact_to_extid(
- {
- "original_artifact": [
- {"checksums": {"sha256": sha256,},},
- {"checksums": {"sha256": sha256,},},
- ],
- }
- )
-
-
def test_pypi_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir):
"""Skip artifact with no intrinsic metadata during ingestion
diff --git a/swh/loader/package/tests/common.py b/swh/loader/package/tests/common.py
--- a/swh/loader/package/tests/common.py
+++ b/swh/loader/package/tests/common.py
@@ -5,50 +5,8 @@
import logging
from os import path
-from typing import Dict, List, Tuple
logger = logging.getLogger(__file__)
DATADIR = path.join(path.abspath(path.dirname(__file__)), "resources")
-
-
-def check_metadata(metadata: Dict, key_path: str, raw_type: str):
- """Given a metadata dict, ensure the associated key_path value is of type
- raw_type.
-
- Args:
- metadata: Dict to check
- key_path: Path to check
- raw_type: Type to check the path with
-
- Raises:
- Assertion error in case of mismatch
-
- """
- data = metadata
- keys = key_path.split(".")
- for k in keys:
- try:
- data = data[k]
- except (TypeError, KeyError) as e:
- # KeyError: because path too long
- # TypeError: data is not a dict
- raise AssertionError(e)
- assert isinstance(data, raw_type) # type: ignore
-
-
-def check_metadata_paths(metadata: Dict, paths: List[Tuple[str, str]]):
- """Given a metadata dict, ensure the keys are of expected types
-
- Args:
- metadata: Dict to check
- paths: List of (key_path, expected_type) pairs to check
-
- Raises:
- Assertion error in case of mismatch
-
- """
- for key_path, raw_type in paths:
- check_metadata(metadata, key_path, raw_type)
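Note: check_metadata, deleted above, walked a dotted key path through nested dicts and type-checked the leaf; it is no longer needed once revisions carry no metadata. A self-contained restatement with a usage example:

from typing import Dict

def check_metadata(metadata: Dict, key_path: str, raw_type: type) -> None:
    data = metadata
    for key in key_path.split("."):
        try:
            data = data[key]
        except (TypeError, KeyError) as e:
            # KeyError: the path is too long; TypeError: data is not a dict
            raise AssertionError(e)
    assert isinstance(data, raw_type)

check_metadata({"extrinsic": {"raw": {"url": "u"}}}, "extrinsic.raw", dict)
check_metadata({"extrinsic": {"raw": {"url": "u"}}}, "extrinsic.raw.url", str)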
diff --git a/swh/loader/package/tests/test_common.py b/swh/loader/package/tests/test_common.py
deleted file mode 100644
--- a/swh/loader/package/tests/test_common.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import pytest
-
-from swh.loader.package.tests.common import check_metadata, check_metadata_paths
-
-
-def test_check_metadata():
- metadata = {
- "a": {"raw": {"time": "something",},},
- "b": [],
- "c": 1,
- }
-
- for raw_path, raw_type in [
- ("a.raw", dict),
- ("a.raw.time", str),
- ("b", list),
- ("c", int),
- ]:
- check_metadata(metadata, raw_path, raw_type)
-
-
-def test_check_metadata_ko():
- metadata = {
- "a": {"raw": "hello",},
- "b": [],
- "c": 1,
- }
-
- for raw_path, raw_type in [
- ("a.b", dict),
- ("a.raw.time", str),
- ]:
- with pytest.raises(AssertionError):
- check_metadata(metadata, raw_path, raw_type)
-
-
-def test_check_metadata_paths():
- metadata = {
- "a": {"raw": {"time": "something",},},
- "b": [],
- "c": 1,
- }
-
- check_metadata_paths(
- metadata, [("a.raw", dict), ("a.raw.time", str), ("b", list), ("c", int),]
- )
-
-
-def test_check_metadata_paths_ko():
- metadata = {
- "a": {"raw": "hello",},
- "b": [],
- "c": 1,
- }
-
- with pytest.raises(AssertionError):
- check_metadata_paths(metadata, [("a.b", dict), ("a.raw.time", str),])
diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py
--- a/swh/loader/package/tests/test_loader.py
+++ b/swh/loader/package/tests/test_loader.py
@@ -83,51 +83,6 @@
assert actual_load_status2 == {"status": "failed"}
-def test_resolve_revision_from_artifacts() -> None:
- loader = PackageLoader(None, None) # type: ignore
- loader.known_artifact_to_extid = Mock( # type: ignore
- wraps=lambda known_artifact: ("extid-type", known_artifact["key"].encode())
- )
-
- known_artifacts = {
- b"a" * 20: {"key": "extid-of-aaaa"},
- b"b" * 20: {"key": "extid-of-bbbb"},
- }
-
- p_info = Mock(wraps=BasePackageInfo(None, None)) # type: ignore
-
- # No known artifact -> it would be useless to compute the extid
- assert loader.resolve_revision_from_artifacts({}, p_info) is None
- p_info.extid.assert_not_called()
- loader.known_artifact_to_extid.assert_not_called()
-
- p_info.extid.reset_mock()
-
- # Some artifacts, but the PackageInfo does not support extids
- p_info.extid.return_value = None
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None
- p_info.extid.assert_called_once()
- loader.known_artifact_to_extid.assert_not_called()
-
- p_info.extid.reset_mock()
-
- # Some artifacts, and the PackageInfo is not one of them (ie. cache miss)
- p_info.extid.return_value = ("extid-type", b"extid-of-cccc")
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None
- p_info.extid.assert_called_once()
- loader.known_artifact_to_extid.assert_any_call({"key": "extid-of-aaaa"})
- loader.known_artifact_to_extid.assert_any_call({"key": "extid-of-bbbb"})
-
- p_info.extid.reset_mock()
- loader.known_artifact_to_extid.reset_mock()
-
- # Some artifacts, and the PackageInfo is one of them (ie. cache hit)
- p_info.extid.return_value = ("extid-type", b"extid-of-aaaa")
- assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) == b"a" * 20
- p_info.extid.assert_called_once()
- loader.known_artifact_to_extid.assert_called_once_with({"key": "extid-of-aaaa"})
-
-
def test_resolve_revision_from_extids() -> None:
loader = PackageLoader(None, None) # type: ignore