D2949.diff

diff --git a/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1
@@ -0,0 +1 @@
+gnu_8sync_8sync-0.1.0.tar.gz
\ No newline at end of file
diff --git a/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2
@@ -0,0 +1 @@
+gnu_8sync_8sync-0.1.0.tar.gz
\ No newline at end of file
diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -97,11 +97,27 @@
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict
) -> Optional[bytes]:
-
for rev_id, known_artifact in known_artifacts.items():
- known_integrity = known_artifact["extrinsic"]["raw"]["integrity"]
- if artifact_metadata["integrity"] == known_integrity:
- return rev_id
+ try:
+ known_integrity = known_artifact["extrinsic"]["raw"]["integrity"]
+ except KeyError as e:
+ logger.exception(
+ "Unexpected metadata revision structure detected: %(context)s",
+ {
+ "context": {
+ "revision": hashutil.hash_to_hex(rev_id),
+ "reason": str(e),
+ "known_artifact": known_artifact,
+ }
+ },
+ )
+            # The revision metadata does not have the structure the nixguix
+            # loader expects; consider it not the right revision and continue
+            # checking the other revisions
+ continue
+ else:
+ if artifact_metadata["integrity"] == known_integrity:
+ return rev_id
return None
def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
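
Note on the hunk above: resolve_revision_from now tolerates known revisions whose metadata was written by another loader (for example the archive loader) and therefore lacks the extrinsic/raw/integrity layout. A minimal standalone sketch of the new behavior, with simplified dict inputs and without the logging (not part of the patch):

    from typing import Dict, Optional

    def resolve_revision_from_sketch(
        known_artifacts: Dict[bytes, Dict], artifact_metadata: Dict
    ) -> Optional[bytes]:
        """Return the known revision whose recorded integrity matches,
        skipping revisions whose metadata lacks the nixguix layout."""
        for rev_id, known_artifact in known_artifacts.items():
            try:
                known_integrity = known_artifact["extrinsic"]["raw"]["integrity"]
            except KeyError:
                # e.g. a revision ingested by the archive loader: no "integrity"
                continue
            if artifact_metadata["integrity"] == known_integrity:
                return rev_id
        return None

    # A revision written by another loader never matches, but no longer makes
    # the resolution crash:
    assert resolve_revision_from_sketch(
        {b"rev-1": {"extrinsic": {"raw": {"filename": "8sync-0.1.0.tar.gz"}}}},
        {"integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="},
    ) is None
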
diff --git a/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org
new file mode 120000
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_ftp.gnu.org
@@ -0,0 +1 @@
+../../../archive/tests/data/https_ftp.gnu.org
\ No newline at end of file
diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json
@@ -0,0 +1,16 @@
+{
+ "sources": [
+ {
+ "type": "url",
+ "urls": [ "https://github.com/owner-1/repository-1/revision-1.tgz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ }
+ ],
+ "version": 1,
+ "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
diff --git a/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1
@@ -0,0 +1,16 @@
+{
+ "sources": [
+ {
+ "type": "url",
+ "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" ],
+ "integrity": "sha256-3vm2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ },
+ {
+ "type": "url",
+ "urls": [ "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz" ],
+ "integrity": "sha256-4wn2Nt+O4zHf3Ovd/qsv1gKTEUwodX9FLxlrQdry0zs="
+ }
+ ],
+ "version": 1,
+ "revision": "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"
+}
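
Both fixtures above follow the sources.json layout the loader consumes: a top-level "version" and "revision", plus a list of "sources" entries, each carrying a "type", a list of "urls" and an "integrity" hash. A small sketch of reading such a fixture (local file name assumed, not part of the patch):

    import json

    # Hypothetical local copy of one of the fixtures added above.
    with open("nixpkgs-swh_sources_special.json") as f:
        sources = json.load(f)

    assert sources["version"] == 1
    for source in sources["sources"]:
        # every entry provides at least one URL and an integrity hash
        assert source["type"] == "url"
        assert source["urls"] and source["integrity"].startswith("sha256-")
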
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -3,12 +3,19 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import os
+import json
+import logging
+
import pytest
+from json.decoder import JSONDecodeError
from typing import Dict, Optional, Tuple
-from json.decoder import JSONDecodeError
+from unittest.mock import patch
+from swh.model.model import Snapshot
+from swh.loader.package.archive.loader import ArchiveLoader
from swh.loader.package.nixguix.loader import (
NixGuixLoader,
retrieve_sources,
@@ -17,6 +24,7 @@
from swh.loader.package.tests.common import get_stats, check_snapshot
from swh.loader.package.utils import download
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.storage.exc import HashCollision
sources_url = "https://nix-community.github.io/nixpkgs-swh/sources.json"
@@ -391,3 +399,122 @@
# The visit is partial because some hash collision were detected
assert origin_visit["status"] == "partial"
assert origin_visit["type"] == "nixguix"
+
+
+def test_load_nixguix_one_common_artifact_from_other_loader(
+ swh_config, datadir, requests_mock_datadir_visits, caplog
+):
+ """Misformatted revision should be caught and logged, then loading continues
+
+ """
+ caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader")
+
+ # 1. first ingest with for example the archive loader
+ gnu_url = "https://ftp.gnu.org/gnu/8sync/"
+ release = "0.1.0"
+ artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz"
+ gnu_artifacts = [
+ {
+ "time": 944729610,
+ "url": artifact_url,
+ "length": 221837,
+ "filename": f"8sync-{release}.tar.gz",
+ "version": release,
+ }
+ ]
+ archive_loader = ArchiveLoader(url=gnu_url, artifacts=gnu_artifacts)
+ actual_load_status = archive_loader.load()
+ expected_snapshot_id = "c419397fd912039825ebdbea378bc6283f006bf5"
+ assert actual_load_status["status"] == "eventful"
+ assert actual_load_status["snapshot_id"] == expected_snapshot_id # noqa
+
+ gnu_snapshot = archive_loader.storage.snapshot_get(
+ hash_to_bytes(expected_snapshot_id)
+ )
+
+ first_revision = gnu_snapshot["branches"][f"releases/{release}".encode("utf-8")]
+
+ # 2. Then ingest with the nixguix loader which lists the same artifact within its
+ # sources.json
+
+ # ensure test setup is ok
+ data_sources = os.path.join(
+ datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json"
+ )
+ all_sources = json.loads(open(data_sources).read())
+ found = False
+ for source in all_sources["sources"]:
+ if source["urls"][0] == artifact_url:
+ found = True
+ assert (
+ found is True
+ ), f"test setup error: {artifact_url} must be in {data_sources}"
+
+ # first visit with a snapshot, ok
+ sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json"
+ loader = NixGuixLoader(sources_url)
+ actual_load_status2 = loader.load()
+ assert actual_load_status2["status"] == "eventful"
+
+ snapshot_id = actual_load_status2["snapshot_id"]
+ snapshot = loader.storage.snapshot_get(hash_to_bytes(snapshot_id))
+ snapshot.pop("next_branch") # snapshot_get endpoint detail to drop
+
+ # simulate a snapshot already seen with a revision with the wrong metadata structure
+ # This revision should be skipped, thus making the artifact being ingested again.
+ with patch(
+ "swh.loader.package.loader.PackageLoader.last_snapshot"
+ ) as last_snapshot:
+ # mutate the snapshot to target a revision with the wrong metadata structure
+ # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision
+ old_revision = next(loader.storage.revision_get([first_revision["target"]]))
+ # assert that revision is not in the right format
+ assert old_revision["metadata"]["extrinsic"]["raw"].get("integrity", {}) == {}
+
+ # mutate snapshot to create a clash
+ snapshot["branches"][artifact_url.encode("utf-8")] = {
+ "target_type": "revision",
+ "target": old_revision["id"],
+ }
+
+        # return the mutated snapshot as the last visit's snapshot, simulating
+        # a revision written by another loader (different metadata structure)
+ last_snapshot.return_value = Snapshot.from_dict(snapshot)
+
+ loader = NixGuixLoader(sources_url)
+ actual_load_status3 = loader.load()
+ assert last_snapshot.called
+ assert actual_load_status3["status"] == "eventful"
+
+ new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965"
+ assert actual_load_status3["snapshot_id"] == new_snapshot_id
+
+ last_snapshot = loader.storage.snapshot_get(hash_to_bytes(new_snapshot_id))
+ new_revision_branch = last_snapshot["branches"][artifact_url.encode("utf-8")]
+ assert new_revision_branch["target_type"] == "revision"
+
+ new_revision = next(
+ loader.storage.revision_get([new_revision_branch["target"]])
+ )
+
+ # the new revision has the correct structure, so it got ingested alright by the
+ # new run
+ assert new_revision["metadata"]["extrinsic"]["raw"]["integrity"] is not None
+
+ nb_detections = 0
+ actual_detection: Dict
+ for record in caplog.records:
+ logtext = record.getMessage()
+ if "Unexpected metadata revision structure detected:" in logtext:
+ nb_detections += 1
+ actual_detection = record.args["context"]
+
+ assert actual_detection
+ # as many calls as there are sources listed in the sources.json
+ assert nb_detections == len(all_sources["sources"])
+
+ assert actual_detection == {
+ "revision": hash_to_hex(old_revision["id"]),
+ "reason": "'integrity'",
+ "known_artifact": old_revision["metadata"],
+ }
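
The test recovers the logged context through record.args: the loader passes a single mapping as the logging argument, which the standard library exposes unchanged as record.args on the emitted LogRecord and uses for the %(context)s formatting. A short illustration of that logging pattern (values made up):

    import logging

    logger = logging.getLogger("swh.loader.package.nixguix.loader")

    # A single mapping argument serves both %(key)s formatting and remains
    # available as record.args to handlers and to pytest's caplog.
    logger.error(
        "Unexpected metadata revision structure detected: %(context)s",
        {"context": {"revision": "abc123", "reason": "'integrity'"}},
    )
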
