Page MenuHomeSoftware Heritage

D3905.id13830.diff
No OneTemporary

D3905.id13830.diff

diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
--- a/swh/storage/migrate_extrinsic_metadata.py
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -43,9 +43,12 @@
MetadataFetcher,
MetadataTargetType,
RawExtrinsicMetadata,
+ RevisionType,
Sha1Git,
)
from swh.storage import get_storage
+from swh.storage.algos.origin import iter_origin_visits, iter_origin_visit_statuses
+from swh.storage.algos.snapshot import snapshot_get_all_branches
# XML namespaces and fields for metadata coming from the deposit:
@@ -160,6 +163,69 @@
del metadata[key]
+def _check_revision_in_origin(storage, origin, revision_id):
+ seen_snapshots = set() # no need to visit them again
+ seen_revisions = set()
+
+ for visit in iter_origin_visits(storage, origin):
+ for status in iter_origin_visit_statuses(storage, origin, visit.visit):
+ if status.snapshot is None:
+ continue
+ if status.snapshot in seen_snapshots:
+ continue
+ seen_snapshots.add(status.snapshot)
+ snapshot = snapshot_get_all_branches(storage, status.snapshot)
+ for (branch_name, branch) in snapshot.branches.items():
+ branch_target = branch.target
+
+ # If it's the revision passed as argument, then it is indeed in the
+ # origin
+ if branch_target == revision_id:
+ return True
+
+ # Else, let's make sure the branch doesn't have any other revision
+
+ # Get the revision at the top of the branch.
+ if branch_target in seen_revisions:
+ continue
+ seen_revisions.add(branch_target)
+ revision = storage.revision_get([branch_target])[0]
+
+ # Check it's DSC (we only support those for now)
+ assert (
+ revision.type == RevisionType.DSC
+ ), "non-DSC revisions are not supported"
+
+ # Check it doesn't have parents (else we would have to
+ # recurse)
+ assert revision.parents == (), "DSC revision with parents"
+
+ return False
+
+
+def debian_origins_from_row(row, storage):
+ """Guesses a Debian origin from a row. May return an empty list if it
+ cannot reliably guess it, but all results are guaranteed to be correct."""
+ filenames = [entry["filename"] for entry in row["metadata"]["original_artifact"]]
+ package_names = {filename.split("_")[0] for filename in filenames}
+ assert len(package_names) == 1, package_names
+ (package_name,) = package_names
+
+ candidate_origins = [
+ f"deb://Debian/packages/{package_name}",
+ f"deb://Debian-Security/packages/{package_name}",
+ f"http://snapshot.debian.org/package/{package_name}/",
+ ]
+
+ found_origins = []
+
+ for origin in candidate_origins:
+ if _check_revision_in_origin(storage, origin, row["id"]):
+ found_origins.append(origin)
+
+ return found_origins
+
+
# Cache of origins that are known to exist
_origins = set()
@@ -378,11 +444,10 @@
return
if type_ == "dsc":
- origin = None # TODO: I can't find how to get it reliably
+ origin = None # it will be defined later, using debian_origins_from_row
# TODO: the debian loader writes the changelog date as the revision's
- # author date and committer date. Instead, we should use the visit's date,
- # but I cannot find a way to reliably get it without the origin
+ # author date and committer date. Instead, we should use the visit's date
if "extrinsic" in metadata:
extrinsic_files = metadata["extrinsic"]["raw"]["files"]
@@ -805,16 +870,24 @@
}
assert set(original_artifact) <= allowed_keys, set(original_artifact)
- load_metadata(
- storage,
- row["id"],
- discovery_date,
- metadata["original_artifact"],
- ORIGINAL_ARTIFACT_FORMAT,
- authority=AUTHORITIES["swh"],
- origin=origin,
- dry_run=dry_run,
- )
+ if type_ == "dsc":
+ assert origin is None
+ origins = debian_origins_from_row(row, storage)
+ assert origins, row
+ else:
+ origins = [origin]
+
+ for origin in origins:
+ load_metadata(
+ storage,
+ row["id"],
+ discovery_date,
+ metadata["original_artifact"],
+ ORIGINAL_ARTIFACT_FORMAT,
+ authority=AUTHORITIES["swh"],
+ origin=origin,
+ dry_run=dry_run,
+ )
del metadata["original_artifact"]
assert metadata == {}, (
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
@@ -9,7 +9,10 @@
import copy
import datetime
import json
-from unittest.mock import call, Mock
+from unittest.mock import call, Mock, patch as _patch
+
+import attr
+import pytest
from swh.model.identifiers import parse_swhid
from swh.model.model import (
@@ -17,10 +20,21 @@
MetadataAuthorityType,
MetadataFetcher,
MetadataTargetType,
+ OriginVisit,
+ OriginVisitStatus,
+ Person,
RawExtrinsicMetadata,
+ Revision,
+ RevisionType,
+ Snapshot,
+ SnapshotBranch,
+ TargetType,
+ Timestamp,
+ TimestampWithTimezone,
)
-from swh.storage.migrate_extrinsic_metadata import handle_row
+from swh.storage.interface import ListOrder, PagedResult
+from swh.storage.migrate_extrinsic_metadata import handle_row, debian_origins_from_row
FETCHER = MetadataFetcher(
@@ -33,6 +47,283 @@
)
+def patch(function_name, *args, **kwargs):
+ # It's a long name, this function spares some line breaks in 'with' statements
+ return _patch(
+ "swh.storage.migrate_extrinsic_metadata." + function_name, *args, **kwargs
+ )
+
+
+def test_debian_origins_from_row():
+ """Tests debian_origins_from_row on a real example (with some parts
+ omitted, for conciseness)."""
+ origin_url = "deb://Debian/packages/kalgebra"
+
+ visit = OriginVisit(
+ origin=origin_url,
+ date=datetime.datetime(
+ 2020, 1, 27, 19, 32, 3, 925498, tzinfo=datetime.timezone.utc,
+ ),
+ type="deb",
+ visit=280,
+ )
+
+ def mock_origin_visit_get(origin, page_token, order):
+ if origin in (
+ "deb://Debian-Security/packages/kalgebra",
+ "http://snapshot.debian.org/package/kalgebra/",
+ ):
+ return PagedResult(results=[], next_page_token=None)
+ elif origin == "deb://Debian/packages/kalgebra":
+ if page_token == None:
+ return PagedResult(
+ # ...
+ results=[visit,],
+ next_page_token="280",
+ )
+ elif page_token == "280":
+ return PagedResult(results=[], next_page_token=None)
+ else:
+ assert False, page_token
+ else:
+ assert False, origin
+
+ storage = Mock()
+
+ storage.origin_visit_get.side_effect = mock_origin_visit_get
+
+ storage.origin_visit_status_get.return_value = PagedResult(
+ results=[
+ OriginVisitStatus(
+ origin=origin_url,
+ visit=280,
+ date=datetime.datetime(
+ 2020, 1, 27, 19, 32, 3, 925498, tzinfo=datetime.timezone.utc
+ ),
+ status="full",
+ snapshot=b"\xafD\x15\x98){\xd4$\xdeI\x1f\xbe\x95lh`x\x14\xce\xc4",
+ metadata=None,
+ )
+ ],
+ next_page_token=None,
+ )
+
+ snapshot = Snapshot(
+ id=b"\xafD\x15\x98){\xd4$\xdeI\x1f\xbe\x95lh`x\x14\xce\xc4",
+ branches={
+ # ...
+ b"releases/unstable/main/4:19.12.1-1": SnapshotBranch(
+ target=b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ target_type=TargetType.REVISION,
+ ),
+ },
+ )
+
+ revision_row = {
+ "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ "metadata": {
+ # ...
+ "original_artifact": [
+ {
+ "filename": "kalgebra_19.12.1-1.dsc",
+ # ...
+ },
+ ]
+ },
+ }
+
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ assert debian_origins_from_row(revision_row, storage) == [origin_url]
+
+ assert storage.method_calls == [
+ call.origin_visit_get(
+ "deb://Debian/packages/kalgebra", order=ListOrder.ASC, page_token=None
+ ),
+ call.origin_visit_status_get(
+ "deb://Debian/packages/kalgebra", 280, order=ListOrder.ASC, page_token=None
+ ),
+ call.origin_visit_get(
+ "deb://Debian-Security/packages/kalgebra",
+ order=ListOrder.ASC,
+ page_token=None,
+ ),
+ call.origin_visit_get(
+ "http://snapshot.debian.org/package/kalgebra/",
+ order=ListOrder.ASC,
+ page_token=None,
+ ),
+ ]
+
+
+def test_debian_origins_from_row__no_result():
+ """Tests debian_origins_from_row when there's no origin, visit, status,
+ snapshot, branch, or matching branch.
+ """
+ storage = Mock()
+
+ origin_url = "deb://Debian/packages/kalgebra"
+
+ revision_row = {
+ "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc",},]},
+ }
+
+ # no visit
+ with patch("iter_origin_visits", return_value=[]):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ assert storage.method_calls == []
+
+ visit = OriginVisit(
+ origin=origin_url,
+ date=datetime.datetime.now(tz=datetime.timezone.utc),
+ type="deb",
+ visit=280,
+ )
+
+ # no status
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[]):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ assert storage.method_calls == []
+
+ status = OriginVisitStatus(
+ origin=origin_url,
+ visit=280,
+ date=datetime.datetime.now(tz=datetime.timezone.utc),
+ status="full",
+ snapshot=None,
+ metadata=None,
+ )
+
+ # no snapshot
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ assert storage.method_calls == []
+
+ status = attr.evolve(status, snapshot=b"42" * 10)
+
+ snapshot = Snapshot(id=b"42" * 10, branches={},)
+
+ # no branch
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ revision = Revision(
+ id=b"21" * 10,
+ message=b"foo",
+ author=Person.from_fullname(b"foo"),
+ committer=Person.from_fullname(b"foo"),
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1580076204, microseconds=0),
+ offset=60,
+ negative_utc=False,
+ ),
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1580076204, microseconds=0),
+ offset=60,
+ negative_utc=False,
+ ),
+ type=RevisionType.DSC,
+ directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb",
+ synthetic=True,
+ metadata=None,
+ parents=(),
+ extra_headers=(),
+ )
+
+ storage.revision_get.return_value = [revision]
+
+ # no matching branch
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ assert storage.method_calls == []
+
+
+def test_debian_origins_from_row__check_revisions():
+ """Tests debian_origins_from_row errors when the revision at the head
+ of a branch is a DSC and has no parents
+ """
+ storage = Mock()
+
+ origin_url = "deb://Debian/packages/kalgebra"
+
+ revision_row = {
+ "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc",},]},
+ }
+
+ visit = OriginVisit(
+ origin=origin_url,
+ date=datetime.datetime.now(tz=datetime.timezone.utc),
+ type="deb",
+ visit=280,
+ )
+
+ status = OriginVisitStatus(
+ origin=origin_url,
+ visit=280,
+ date=datetime.datetime.now(tz=datetime.timezone.utc),
+ status="full",
+ snapshot=b"42" * 10,
+ metadata=None,
+ )
+ snapshot = Snapshot(
+ id=b"42" * 10,
+ branches={
+ b"foo": SnapshotBranch(target_type=TargetType.REVISION, target=b"21" * 10)
+ },
+ )
+
+ revision = Revision(
+ id=b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ message=b"foo",
+ author=Person.from_fullname(b"foo"),
+ committer=Person.from_fullname(b"foo"),
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1580076204, microseconds=0),
+ offset=60,
+ negative_utc=False,
+ ),
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1580076204, microseconds=0),
+ offset=60,
+ negative_utc=False,
+ ),
+ type=RevisionType.DSC,
+ directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb",
+ synthetic=True,
+ metadata=None,
+ parents=(b"parent " * 2,),
+ extra_headers=(),
+ )
+
+ storage.revision_get.return_value = [revision]
+
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ with pytest.raises(AssertionError, match="DSC revision with parents"):
+ debian_origins_from_row(revision_row, storage)
+
+ revision = attr.evolve(revision, type=RevisionType.GIT)
+ storage.revision_get.return_value = [revision]
+
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ with pytest.raises(AssertionError, match="non-DSC revision"):
+ debian_origins_from_row(revision_row, storage)
+
+
def test_debian_with_extrinsic():
dest_original_artifacts = [
{
@@ -138,9 +429,14 @@
},
}
+ origin_url = "deb://Debian/packages/kalgebra"
+
storage = Mock()
+
deposit_cur = None
- handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ with patch("debian_origins_from_row", return_value=[origin_url]):
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
assert storage.method_calls == [
call.raw_extrinsic_metadata_add(
@@ -157,9 +453,10 @@
fetcher=FETCHER,
format="original-artifacts-json",
metadata=json.dumps(dest_original_artifacts).encode(),
+ origin=origin_url,
),
]
- )
+ ),
]
@@ -249,8 +546,12 @@
}
storage = Mock()
+
+ origin_url = "http://snapshot.debian.org/package/pymongo"
+
deposit_cur = None
- handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+ with patch("debian_origins_from_row", return_value=[origin_url]):
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
assert storage.method_calls == [
call.raw_extrinsic_metadata_add(
@@ -267,6 +568,7 @@
fetcher=FETCHER,
format="original-artifacts-json",
metadata=json.dumps(dest_original_artifacts).encode(),
+ origin=origin_url,
),
]
)

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 5:06 PM (3 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227020

Event Timeline