Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9347195
D3905.id13830.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
16 KB
Subscribers
None
D3905.id13830.diff
View Options
diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
--- a/swh/storage/migrate_extrinsic_metadata.py
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -43,9 +43,12 @@
MetadataFetcher,
MetadataTargetType,
RawExtrinsicMetadata,
+ RevisionType,
Sha1Git,
)
from swh.storage import get_storage
+from swh.storage.algos.origin import iter_origin_visits, iter_origin_visit_statuses
+from swh.storage.algos.snapshot import snapshot_get_all_branches
# XML namespaces and fields for metadata coming from the deposit:
@@ -160,6 +163,69 @@
del metadata[key]
+def _check_revision_in_origin(storage, origin, revision_id):
+ seen_snapshots = set() # no need to visit them again
+ seen_revisions = set()
+
+ for visit in iter_origin_visits(storage, origin):
+ for status in iter_origin_visit_statuses(storage, origin, visit.visit):
+ if status.snapshot is None:
+ continue
+ if status.snapshot in seen_snapshots:
+ continue
+ seen_snapshots.add(status.snapshot)
+ snapshot = snapshot_get_all_branches(storage, status.snapshot)
+ for (branch_name, branch) in snapshot.branches.items():
+ branch_target = branch.target
+
+ # If it's the revision passed as argument, then it is indeed in the
+ # origin
+ if branch_target == revision_id:
+ return True
+
+ # Else, let's make sure the branch doesn't have any other revision
+
+ # Get the revision at the top of the branch.
+ if branch_target in seen_revisions:
+ continue
+ seen_revisions.add(branch_target)
+ revision = storage.revision_get([branch_target])[0]
+
+ # Check it's DSC (we only support those for now)
+ assert (
+ revision.type == RevisionType.DSC
+ ), "non-DSC revisions are not supported"
+
+ # Check it doesn't have parents (else we would have to
+ # recurse)
+ assert revision.parents == (), "DSC revision with parents"
+
+ return False
+
+
+def debian_origins_from_row(row, storage):
+ """Guesses a Debian origin from a row. May return an empty list if it
+ cannot reliably guess it, but all results are guaranteed to be correct."""
+ filenames = [entry["filename"] for entry in row["metadata"]["original_artifact"]]
+ package_names = {filename.split("_")[0] for filename in filenames}
+ assert len(package_names) == 1, package_names
+ (package_name,) = package_names
+
+ candidate_origins = [
+ f"deb://Debian/packages/{package_name}",
+ f"deb://Debian-Security/packages/{package_name}",
+ f"http://snapshot.debian.org/package/{package_name}/",
+ ]
+
+ found_origins = []
+
+ for origin in candidate_origins:
+ if _check_revision_in_origin(storage, origin, row["id"]):
+ found_origins.append(origin)
+
+ return found_origins
+
+
# Cache of origins that are known to exist
_origins = set()
@@ -378,11 +444,10 @@
return
if type_ == "dsc":
- origin = None # TODO: I can't find how to get it reliably
+ origin = None # it will be defined later, using debian_origins_from_row
# TODO: the debian loader writes the changelog date as the revision's
- # author date and committer date. Instead, we should use the visit's date,
- # but I cannot find a way to reliably get it without the origin
+ # author date and committer date. Instead, we should use the visit's date
if "extrinsic" in metadata:
extrinsic_files = metadata["extrinsic"]["raw"]["files"]
@@ -805,16 +870,24 @@
}
assert set(original_artifact) <= allowed_keys, set(original_artifact)
- load_metadata(
- storage,
- row["id"],
- discovery_date,
- metadata["original_artifact"],
- ORIGINAL_ARTIFACT_FORMAT,
- authority=AUTHORITIES["swh"],
- origin=origin,
- dry_run=dry_run,
- )
+ if type_ == "dsc":
+ assert origin is None
+ origins = debian_origins_from_row(row, storage)
+ assert origins, row
+ else:
+ origins = [origin]
+
+ for origin in origins:
+ load_metadata(
+ storage,
+ row["id"],
+ discovery_date,
+ metadata["original_artifact"],
+ ORIGINAL_ARTIFACT_FORMAT,
+ authority=AUTHORITIES["swh"],
+ origin=origin,
+ dry_run=dry_run,
+ )
del metadata["original_artifact"]
assert metadata == {}, (
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
@@ -9,7 +9,10 @@
import copy
import datetime
import json
-from unittest.mock import call, Mock
+from unittest.mock import call, Mock, patch as _patch
+
+import attr
+import pytest
from swh.model.identifiers import parse_swhid
from swh.model.model import (
@@ -17,10 +20,21 @@
MetadataAuthorityType,
MetadataFetcher,
MetadataTargetType,
+ OriginVisit,
+ OriginVisitStatus,
+ Person,
RawExtrinsicMetadata,
+ Revision,
+ RevisionType,
+ Snapshot,
+ SnapshotBranch,
+ TargetType,
+ Timestamp,
+ TimestampWithTimezone,
)
-from swh.storage.migrate_extrinsic_metadata import handle_row
+from swh.storage.interface import ListOrder, PagedResult
+from swh.storage.migrate_extrinsic_metadata import handle_row, debian_origins_from_row
FETCHER = MetadataFetcher(
@@ -33,6 +47,283 @@
)
+def patch(function_name, *args, **kwargs):
+ # It's a long name, this function spares some line breaks in 'with' statements
+ return _patch(
+ "swh.storage.migrate_extrinsic_metadata." + function_name, *args, **kwargs
+ )
+
+
+def test_debian_origins_from_row():
+ """Tests debian_origins_from_row on a real example (with some parts
+ omitted, for conciseness)."""
+ origin_url = "deb://Debian/packages/kalgebra"
+
+ visit = OriginVisit(
+ origin=origin_url,
+ date=datetime.datetime(
+ 2020, 1, 27, 19, 32, 3, 925498, tzinfo=datetime.timezone.utc,
+ ),
+ type="deb",
+ visit=280,
+ )
+
+ def mock_origin_visit_get(origin, page_token, order):
+ if origin in (
+ "deb://Debian-Security/packages/kalgebra",
+ "http://snapshot.debian.org/package/kalgebra/",
+ ):
+ return PagedResult(results=[], next_page_token=None)
+ elif origin == "deb://Debian/packages/kalgebra":
+ if page_token == None:
+ return PagedResult(
+ # ...
+ results=[visit,],
+ next_page_token="280",
+ )
+ elif page_token == "280":
+ return PagedResult(results=[], next_page_token=None)
+ else:
+ assert False, page_token
+ else:
+ assert False, origin
+
+ storage = Mock()
+
+ storage.origin_visit_get.side_effect = mock_origin_visit_get
+
+ storage.origin_visit_status_get.return_value = PagedResult(
+ results=[
+ OriginVisitStatus(
+ origin=origin_url,
+ visit=280,
+ date=datetime.datetime(
+ 2020, 1, 27, 19, 32, 3, 925498, tzinfo=datetime.timezone.utc
+ ),
+ status="full",
+ snapshot=b"\xafD\x15\x98){\xd4$\xdeI\x1f\xbe\x95lh`x\x14\xce\xc4",
+ metadata=None,
+ )
+ ],
+ next_page_token=None,
+ )
+
+ snapshot = Snapshot(
+ id=b"\xafD\x15\x98){\xd4$\xdeI\x1f\xbe\x95lh`x\x14\xce\xc4",
+ branches={
+ # ...
+ b"releases/unstable/main/4:19.12.1-1": SnapshotBranch(
+ target=b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ target_type=TargetType.REVISION,
+ ),
+ },
+ )
+
+ revision_row = {
+ "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ "metadata": {
+ # ...
+ "original_artifact": [
+ {
+ "filename": "kalgebra_19.12.1-1.dsc",
+ # ...
+ },
+ ]
+ },
+ }
+
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ assert debian_origins_from_row(revision_row, storage) == [origin_url]
+
+ assert storage.method_calls == [
+ call.origin_visit_get(
+ "deb://Debian/packages/kalgebra", order=ListOrder.ASC, page_token=None
+ ),
+ call.origin_visit_status_get(
+ "deb://Debian/packages/kalgebra", 280, order=ListOrder.ASC, page_token=None
+ ),
+ call.origin_visit_get(
+ "deb://Debian-Security/packages/kalgebra",
+ order=ListOrder.ASC,
+ page_token=None,
+ ),
+ call.origin_visit_get(
+ "http://snapshot.debian.org/package/kalgebra/",
+ order=ListOrder.ASC,
+ page_token=None,
+ ),
+ ]
+
+
+def test_debian_origins_from_row__no_result():
+ """Tests debian_origins_from_row when there's no origin, visit, status,
+ snapshot, branch, or matching branch.
+ """
+ storage = Mock()
+
+ origin_url = "deb://Debian/packages/kalgebra"
+
+ revision_row = {
+ "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc",},]},
+ }
+
+ # no visit
+ with patch("iter_origin_visits", return_value=[]):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ assert storage.method_calls == []
+
+ visit = OriginVisit(
+ origin=origin_url,
+ date=datetime.datetime.now(tz=datetime.timezone.utc),
+ type="deb",
+ visit=280,
+ )
+
+ # no status
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[]):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ assert storage.method_calls == []
+
+ status = OriginVisitStatus(
+ origin=origin_url,
+ visit=280,
+ date=datetime.datetime.now(tz=datetime.timezone.utc),
+ status="full",
+ snapshot=None,
+ metadata=None,
+ )
+
+ # no snapshot
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ assert storage.method_calls == []
+
+ status = attr.evolve(status, snapshot=b"42" * 10)
+
+ snapshot = Snapshot(id=b"42" * 10, branches={},)
+
+ # no branch
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ revision = Revision(
+ id=b"21" * 10,
+ message=b"foo",
+ author=Person.from_fullname(b"foo"),
+ committer=Person.from_fullname(b"foo"),
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1580076204, microseconds=0),
+ offset=60,
+ negative_utc=False,
+ ),
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1580076204, microseconds=0),
+ offset=60,
+ negative_utc=False,
+ ),
+ type=RevisionType.DSC,
+ directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb",
+ synthetic=True,
+ metadata=None,
+ parents=(),
+ extra_headers=(),
+ )
+
+ storage.revision_get.return_value = [revision]
+
+ # no matching branch
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ assert debian_origins_from_row(revision_row, storage) == []
+
+ assert storage.method_calls == []
+
+
+def test_debian_origins_from_row__check_revisions():
+ """Tests debian_origins_from_row errors when the revision at the head
+ of a branch is a DSC and has no parents
+ """
+ storage = Mock()
+
+ origin_url = "deb://Debian/packages/kalgebra"
+
+ revision_row = {
+ "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc",},]},
+ }
+
+ visit = OriginVisit(
+ origin=origin_url,
+ date=datetime.datetime.now(tz=datetime.timezone.utc),
+ type="deb",
+ visit=280,
+ )
+
+ status = OriginVisitStatus(
+ origin=origin_url,
+ visit=280,
+ date=datetime.datetime.now(tz=datetime.timezone.utc),
+ status="full",
+ snapshot=b"42" * 10,
+ metadata=None,
+ )
+ snapshot = Snapshot(
+ id=b"42" * 10,
+ branches={
+ b"foo": SnapshotBranch(target_type=TargetType.REVISION, target=b"21" * 10)
+ },
+ )
+
+ revision = Revision(
+ id=b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+ message=b"foo",
+ author=Person.from_fullname(b"foo"),
+ committer=Person.from_fullname(b"foo"),
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1580076204, microseconds=0),
+ offset=60,
+ negative_utc=False,
+ ),
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1580076204, microseconds=0),
+ offset=60,
+ negative_utc=False,
+ ),
+ type=RevisionType.DSC,
+ directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb",
+ synthetic=True,
+ metadata=None,
+ parents=(b"parent " * 2,),
+ extra_headers=(),
+ )
+
+ storage.revision_get.return_value = [revision]
+
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ with pytest.raises(AssertionError, match="DSC revision with parents"):
+ debian_origins_from_row(revision_row, storage)
+
+ revision = attr.evolve(revision, type=RevisionType.GIT)
+ storage.revision_get.return_value = [revision]
+
+ with patch("iter_origin_visits", return_value=[visit]):
+ with patch("iter_origin_visit_statuses", return_value=[status]):
+ with patch("snapshot_get_all_branches", return_value=snapshot):
+ with pytest.raises(AssertionError, match="non-DSC revision"):
+ debian_origins_from_row(revision_row, storage)
+
+
def test_debian_with_extrinsic():
dest_original_artifacts = [
{
@@ -138,9 +429,14 @@
},
}
+ origin_url = "deb://Debian/packages/kalgebra"
+
storage = Mock()
+
deposit_cur = None
- handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ with patch("debian_origins_from_row", return_value=[origin_url]):
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
assert storage.method_calls == [
call.raw_extrinsic_metadata_add(
@@ -157,9 +453,10 @@
fetcher=FETCHER,
format="original-artifacts-json",
metadata=json.dumps(dest_original_artifacts).encode(),
+ origin=origin_url,
),
]
- )
+ ),
]
@@ -249,8 +546,12 @@
}
storage = Mock()
+
+ origin_url = "http://snapshot.debian.org/package/pymongo"
+
deposit_cur = None
- handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+ with patch("debian_origins_from_row", return_value=[origin_url]):
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
assert storage.method_calls == [
call.raw_extrinsic_metadata_add(
@@ -267,6 +568,7 @@
fetcher=FETCHER,
format="original-artifacts-json",
metadata=json.dumps(dest_original_artifacts).encode(),
+ origin=origin_url,
),
]
)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 5:06 PM (3 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227020
Attached To
D3905: migrate_extrinsic_metadata: guess Debian origins.
Event Timeline
Log In to Comment