diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py --- a/swh/storage/migrate_extrinsic_metadata.py +++ b/swh/storage/migrate_extrinsic_metadata.py @@ -43,9 +43,12 @@ MetadataFetcher, MetadataTargetType, RawExtrinsicMetadata, + RevisionType, Sha1Git, ) from swh.storage import get_storage +from swh.storage.algos.origin import iter_origin_visits, iter_origin_visit_statuses +from swh.storage.algos.snapshot import snapshot_get_all_branches # XML namespaces and fields for metadata coming from the deposit: @@ -160,6 +163,65 @@ del metadata[key] +def _check_revision_in_origin(storage, origin, revision_id): + seen_snapshots = set() # no need to visit them again + seen_revisions = set() + + for visit in iter_origin_visits(storage, origin): + for status in iter_origin_visit_statuses(storage, origin, visit.visit): + if status.snapshot is None: + continue + if status.snapshot in seen_snapshots: + continue + seen_snapshots.add(status.snapshot) + snapshot = snapshot_get_all_branches(storage, status.snapshot) + for (branch_name, branch) in snapshot.branches.items(): + # If it's the revision passed as argument, then it is indeed in the + # origin + if branch.target == revision_id: + return True + + # Else, let's make sure the branch doesn't have any other revision + + # Get the revision at the top of the branch. + if branch.target in seen_revisions: + continue + seen_revisions.add(branch.target) + revision = storage.revision_get([branch.target])[0] + + # Check it's DSC (we only support those for now) + assert ( + revision.type == RevisionType.DSC + ), "non-DSC revisions are not supported" + + # Check it doesn't have parents (else we would have to + # recurse) + assert revision.parents == (), "DSC revision with parents" + + return False + + +def debian_origins_from_row(row, storage): + """Guesses a Debian origin from a row. May return an empty list if it + cannot reliably guess it, but all results are guaranteed to be correct.""" + filenames = [entry["filename"] for entry in row["metadata"]["original_artifact"]] + package_names = {filename.split("_")[0] for filename in filenames} + assert len(package_names) == 1, package_names + (package_name,) = package_names + + candidate_origins = [ + f"deb://Debian/packages/{package_name}", + f"deb://Debian-Security/packages/{package_name}", + f"http://snapshot.debian.org/package/{package_name}/", + ] + + return [ + origin + for origin in candidate_origins + if _check_revision_in_origin(storage, origin, row["id"]) + ] + + # Cache of origins that are known to exist _origins = set() @@ -378,11 +440,10 @@ return if type_ == "dsc": - origin = None # TODO: I can't find how to get it reliably + origin = None # it will be defined later, using debian_origins_from_row # TODO: the debian loader writes the changelog date as the revision's - # author date and committer date. Instead, we should use the visit's date, - # but I cannot find a way to reliably get it without the origin + # author date and committer date. Instead, we should use the visit's date if "extrinsic" in metadata: extrinsic_files = metadata["extrinsic"]["raw"]["files"] @@ -805,16 +866,24 @@ } assert set(original_artifact) <= allowed_keys, set(original_artifact) - load_metadata( - storage, - row["id"], - discovery_date, - metadata["original_artifact"], - ORIGINAL_ARTIFACT_FORMAT, - authority=AUTHORITIES["swh"], - origin=origin, - dry_run=dry_run, - ) + if type_ == "dsc": + assert origin is None + origins = debian_origins_from_row(row, storage) + assert origins, row + else: + origins = [origin] + + for origin in origins: + load_metadata( + storage, + row["id"], + discovery_date, + metadata["original_artifact"], + ORIGINAL_ARTIFACT_FORMAT, + authority=AUTHORITIES["swh"], + origin=origin, + dry_run=dry_run, + ) del metadata["original_artifact"] assert metadata == {}, ( diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py --- a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py @@ -9,7 +9,10 @@ import copy import datetime import json -from unittest.mock import call, Mock +from unittest.mock import call, Mock, patch as _patch + +import attr +import pytest from swh.model.identifiers import parse_swhid from swh.model.model import ( @@ -17,10 +20,21 @@ MetadataAuthorityType, MetadataFetcher, MetadataTargetType, + OriginVisit, + OriginVisitStatus, + Person, RawExtrinsicMetadata, + Revision, + RevisionType, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, ) -from swh.storage.migrate_extrinsic_metadata import handle_row +from swh.storage.interface import ListOrder, PagedResult +from swh.storage.migrate_extrinsic_metadata import handle_row, debian_origins_from_row FETCHER = MetadataFetcher( @@ -33,6 +47,283 @@ ) +def patch(function_name, *args, **kwargs): + # It's a long name, this function spares some line breaks in 'with' statements + return _patch( + "swh.storage.migrate_extrinsic_metadata." + function_name, *args, **kwargs + ) + + +def test_debian_origins_from_row(): + """Tests debian_origins_from_row on a real example (with some parts + omitted, for conciseness).""" + origin_url = "deb://Debian/packages/kalgebra" + + visit = OriginVisit( + origin=origin_url, + date=datetime.datetime( + 2020, 1, 27, 19, 32, 3, 925498, tzinfo=datetime.timezone.utc, + ), + type="deb", + visit=280, + ) + + def mock_origin_visit_get(origin, page_token, order): + if origin in ( + "deb://Debian-Security/packages/kalgebra", + "http://snapshot.debian.org/package/kalgebra/", + ): + return PagedResult(results=[], next_page_token=None) + elif origin == "deb://Debian/packages/kalgebra": + if page_token == None: + return PagedResult( + # ... + results=[visit,], + next_page_token="280", + ) + elif page_token == "280": + return PagedResult(results=[], next_page_token=None) + else: + assert False, page_token + else: + assert False, origin + + storage = Mock() + + storage.origin_visit_get.side_effect = mock_origin_visit_get + + storage.origin_visit_status_get.return_value = PagedResult( + results=[ + OriginVisitStatus( + origin=origin_url, + visit=280, + date=datetime.datetime( + 2020, 1, 27, 19, 32, 3, 925498, tzinfo=datetime.timezone.utc + ), + status="full", + snapshot=b"\xafD\x15\x98){\xd4$\xdeI\x1f\xbe\x95lh`x\x14\xce\xc4", + metadata=None, + ) + ], + next_page_token=None, + ) + + snapshot = Snapshot( + id=b"\xafD\x15\x98){\xd4$\xdeI\x1f\xbe\x95lh`x\x14\xce\xc4", + branches={ + # ... + b"releases/unstable/main/4:19.12.1-1": SnapshotBranch( + target=b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee", + target_type=TargetType.REVISION, + ), + }, + ) + + revision_row = { + "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee", + "metadata": { + # ... + "original_artifact": [ + { + "filename": "kalgebra_19.12.1-1.dsc", + # ... + }, + ] + }, + } + + with patch("snapshot_get_all_branches", return_value=snapshot): + assert debian_origins_from_row(revision_row, storage) == [origin_url] + + assert storage.method_calls == [ + call.origin_visit_get( + "deb://Debian/packages/kalgebra", order=ListOrder.ASC, page_token=None + ), + call.origin_visit_status_get( + "deb://Debian/packages/kalgebra", 280, order=ListOrder.ASC, page_token=None + ), + call.origin_visit_get( + "deb://Debian-Security/packages/kalgebra", + order=ListOrder.ASC, + page_token=None, + ), + call.origin_visit_get( + "http://snapshot.debian.org/package/kalgebra/", + order=ListOrder.ASC, + page_token=None, + ), + ] + + +def test_debian_origins_from_row__no_result(): + """Tests debian_origins_from_row when there's no origin, visit, status, + snapshot, branch, or matching branch. + """ + storage = Mock() + + origin_url = "deb://Debian/packages/kalgebra" + + revision_row = { + "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee", + "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc",},]}, + } + + # no visit + with patch("iter_origin_visits", return_value=[]): + assert debian_origins_from_row(revision_row, storage) == [] + + assert storage.method_calls == [] + + visit = OriginVisit( + origin=origin_url, + date=datetime.datetime.now(tz=datetime.timezone.utc), + type="deb", + visit=280, + ) + + # no status + with patch("iter_origin_visits", return_value=[visit]): + with patch("iter_origin_visit_statuses", return_value=[]): + assert debian_origins_from_row(revision_row, storage) == [] + + assert storage.method_calls == [] + + status = OriginVisitStatus( + origin=origin_url, + visit=280, + date=datetime.datetime.now(tz=datetime.timezone.utc), + status="full", + snapshot=None, + metadata=None, + ) + + # no snapshot + with patch("iter_origin_visits", return_value=[visit]): + with patch("iter_origin_visit_statuses", return_value=[status]): + assert debian_origins_from_row(revision_row, storage) == [] + + assert storage.method_calls == [] + + status = attr.evolve(status, snapshot=b"42" * 10) + + snapshot = Snapshot(id=b"42" * 10, branches={},) + + # no branch + with patch("iter_origin_visits", return_value=[visit]): + with patch("iter_origin_visit_statuses", return_value=[status]): + with patch("snapshot_get_all_branches", return_value=snapshot): + assert debian_origins_from_row(revision_row, storage) == [] + + revision = Revision( + id=b"21" * 10, + message=b"foo", + author=Person.from_fullname(b"foo"), + committer=Person.from_fullname(b"foo"), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1580076204, microseconds=0), + offset=60, + negative_utc=False, + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1580076204, microseconds=0), + offset=60, + negative_utc=False, + ), + type=RevisionType.DSC, + directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb", + synthetic=True, + metadata=None, + parents=(), + extra_headers=(), + ) + + storage.revision_get.return_value = [revision] + + # no matching branch + with patch("iter_origin_visits", return_value=[visit]): + with patch("iter_origin_visit_statuses", return_value=[status]): + with patch("snapshot_get_all_branches", return_value=snapshot): + assert debian_origins_from_row(revision_row, storage) == [] + + assert storage.method_calls == [] + + +def test_debian_origins_from_row__check_revisions(): + """Tests debian_origins_from_row errors when the revision at the head + of a branch is a DSC and has no parents + """ + storage = Mock() + + origin_url = "deb://Debian/packages/kalgebra" + + revision_row = { + "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee", + "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc",},]}, + } + + visit = OriginVisit( + origin=origin_url, + date=datetime.datetime.now(tz=datetime.timezone.utc), + type="deb", + visit=280, + ) + + status = OriginVisitStatus( + origin=origin_url, + visit=280, + date=datetime.datetime.now(tz=datetime.timezone.utc), + status="full", + snapshot=b"42" * 10, + metadata=None, + ) + snapshot = Snapshot( + id=b"42" * 10, + branches={ + b"foo": SnapshotBranch(target_type=TargetType.REVISION, target=b"21" * 10) + }, + ) + + revision = Revision( + id=b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee", + message=b"foo", + author=Person.from_fullname(b"foo"), + committer=Person.from_fullname(b"foo"), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1580076204, microseconds=0), + offset=60, + negative_utc=False, + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1580076204, microseconds=0), + offset=60, + negative_utc=False, + ), + type=RevisionType.DSC, + directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb", + synthetic=True, + metadata=None, + parents=(b"parent " * 2,), + extra_headers=(), + ) + + storage.revision_get.return_value = [revision] + + with patch("iter_origin_visits", return_value=[visit]): + with patch("iter_origin_visit_statuses", return_value=[status]): + with patch("snapshot_get_all_branches", return_value=snapshot): + with pytest.raises(AssertionError, match="DSC revision with parents"): + debian_origins_from_row(revision_row, storage) + + revision = attr.evolve(revision, type=RevisionType.GIT) + storage.revision_get.return_value = [revision] + + with patch("iter_origin_visits", return_value=[visit]): + with patch("iter_origin_visit_statuses", return_value=[status]): + with patch("snapshot_get_all_branches", return_value=snapshot): + with pytest.raises(AssertionError, match="non-DSC revision"): + debian_origins_from_row(revision_row, storage) + + def test_debian_with_extrinsic(): dest_original_artifacts = [ { @@ -138,9 +429,14 @@ }, } + origin_url = "deb://Debian/packages/kalgebra" + storage = Mock() + deposit_cur = None - handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + with patch("debian_origins_from_row", return_value=[origin_url]): + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) assert storage.method_calls == [ call.raw_extrinsic_metadata_add( @@ -157,9 +453,10 @@ fetcher=FETCHER, format="original-artifacts-json", metadata=json.dumps(dest_original_artifacts).encode(), + origin=origin_url, ), ] - ) + ), ] @@ -249,8 +546,12 @@ } storage = Mock() + + origin_url = "http://snapshot.debian.org/package/pymongo" + deposit_cur = None - handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + with patch("debian_origins_from_row", return_value=[origin_url]): + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) assert storage.method_calls == [ call.raw_extrinsic_metadata_add( @@ -267,6 +568,7 @@ fetcher=FETCHER, format="original-artifacts-json", metadata=json.dumps(dest_original_artifacts).encode(), + origin=origin_url, ), ] )