diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
--- a/swh/storage/migrate_extrinsic_metadata.py
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -46,6 +46,8 @@
     Sha1Git,
 )
 from swh.storage import get_storage
+from swh.storage.algos.origin import iter_origin_visits, iter_origin_visit_statuses
+from swh.storage.algos.snapshot import snapshot_get_all_branches


 # XML namespaces and fields for metadata coming from the deposit:
@@ -115,7 +117,7 @@
 def pypi_project_from_filename(filename):
     match = re.match(
         r"^(?P<project_name>[a-zA-Z0-9_.-]+)"
-        r"-[0-9.]+([a-z]+[0-9]+)?(\.dev[0-9]+)?\.(tar\.gz|zip)$",
+        r"-[0-9.]+([a-z]+[0-9]+)?(\.dev[0-9]+)?(-[a-z][a-z0-9]*)?\.(tar\.gz|zip)$",
         filename,
     )
     assert match, filename
@@ -160,6 +162,67 @@
             del metadata[key]


+def _check_revision_in_origin(storage, origin, revision_id):
+    seen_snapshots = set()  # no need to visit them again
+    seen_revisions = set()
+
+    for visit in iter_origin_visits(storage, origin):
+        for status in iter_origin_visit_statuses(storage, origin, visit.visit):
+            if status.snapshot is None:
+                continue
+            if status.snapshot in seen_snapshots:
+                continue
+            seen_snapshots.add(status.snapshot)
+            snapshot = snapshot_get_all_branches(storage, status.snapshot)
+            for (branch_name, branch) in snapshot.branches.items():
+                if branch is None:
+                    continue
+
+                # If it's the revision passed as argument, then it is indeed in the
+                # origin
+                if branch.target == revision_id:
+                    return True
+
+                # Else, let's make sure the branch doesn't have any other revision
+
+                # Get the revision at the top of the branch.
+                if branch.target in seen_revisions:
+                    continue
+                seen_revisions.add(branch.target)
+                revision = storage.revision_get([branch.target])[0]
+
+                if revision is None:
+                    # https://forge.softwareheritage.org/T997
+                    continue
+
+                # Check it doesn't have parents (else we would have to
+                # recurse)
+                assert revision.parents == (), "revision with parents"
+
+    return False
+
+
+def debian_origins_from_row(row, storage):
+    """Guesses a Debian origin from a row. May return an empty list if it
+    cannot reliably guess it, but all results are guaranteed to be correct."""
+    filenames = [entry["filename"] for entry in row["metadata"]["original_artifact"]]
+    package_names = {filename.split("_")[0] for filename in filenames}
+    assert len(package_names) == 1, package_names
+    (package_name,) = package_names
+
+    candidate_origins = [
+        f"deb://Debian/packages/{package_name}",
+        f"deb://Debian-Security/packages/{package_name}",
+        f"http://snapshot.debian.org/package/{package_name}/",
+    ]
+
+    return [
+        origin
+        for origin in candidate_origins
+        if _check_revision_in_origin(storage, origin, row["id"])
+    ]
+
+
 # Cache of origins that are known to exist
 _origins = set()

@@ -378,11 +441,10 @@
         return

     if type_ == "dsc":
-        origin = None  # TODO: I can't find how to get it reliably
+        origin = None  # it will be defined later, using debian_origins_from_row

         # TODO: the debian loader writes the changelog date as the revision's
-        # author date and committer date. Instead, we should use the visit's date,
-        # but I cannot find a way to reliably get it without the origin
+        # author date and committer date. Instead, we should use the visit's date

         if "extrinsic" in metadata:
             extrinsic_files = metadata["extrinsic"]["raw"]["files"]
@@ -694,22 +756,15 @@

         assert len(metadata["original_artifact"]) == 1

-        # it's tempting here to do this:
-        #
-        # project_name = pypi_project_from_filename(
-        #     metadata["original_artifact"][0]["filename"]
-        # )
-        # origin = f"https://pypi.org/project/{project_name}/"
-        # assert_origin_exists(storage, origin)
-        #
-        # but unfortunately, the filename is user-provided, and doesn't
-        # necessarily match the package name on pypi.
-
-        # TODO: on second thoughts, I think we can use this as a heuristic,
-        # then double-check by listing visits and snapshots from the origin;
-        # it should work for most packages.
-
-        origin = None
+        project_name = pypi_project_from_filename(
+            metadata["original_artifact"][0]["filename"]
+        )
+        origin = f"https://pypi.org/project/{project_name}/"
+        # But unfortunately, the filename is user-provided, and doesn't
+        # necessarily match the package name on pypi. Therefore, we need
+        # to check it.
+        if not _check_revision_in_origin(storage, origin, row["id"]):
+            origin = None

     if "project" in metadata:
         # pypi loader format 2
@@ -805,16 +860,24 @@
         }
         assert set(original_artifact) <= allowed_keys, set(original_artifact)

-        load_metadata(
-            storage,
-            row["id"],
-            discovery_date,
-            metadata["original_artifact"],
-            ORIGINAL_ARTIFACT_FORMAT,
-            authority=AUTHORITIES["swh"],
-            origin=origin,
-            dry_run=dry_run,
-        )
+        if type_ == "dsc":
+            assert origin is None
+            origins = debian_origins_from_row(row, storage)
+            assert origins, row
+        else:
+            origins = [origin]
+
+        for origin in origins:
+            load_metadata(
+                storage,
+                row["id"],
+                discovery_date,
+                metadata["original_artifact"],
+                ORIGINAL_ARTIFACT_FORMAT,
+                authority=AUTHORITIES["swh"],
+                origin=origin,
+                dry_run=dry_run,
+            )

         del metadata["original_artifact"]

     assert metadata == {}, (
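
For context, here is a minimal usage sketch of the origin-guessing flow added above. It is illustrative only: the in-memory backend and the revision row (its id and filename) are placeholder assumptions, not values from the migration.

    from swh.model.model import Origin
    from swh.storage import get_storage
    from swh.storage.migrate_extrinsic_metadata import debian_origins_from_row

    # The migration runs against a real backend; "memory" keeps the sketch
    # self-contained.
    storage = get_storage("memory")
    storage.origin_add([Origin(url="deb://Debian/packages/kalgebra")])

    revision_row = {
        "id": b"\x00" * 20,  # sha1_git of the revision (placeholder)
        "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc"}]},
    }

    # Only candidate origins whose snapshots actually contain the revision are
    # returned, so with no matching snapshot this prints [].
    print(debian_origins_from_row(revision_row, storage))

The function trades recall for precision: an origin is reported only after _check_revision_in_origin has walked its visits, visit statuses, and snapshot branches and found the revision itself.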
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
@@ -9,7 +9,10 @@
 import copy
 import datetime
 import json
-from unittest.mock import call, Mock
+from unittest.mock import call, Mock, patch as _patch
+
+import attr
+import pytest

 from swh.model.identifiers import parse_swhid
 from swh.model.model import (
@@ -17,10 +20,23 @@
     MetadataAuthorityType,
     MetadataFetcher,
     MetadataTargetType,
+    Origin,
+    OriginVisit,
+    OriginVisitStatus,
+    Person,
     RawExtrinsicMetadata,
+    Revision,
+    RevisionType,
+    Snapshot,
+    SnapshotBranch,
+    TargetType,
+    Timestamp,
+    TimestampWithTimezone,
 )

-from swh.storage.migrate_extrinsic_metadata import handle_row
+from swh.storage import get_storage
+from swh.storage.interface import ListOrder, PagedResult
+from swh.storage.migrate_extrinsic_metadata import handle_row, debian_origins_from_row


 FETCHER = MetadataFetcher(
@@ -33,6 +49,270 @@
 )


+def now():
+    return datetime.datetime.now(tz=datetime.timezone.utc)
+
+
+def patch(function_name, *args, **kwargs):
+    # It's a long name, this function spares some line breaks in 'with' statements
+    return _patch(
+        "swh.storage.migrate_extrinsic_metadata." + function_name, *args, **kwargs
+    )
+
+
+def test_debian_origins_from_row():
+    """Tests debian_origins_from_row on a real example (with some parts
+    omitted, for conciseness)."""
+    origin_url = "deb://Debian/packages/kalgebra"
+
+    visit = OriginVisit(
+        origin=origin_url,
+        date=datetime.datetime(
+            2020, 1, 27, 19, 32, 3, 925498, tzinfo=datetime.timezone.utc,
+        ),
+        type="deb",
+        visit=280,
+    )
+
+    storage = get_storage("memory")
+
+    storage.origin_add(
+        [
+            Origin(url=origin_url),
+            Origin(url="http://snapshot.debian.org/package/kalgebra/"),
+        ]
+    )
+
+    storage.origin_visit_add([visit])
+
+    storage.origin_visit_status_add(
+        [
+            OriginVisitStatus(
+                origin=origin_url,
+                visit=280,
+                date=datetime.datetime(
+                    2020, 1, 27, 19, 32, 3, 925498, tzinfo=datetime.timezone.utc
+                ),
+                status="full",
+                snapshot=b"\xafD\x15\x98){\xd4$\xdeI\x1f\xbe\x95lh`x\x14\xce\xc4",
+                metadata=None,
+            )
+        ],
+    )
+
+    snapshot = Snapshot(
+        id=b"\xafD\x15\x98){\xd4$\xdeI\x1f\xbe\x95lh`x\x14\xce\xc4",
+        branches={
+            # ...
+            b"releases/unstable/main/4:19.12.1-1": SnapshotBranch(
+                target=b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+                target_type=TargetType.REVISION,
+            ),
+        },
+    )
+
+    revision_row = {
+        "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+        "metadata": {
+            # ...
+            "original_artifact": [
+                {
+                    "filename": "kalgebra_19.12.1-1.dsc",
+                    # ...
+                },
+            ]
+        },
+    }
+
+    storage.snapshot_add([snapshot])
+    assert debian_origins_from_row(revision_row, storage) == [origin_url]
+
+
+def test_debian_origins_from_row__no_result():
+    """Tests debian_origins_from_row when there's no origin, visit, status,
+    snapshot, branch, or matching branch.
+    """
+    storage = get_storage("memory")
+
+    origin_url = "deb://Debian/packages/kalgebra"
+    snapshot_id = b"42424242424242424242"
+    revision_id = b"21212121212121212121"
+
+    storage.origin_add([Origin(url=origin_url)])
+
+    revision_row = {
+        "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+        "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc",},]},
+    }
+
+    # no visit
+    assert debian_origins_from_row(revision_row, storage) == []
+
+    storage.origin_visit_add(
+        [OriginVisit(origin=origin_url, date=now(), type="deb", visit=280,)]
+    )
+
+    # no status
+    assert debian_origins_from_row(revision_row, storage) == []
+
+    status = OriginVisitStatus(
+        origin=origin_url,
+        visit=280,
+        date=now(),
+        status="full",
+        snapshot=None,
+        metadata=None,
+    )
+    storage.origin_visit_status_add([status])
+
+    # no snapshot
+    assert debian_origins_from_row(revision_row, storage) == []
+
+    status = attr.evolve(status, snapshot=snapshot_id, date=now())
+    storage.origin_visit_status_add([status])
+
+    storage_before_snapshot = copy.deepcopy(storage)
+
+    snapshot = Snapshot(id=snapshot_id, branches={})
+    storage.snapshot_add([snapshot])
+
+    # no branch
+    assert debian_origins_from_row(revision_row, storage) == []
+
+    # "remove" the snapshot, so we can add a new one with the same id
+    storage = copy.deepcopy(storage_before_snapshot)
+
+    snapshot = attr.evolve(snapshot, branches={b"foo": None,},)
+    storage.snapshot_add([snapshot])
+
+    # dangling branch
+    assert debian_origins_from_row(revision_row, storage) == []
+
+    # "remove" the snapshot again
+    storage = copy.deepcopy(storage_before_snapshot)
+
+    snapshot = attr.evolve(
+        snapshot,
+        branches={
+            b"foo": SnapshotBranch(target_type=TargetType.REVISION, target=revision_id,)
+        },
+    )
+    storage.snapshot_add([snapshot])
+
+    # branch points to unknown revision
+    assert debian_origins_from_row(revision_row, storage) == []
+
+    revision = Revision(
+        id=revision_id,
+        message=b"foo",
+        author=Person.from_fullname(b"foo"),
+        committer=Person.from_fullname(b"foo"),
+        date=TimestampWithTimezone(
+            timestamp=Timestamp(seconds=1580076204, microseconds=0),
+            offset=60,
+            negative_utc=False,
+        ),
+        committer_date=TimestampWithTimezone(
+            timestamp=Timestamp(seconds=1580076204, microseconds=0),
+            offset=60,
+            negative_utc=False,
+        ),
+        type=RevisionType.DSC,
+        directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb",
+        synthetic=True,
+        metadata=None,
+        parents=(),
+        extra_headers=(),
+    )
+
+    storage.revision_add([revision])
+
+    # no matching branch
+    assert debian_origins_from_row(revision_row, storage) == []
+
+
+def test_debian_origins_from_row__check_revisions():
+    """Tests that debian_origins_from_row errors when the revision at the head
+    of a branch is a DSC and has parents
+    """
+    storage = get_storage("memory")
+
+    origin_url = "deb://Debian/packages/kalgebra"
+    revision_id = b"21" * 10
+
+    storage.origin_add([Origin(url=origin_url)])
+
+    revision_row = {
+        "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+        "metadata": {"original_artifact": [{"filename": "kalgebra_19.12.1-1.dsc",},]},
+    }
+
+    storage.origin_visit_add(
+        [
+            OriginVisit(
+                origin=origin_url,
+                date=datetime.datetime.now(tz=datetime.timezone.utc),
+                type="deb",
+                visit=280,
+            )
+        ]
+    )
+
+    storage.origin_visit_status_add(
+        [
+            OriginVisitStatus(
+                origin=origin_url,
+                visit=280,
+                date=datetime.datetime.now(tz=datetime.timezone.utc),
+                status="full",
+                snapshot=b"42" * 10,
+                metadata=None,
+            )
+        ]
+    )
+    storage.snapshot_add(
+        [
+            Snapshot(
+                id=b"42" * 10,
+                branches={
+                    b"foo": SnapshotBranch(
+                        target_type=TargetType.REVISION, target=revision_id
+                    )
+                },
+            )
+        ]
+    )
+
+    storage_before_revision = copy.deepcopy(storage)
+
+    revision = Revision(
+        id=revision_id,
+        message=b"foo",
+        author=Person.from_fullname(b"foo"),
+        committer=Person.from_fullname(b"foo"),
+        date=TimestampWithTimezone(
+            timestamp=Timestamp(seconds=1580076204, microseconds=0),
+            offset=60,
+            negative_utc=False,
+        ),
+        committer_date=TimestampWithTimezone(
+            timestamp=Timestamp(seconds=1580076204, microseconds=0),
+            offset=60,
+            negative_utc=False,
+        ),
+        type=RevisionType.DSC,
+        directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb",
+        synthetic=True,
+        metadata=None,
+        parents=(b"parent " * 2,),
+        extra_headers=(),
+    )
+    storage.revision_add([revision])
+
+    with pytest.raises(AssertionError, match="revision with parents"):
+        debian_origins_from_row(revision_row, storage)
+
+
 def test_debian_with_extrinsic():
     dest_original_artifacts = [
         {
@@ -138,9 +418,14 @@
         },
     }

+    origin_url = "deb://Debian/packages/kalgebra"
+
     storage = Mock()
+
     deposit_cur = None
-    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+    with patch("debian_origins_from_row", return_value=[origin_url]):
+        handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

     assert storage.method_calls == [
         call.raw_extrinsic_metadata_add(
@@ -157,9 +442,10 @@
                     fetcher=FETCHER,
                     format="original-artifacts-json",
                     metadata=json.dumps(dest_original_artifacts).encode(),
+                    origin=origin_url,
                 ),
             ]
-        )
+        ),
     ]
@@ -249,8 +535,12 @@
     }

     storage = Mock()
+
+    origin_url = "http://snapshot.debian.org/package/pymongo"
+
     deposit_cur = None
-    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+    with patch("debian_origins_from_row", return_value=[origin_url]):
+        handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

     assert storage.method_calls == [
         call.raw_extrinsic_metadata_add(
@@ -267,6 +557,7 @@
                     fetcher=FETCHER,
                     format="original-artifacts-json",
                     metadata=json.dumps(dest_original_artifacts).encode(),
+                    origin=origin_url,
                 ),
             ]
         )
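
The tests above assemble the same origin → visit → visit status → snapshot chain by hand each time. As a reading aid, here is that fixture pattern factored into one hypothetical helper; the helper name, visit number, and default visit type are assumptions, not part of the patch:

    import datetime

    from swh.model.model import Origin, OriginVisit, OriginVisitStatus


    def add_visited_snapshot(storage, origin_url, snapshot, visit_type="deb"):
        # Hypothetical helper: registers an origin with a single "full" visit
        # whose status points at the given snapshot, mirroring the setup used
        # by the tests above.
        date = datetime.datetime.now(tz=datetime.timezone.utc)
        storage.origin_add([Origin(url=origin_url)])
        storage.origin_visit_add(
            [OriginVisit(origin=origin_url, date=date, type=visit_type, visit=1)]
        )
        storage.origin_visit_status_add(
            [
                OriginVisitStatus(
                    origin=origin_url,
                    visit=1,
                    date=date,
                    status="full",
                    snapshot=snapshot.id,
                    metadata=None,
                )
            ]
        )
        storage.snapshot_add([snapshot])

A Snapshot whose branch targets the revision under test can then be passed in, and each test reduces to a single assertion on debian_origins_from_row.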
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
@@ -9,7 +9,8 @@
 import copy
 import datetime
 import json
-from unittest.mock import call, Mock
+
+import attr

 from swh.model.identifiers import parse_swhid
 from swh.model.model import (
@@ -18,9 +19,16 @@
     MetadataFetcher,
     MetadataTargetType,
     Origin,
+    OriginVisit,
+    OriginVisitStatus,
     RawExtrinsicMetadata,
+    Snapshot,
+    SnapshotBranch,
+    TargetType,
 )

+from swh.storage import get_storage
+from swh.storage.interface import PagedResult
 from swh.storage.migrate_extrinsic_metadata import (
     handle_row,
     pypi_project_from_filename,
@@ -31,15 +39,17 @@
     name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
 )
 PYPI_AUTHORITY = MetadataAuthority(
-    type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={},
+    type=MetadataAuthorityType.FORGE, url="https://pypi.org/",
 )
 SWH_AUTHORITY = MetadataAuthority(
-    type=MetadataAuthorityType.REGISTRY,
-    url="https://softwareheritage.org/",
-    metadata={},
+    type=MetadataAuthorityType.REGISTRY, url="https://softwareheritage.org/",
 )


+def now():
+    return datetime.datetime.now(tz=datetime.timezone.utc)
+
+
 def test_pypi_project_from_filename():
     files = [
         ("django-agent-trust-0.1.8.tar.gz", "django-agent-trust"),
@@ -47,6 +57,8 @@
         ("py-evm-0.2.0a9.tar.gz", "py-evm"),
         ("collective.texttospeech-1.0rc1.tar.gz", "collective.texttospeech"),
         ("flatland-fork-0.4.post1.dev40550160.zip", "flatland-fork"),
+        ("fake-factory-0.5.6-proper.tar.gz", "fake-factory"),
+        ("ariane_procos-0.1.2-b05.tar.gz", "ariane_procos"),
     ]

     for (filename, project) in files:
@@ -119,55 +131,58 @@

     origin_url = "https://pypi.org/project/m3-ui/"

-    storage = Mock()
-
-    def origin_get(urls):
-        assert urls == [origin_url]
-        return [Origin(url=origin_url)]
+    storage = get_storage("memory")
+    storage.origin_add([Origin(url=origin_url)])
+    storage.metadata_authority_add(
+        [
+            attr.evolve(PYPI_AUTHORITY, metadata={}),
+            attr.evolve(SWH_AUTHORITY, metadata={}),
+        ]
+    )
+    storage.metadata_fetcher_add([FETCHER])

-    storage.origin_get.side_effect = origin_get
     deposit_cur = None
     handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

-    assert storage.method_calls == [
-        call.origin_get([origin_url]),
-        call.raw_extrinsic_metadata_add(
-            [
-                RawExtrinsicMetadata(
-                    type=MetadataTargetType.REVISION,
-                    id=parse_swhid(
-                        "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517"
-                    ),
-                    discovery_date=datetime.datetime(
-                        2020, 1, 23, 18, 43, 9, 109407, tzinfo=datetime.timezone.utc,
-                    ),
-                    authority=PYPI_AUTHORITY,
-                    fetcher=FETCHER,
-                    format="pypi-project-json",
-                    metadata=json.dumps(extrinsic_metadata).encode(),
-                    origin=origin_url,
+    revision_swhid = parse_swhid("swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517")
+    assert storage.raw_extrinsic_metadata_get(
+        MetadataTargetType.REVISION, revision_swhid, authority=PYPI_AUTHORITY,
+    ) == PagedResult(
+        results=[
+            RawExtrinsicMetadata(
+                type=MetadataTargetType.REVISION,
+                id=revision_swhid,
+                discovery_date=datetime.datetime(
+                    2020, 1, 23, 18, 43, 9, 109407, tzinfo=datetime.timezone.utc,
                 ),
-            ]
-        ),
-        call.raw_extrinsic_metadata_add(
-            [
-                RawExtrinsicMetadata(
-                    type=MetadataTargetType.REVISION,
-                    id=parse_swhid(
-                        "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517"
-                    ),
-                    discovery_date=datetime.datetime(
-                        2020, 1, 23, 18, 43, 9, 109407, tzinfo=datetime.timezone.utc,
-                    ),
-                    authority=SWH_AUTHORITY,
-                    fetcher=FETCHER,
-                    format="original-artifacts-json",
-                    metadata=json.dumps(original_artifacts).encode(),
-                    origin=origin_url,
+                authority=PYPI_AUTHORITY,
+                fetcher=FETCHER,
+                format="pypi-project-json",
+                metadata=json.dumps(extrinsic_metadata).encode(),
+                origin=origin_url,
+            ),
+        ],
+        next_page_token=None,
+    )
+    assert storage.raw_extrinsic_metadata_get(
+        MetadataTargetType.REVISION, revision_swhid, authority=SWH_AUTHORITY,
+    ) == PagedResult(
+        results=[
+            RawExtrinsicMetadata(
+                type=MetadataTargetType.REVISION,
+                id=revision_swhid,
+                discovery_date=datetime.datetime(
+                    2020, 1, 23, 18, 43, 9, 109407, tzinfo=datetime.timezone.utc,
                 ),
-            ]
-        ),
-    ]
+                authority=SWH_AUTHORITY,
+                fetcher=FETCHER,
+                format="original-artifacts-json",
+                metadata=json.dumps(original_artifacts).encode(),
+                origin=origin_url,
+            ),
+        ],
+        next_page_token=None,
+    )


 def test_pypi_2():
@@ -230,54 +245,59 @@

     origin_url = "https://pypi.org/project/jupyterhub-simx/"

-    storage = Mock()
-
-    def origin_get(urls):
-        assert urls == [origin_url]
-        return [Origin(url=origin_url)]
+    storage = get_storage("memory")

-    storage.origin_get.side_effect = origin_get
+    storage.origin_add([Origin(url=origin_url)])
+    storage.metadata_authority_add(
+        [
+            attr.evolve(PYPI_AUTHORITY, metadata={}),
+            attr.evolve(SWH_AUTHORITY, metadata={}),
+        ]
+    )
+    storage.metadata_fetcher_add([FETCHER])
     deposit_cur = None
+
     handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

-    assert storage.method_calls == [
-        call.raw_extrinsic_metadata_add(
-            [
-                RawExtrinsicMetadata(
-                    type=MetadataTargetType.REVISION,
-                    id=parse_swhid(
-                        "swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca"
-                    ),
-                    discovery_date=datetime.datetime(
-                        2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc,
-                    ),
-                    authority=PYPI_AUTHORITY,
-                    fetcher=FETCHER,
-                    format="pypi-project-json",
-                    metadata=json.dumps(extrinsic_metadata).encode(),
-                    origin=None,
+    revision_swhid = parse_swhid("swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca")
+    assert storage.raw_extrinsic_metadata_get(
+        MetadataTargetType.REVISION, revision_swhid, authority=PYPI_AUTHORITY,
+    ) == PagedResult(
+        results=[
+            RawExtrinsicMetadata(
+                type=MetadataTargetType.REVISION,
+                id=revision_swhid,
+                discovery_date=datetime.datetime(
+                    2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc,
                 ),
-            ]
-        ),
-        call.raw_extrinsic_metadata_add(
-            [
-                RawExtrinsicMetadata(
-                    type=MetadataTargetType.REVISION,
-                    id=parse_swhid(
-                        "swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca"
-                    ),
-                    discovery_date=datetime.datetime(
-                        2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc,
-                    ),
-                    authority=SWH_AUTHORITY,
-                    fetcher=FETCHER,
-                    format="original-artifacts-json",
-                    metadata=json.dumps(dest_original_artifacts).encode(),
-                    origin=None,
+                authority=PYPI_AUTHORITY,
+                fetcher=FETCHER,
+                format="pypi-project-json",
+                metadata=json.dumps(extrinsic_metadata).encode(),
+                origin=None,
+            ),
+        ],
+        next_page_token=None,
+    )
+    assert storage.raw_extrinsic_metadata_get(
+        MetadataTargetType.REVISION, revision_swhid, authority=SWH_AUTHORITY,
+    ) == PagedResult(
+        results=[
+            RawExtrinsicMetadata(
+                type=MetadataTargetType.REVISION,
+                id=revision_swhid,
+                discovery_date=datetime.datetime(
+                    2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc,
                 ),
-            ]
-        ),
-    ]
+                authority=SWH_AUTHORITY,
+                fetcher=FETCHER,
+                format="original-artifacts-json",
+                metadata=json.dumps(dest_original_artifacts).encode(),
+                origin=None,
+            ),
+        ],
+        next_page_token=None,
+    )


 def test_pypi_3():
@@ -324,33 +344,151 @@

     origin_url = "https://pypi.org/project/PyPDFLite/"

-    storage = Mock()
+    storage = get_storage("memory")

-    def origin_get(urls):
-        assert urls == [origin_url]
-        return [Origin(url=origin_url)]
-
-    storage.origin_get.side_effect = origin_get
+    storage.origin_add([Origin(url=origin_url)])
+    storage.metadata_authority_add(
+        [
+            attr.evolve(PYPI_AUTHORITY, metadata={}),
+            attr.evolve(SWH_AUTHORITY, metadata={}),
+        ]
+    )
+    storage.metadata_fetcher_add([FETCHER])

     deposit_cur = None
     handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

-    assert storage.method_calls == [
-        call.raw_extrinsic_metadata_add(
-            [
-                RawExtrinsicMetadata(
-                    type=MetadataTargetType.REVISION,
-                    id=parse_swhid(
-                        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2"
-                    ),
-                    discovery_date=datetime.datetime(
-                        2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc,
-                    ),
-                    authority=SWH_AUTHORITY,
-                    fetcher=FETCHER,
-                    format="original-artifacts-json",
-                    metadata=json.dumps(dest_original_artifacts).encode(),
-                    origin=None,
+    revision_swhid = parse_swhid("swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2")
+
+    assert storage.raw_extrinsic_metadata_get(
+        MetadataTargetType.REVISION, revision_swhid, authority=PYPI_AUTHORITY,
+    ) == PagedResult(results=[], next_page_token=None,)
+    assert storage.raw_extrinsic_metadata_get(
+        MetadataTargetType.REVISION, revision_swhid, authority=SWH_AUTHORITY,
+    ) == PagedResult(
+        results=[
+            RawExtrinsicMetadata(
+                type=MetadataTargetType.REVISION,
+                id=revision_swhid,
+                discovery_date=datetime.datetime(
+                    2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc,
                 ),
-            ]
-        ),
-    ]
+                authority=SWH_AUTHORITY,
+                fetcher=FETCHER,
+                format="original-artifacts-json",
+                metadata=json.dumps(dest_original_artifacts).encode(),
+                origin=None,
+            ),
+        ],
+        next_page_token=None,
+    )
+
+
+def test_pypi_good_origin():
+    """Tests loading a revision generated by a very old PyPI loader that
+    does not have a provider or 'project' metadata."""
+
+    source_original_artifact = {
+        "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
+        "date": "2014-05-07T22:03:00",
+        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
+        "size": 46644,
+        "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
+        "filename": "PyPDFLite-0.1.32.tar.gz",
+        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
+        "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
+        "archive_type": "tar",
+    }
+
+    dest_original_artifacts = [
+        {
+            "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
+            "filename": "PyPDFLite-0.1.32.tar.gz",
+            "archive_type": "tar",
+            "length": 46644,
+            "checksums": {
+                "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
+                "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
+                "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
+                "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
+            },
+        }
+    ]
+
+    revision_id = b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2"
+    row = {
+        "id": revision_id,
+        "date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
"committer_date": datetime.datetime( + 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"0.1.32", + "metadata": {"original_artifact": source_original_artifact}, + } + + origin_url = "https://pypi.org/project/PyPDFLite/" + + storage = get_storage("memory") + + snapshot_id = b"42" * 10 + storage.origin_add([Origin(url=origin_url)]) + storage.origin_visit_add( + [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")] + ) + storage.origin_visit_status_add( + [ + OriginVisitStatus( + origin=origin_url, + visit=1, + date=now(), + status="partial", + snapshot=snapshot_id, + ) + ] + ) + storage.snapshot_add( + [ + Snapshot( + id=snapshot_id, + branches={ + b"foo": SnapshotBranch( + target_type=TargetType.REVISION, target=revision_id, + ) + }, + ) + ] + ) + storage.metadata_authority_add( + [ + attr.evolve(PYPI_AUTHORITY, metadata={}), + attr.evolve(SWH_AUTHORITY, metadata={}), + ] + ) + storage.metadata_fetcher_add([FETCHER]) + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + revision_swhid = parse_swhid("swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2") + + assert storage.raw_extrinsic_metadata_get( + MetadataTargetType.REVISION, revision_swhid, authority=PYPI_AUTHORITY, + ) == PagedResult(results=[], next_page_token=None,) + assert storage.raw_extrinsic_metadata_get( + MetadataTargetType.REVISION, revision_swhid, authority=SWH_AUTHORITY, + ) == PagedResult( + results=[ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=revision_swhid, + discovery_date=datetime.datetime( + 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(dest_original_artifacts).encode(), + origin=origin_url, + ), + ], + next_page_token=None, + )