diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py --- a/swh/storage/migrate_extrinsic_metadata.py +++ b/swh/storage/migrate_extrinsic_metadata.py @@ -43,7 +43,6 @@ MetadataFetcher, MetadataTargetType, RawExtrinsicMetadata, - RevisionType, Sha1Git, ) from swh.storage import get_storage @@ -196,14 +195,9 @@ # https://forge.softwareheritage.org/T997 continue - # Check it's DSC (we only support those for now) - assert ( - revision.type == RevisionType.DSC - ), "non-DSC revisions are not supported" - # Check it doesn't have parents (else we would have to # recurse) - assert revision.parents == (), "DSC revision with parents" + assert revision.parents == (), "revision with parents" return False @@ -762,22 +756,15 @@ assert len(metadata["original_artifact"]) == 1 - # it's tempting here to do this: - # - # project_name = pypi_project_from_filename( - # metadata["original_artifact"][0]["filename"] - # ) - # origin = f"https://pypi.org/project/{project_name}/" - # assert_origin_exists(storage, origin) - # - # but unfortunately, the filename is user-provided, and doesn't - # necessarily match the package name on pypi. - - # TODO: on second thoughts, I think we can use this as a heuristic, - # then double-check by listing visits and snapshots from the origin; - # it should work for most packages. - - origin = None + project_name = pypi_project_from_filename( + metadata["original_artifact"][0]["filename"] + ) + origin = f"https://pypi.org/project/{project_name}/" + # But unfortunately, the filename is user-provided, and doesn't + # necessarily match the package name on pypi. Therefore, we need + # to check it. 
+ if not _check_revision_in_origin(storage, origin, row["id"]): + origin = None if "project" in metadata: # pypi loader format 2 diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py --- a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py @@ -309,14 +309,7 @@ ) storage.revision_add([revision]) - with pytest.raises(AssertionError, match="DSC revision with parents"): - debian_origins_from_row(revision_row, storage) - - storage = copy.deepcopy(storage_before_revision) - revision = attr.evolve(revision, type=RevisionType.GIT) - storage.revision_add([revision]) - - with pytest.raises(AssertionError, match="non-DSC revision"): + with pytest.raises(AssertionError, match="revision with parents"): debian_origins_from_row(revision_row, storage) diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py --- a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py @@ -19,7 +19,12 @@ MetadataFetcher, MetadataTargetType, Origin, + OriginVisit, + OriginVisitStatus, RawExtrinsicMetadata, + Snapshot, + SnapshotBranch, + TargetType, ) from swh.storage import get_storage @@ -41,6 +46,10 @@ ) +def now(): + return datetime.datetime.now(tz=datetime.timezone.utc) + + def test_pypi_project_from_filename(): files = [ ("django-agent-trust-0.1.8.tar.gz", "django-agent-trust"), @@ -370,3 +379,114 @@ ], next_page_token=None, ) + + +def test_pypi_good_origin(): + """Tests loading a revision generated by a very old PyPI loader that + does not have a provider or has 'project' metadata.""" + + source_original_artifact = { + "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz", + "date": "2014-05-07T22:03:00", + "sha1": 
"3289269f75b4111dd00eaea53e00330db9a1db12", + "size": 46644, + "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824", + "filename": "PyPDFLite-0.1.32.tar.gz", + "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5", + "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385", + "archive_type": "tar", + } + + dest_original_artifacts = [ + { + "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz", + "filename": "PyPDFLite-0.1.32.tar.gz", + "archive_type": "tar", + "length": 46644, + "checksums": { + "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12", + "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824", + "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5", + "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385", + }, + } + ] + + revision_id = b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2" + row = { + "id": revision_id, + "date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc), + "committer_date": datetime.datetime( + 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"0.1.32", + "metadata": {"original_artifact": source_original_artifact}, + } + + origin_url = "https://pypi.org/project/PyPDFLite/" + + storage = get_storage("memory") + + snapshot_id = b"42" * 10 + storage.origin_add([Origin(url=origin_url)]) + storage.origin_visit_add( + [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")] + ) + storage.origin_visit_status_add( + [ + OriginVisitStatus( + origin=origin_url, + visit=1, + date=now(), + status="partial", + snapshot=snapshot_id, + ) + ] + ) + storage.snapshot_add( + [ + Snapshot( + id=snapshot_id, + branches={ + b"foo": SnapshotBranch( + target_type=TargetType.REVISION, target=revision_id, + ) + }, + ) + ] + ) + storage.metadata_authority_add( + [ + attr.evolve(PYPI_AUTHORITY, metadata={}), + 
attr.evolve(SWH_AUTHORITY, metadata={}), + ] + ) + storage.metadata_fetcher_add([FETCHER]) + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + revision_swhid = parse_swhid("swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2") + + assert storage.raw_extrinsic_metadata_get( + MetadataTargetType.REVISION, revision_swhid, authority=PYPI_AUTHORITY, + ) == PagedResult(results=[], next_page_token=None,) + assert storage.raw_extrinsic_metadata_get( + MetadataTargetType.REVISION, revision_swhid, authority=SWH_AUTHORITY, + ) == PagedResult( + results=[ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=revision_swhid, + discovery_date=datetime.datetime( + 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(dest_original_artifacts).encode(), + origin=origin_url, + ), + ], + next_page_token=None, + )