def pypi_origin_from_project_name(project_name: str) -> str:
    """Return the PyPI origin URL for ``project_name``.

    The name is inserted verbatim: no case or dash/underscore normalisation
    is applied here.
    """
    return f"https://pypi.org/project/{project_name}/"


def _pypi_canonical_project_name(project_name: str) -> Optional[str]:
    """Ask the PyPI JSON API how it spells ``project_name``.

    Returns:
        The project name reported by PyPI, or ``None`` when PyPI answers
        404 (it could not match the guessed name to any project).

    Raises:
        urllib.error.HTTPError: for any HTTP error other than 404 (rate
            limiting, server errors, ...). These must not be mistaken for
            "no such project", otherwise metadata would be silently dropped.
        RuntimeError: if the request succeeds with a status other than 200.
    """
    # Local import: ``quote`` is not among the names this module imports
    # from urllib.parse at the top of the file.
    from urllib.parse import quote

    # The name is guessed from a user-provided filename, so it may contain
    # characters that are not valid in a URL path segment.
    quoted_name = quote(project_name, safe="")
    try:
        resp = urlopen(f"https://pypi.org/pypi/{quoted_name}/json/")
    except HTTPError as e:
        if e.code != 404:
            # Not a "project not found" answer; let the caller see it.
            raise
        return None
    if resp.code != 200:
        raise RuntimeError(
            f"unexpected HTTP status {resp.code} from PyPI "
            f"for project {project_name!r}"
        )

    payload = json.load(resp)
    # PyPI's JSON API nests project metadata under "info"; a flat payload
    # carrying "name" at the top level is accepted as well.
    info = payload.get("info", payload)
    return info["name"]


def pypi_origin_from_filename(storage, rev_id: bytes, filename: str) -> Optional[str]:
    """Guess the PyPI origin a revision belongs to from its artifact filename.

    Args:
        storage: storage object handed to ``_check_revision_in_origin``.
        rev_id: identifier of the revision to look for.
        filename: name of the archive the revision was loaded from.

    Returns:
        The origin URL if the revision was found in the guessed origin (or
        in the origin of the name as corrected by PyPI), ``None`` otherwise.

    Raises:
        urllib.error.HTTPError: if PyPI answers with an HTTP error other
            than 404.
        RuntimeError: if PyPI answers with an unexpected success status.
    """
    project_name = pypi_project_from_filename(filename)
    origin = pypi_origin_from_project_name(project_name)

    # The filename is user-provided and doesn't necessarily match the
    # package name on PyPI, so the guess has to be checked.
    if _check_revision_in_origin(storage, origin, rev_id):
        return origin

    # The guess was wrong; query PyPI with the guessed name. If only the
    # capitalisation and dashes/underscores are off (by far the most common
    # case), PyPI corrects them.
    canonical_name = _pypi_canonical_project_name(project_name)
    if canonical_name is None:
        # PyPI couldn't correct the wrong project name.
        return None

    canonical_origin = pypi_origin_from_project_name(canonical_name)
    if canonical_origin == origin:
        # PyPI agrees with the guess, which was already checked above:
        # the revision does not belong to this origin.
        return None

    if _check_revision_in_origin(storage, canonical_origin, rev_id):
        return canonical_origin

    # The origin exists, but the revision does not belong in it. This
    # happens sometimes, as the filename the origin was guessed from is
    # user-provided.
    return None
"ProjectName"}' + + mock_urlopen = mocker.patch( + "swh.storage.migrate_extrinsic_metadata.urlopen", return_value=response(), + ) + + assert ( + pypi_origin_from_filename(storage, revision_id, "ProjectName-1.0.0.tar.gz") + == origin_url + ) + mock_urlopen.assert_not_called() + assert ( + pypi_origin_from_filename(storage, revision_id, "projectname-1.0.0.tar.gz") + == origin_url + ) + mock_urlopen.assert_called_once_with("https://pypi.org/pypi/projectname/json/") + + def test_pypi_1(): """Tests loading a revision generated by a new PyPI loader that has a provider.""" @@ -256,10 +315,15 @@ ) -def test_pypi_2(): +def test_pypi_2(mocker): """Tests loading a revision generated by an old PyPI loader that does not have a provider, but has 'project' metadata.""" + mocker.patch( + "swh.storage.migrate_extrinsic_metadata.urlopen", + side_effect=urllib.error.HTTPError(None, 404, "Not Found", None, None), + ) + extrinsic_metadata = { "name": "jupyterhub-simx", "author": "Jupyter Development Team", @@ -371,10 +435,15 @@ ) -def test_pypi_3(): +def test_pypi_3(mocker): """Tests loading a revision generated by a very old PyPI loader that does not have a provider or has 'project' metadata.""" + mocker.patch( + "swh.storage.migrate_extrinsic_metadata.urlopen", + side_effect=urllib.error.HTTPError(None, 404, "Not Found", None, None), + ) + source_original_artifact = { "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz", "date": "2014-05-07T22:03:00",