diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
--- a/swh/storage/migrate_extrinsic_metadata.py
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -115,10 +115,43 @@
 
 # not reliable, because PyPI allows arbitrary names
 def pypi_project_from_filename(filename):
+    # Hard-coded special cases, checked before the generic patterns below.
+    if filename == "mongomotor-0.13.0.n.tar.gz":
+        return "mongomotor"
+    elif re.match(r"datahaven-rev[0-9]+\.tar\.gz", filename):
+        return "datahaven"
+    elif re.match(r"Dtls-[0-9]\.[0-9]\.[0-9]\.sdist_with_openssl\..*", filename):
+        return "Dtls"
+    elif re.match(r"pytz-20[0-9][0-9][a-z]\.tar\.gz", filename):
+        return "pytz"
+    elif filename.startswith(("powny-", "obedient.powny-",)):
+        return filename.split("-")[0]
+    elif filename.startswith("devpi-theme-16-"):
+        return "devpi-theme-16"
+    # Spaces in file names become dashes in the project name.
+    filename = filename.replace(" ", "-")
+
+    # File names without any version component (e.g. "function_shield.tar.gz").
     match = re.match(
-        r"^(?P<project_name>[a-zA-Z0-9_.-]+)"
-        r"-[0-9.]+([a-z]+[0-9]+)?(dev|\.dev[0-9]+)?(-[a-z][a-z0-9]*)?\.(tar\.gz|zip)$",
+        r"^(?P<project_name>[a-z_.-]+)"  # project name
+        r"\.(tar\.gz|tar\.bz2|tgz|zip)$",  # extension
         filename,
+        re.I,
+    )
+    if match:
+        return match.group("project_name")
+
+    # Otherwise, expect "<project name>-<version>[suffixes].<extension>".
+    match = re.match(
+        r"^(?P<project_name>[a-z0-9_.]+(-[a-z0-9_.]*[a-z][a-z0-9_.]*)*)"  # project name
+        r"-v?[0-9_.]+([a-z]+[0-9]+)?(dev|pre|[a-c])?"  # "main" version
+        r"(\.post[0-9]*)?"  # suffix generated by vcversioner/setuptools-scm/...
+        r"([.-]?(alpha|beta|dev|post|pre|rc)(\.?[0-9]+)?)*"  # development status
+        r"([.-]g?[0-9a-f]+)?"  # git commit
+        r"(-[a-z][a-z0-9]*)?"  # extra suffixes
+        r"\.(tar\.gz|tar\.bz2|tgz|zip)$",  # extension
+        filename,
+        re.I,
     )
     assert match, filename
     return match.group("project_name")
@@ -228,10 +261,16 @@
 
 
 def assert_origin_exists(storage, origin):
-    assert (
-        hashlib.sha1(origin.encode()).digest() in _origins  # very fast
-        or storage.origin_get([origin])[0] is not None  # slow, but up to date
-    ), origin
+    assert check_origin_exists(storage, origin), origin
+
+
+def check_origin_exists(storage, origin):
+    """Return whether ``origin`` is known: either its SHA1 digest is in the
+    module-level ``_origins`` collection, or ``storage`` has it."""
+    return (
+        hashlib.sha1(origin.encode()).digest() in _origins  # very fast
+        or storage.origin_get([origin])[0] is not None  # slow, but up to date
+    )
 
 
 def load_metadata(
@@ -764,7 +803,23 @@
             # necessarily match the package name on pypi. Therefore, we need
             # to check it.
             if not _check_revision_in_origin(storage, origin, row["id"]):
-                origin = None
+                origin_with_dashes = origin.replace("_", "-")
+                # if the file name contains underscores but we can't find
+                # a matching origin, also try with dashes. It's common for package
+                # names containing underscores to use dashes on pypi.
+                if (
+                    "_" in origin
+                    and check_origin_exists(storage, origin_with_dashes)
+                    and _check_revision_in_origin(
+                        storage, origin_with_dashes, row["id"]
+                    )
+                ):
+                    origin = origin_with_dashes
+                else:
+                    print(
+                        f"revision {row['id'].hex()} false positive of origin {origin}."
+                    )
+                    origin = None
 
             if "project" in metadata:
                 # pypi loader format 2
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
@@ -58,6 +58,33 @@
         ("fake-factory-0.5.6-proper.tar.gz", "fake-factory"),
         ("ariane_procos-0.1.2-b05.tar.gz", "ariane_procos"),
         ("Yelpy-0.2.2dev.tar.gz", "Yelpy"),
+        ("geventhttpclient_c-1.0a-t1.tar.gz", "geventhttpclient_c"),
+        ("codeforlife-portal-1.0.0.post.dev618.tar.gz", "codeforlife-portal"),
+        ("ChecklistDSL-0.0.1.alpha.1.tar.gz", "ChecklistDSL"),
+        ("transifex-1.1.0beta.tar.gz", "transifex"),
+        ("thespian-2.5.10.tar.bz2", "thespian"),
+        ("janis pipelines-0.5.3.tar.gz", "janis-pipelines"),
+        ("pants-1.0.0-beta.2.tar.gz", "pants"),
+        ("uforge_python_sdk-3.8.4-RC15.tar.gz", "uforge_python_sdk"),
+        ("virtuoso-0.11.0.48.b5865c2b46fb.tar.gz", "virtuoso"),
+        ("cloud_ftp-v1.0.0.tar.gz", "cloud_ftp"),
+        ("frozenordereddict-1.0.0.tgz", "frozenordereddict"),
+        ("pywebsite-0.1.2pre.tar.gz", "pywebsite"),
+        ("Flask Unchained-0.2.0.tar.gz", "Flask-Unchained"),
+        ("mongomotor-0.13.0.n.tar.gz", "mongomotor"),
+        ("datahaven-rev8784.tar.gz", "datahaven"),
+        ("geopandas-0.1.0.dev-120d5ee.tar.gz", "geopandas"),
+        ("aimmo-v0.1.1-alpha.post.dev61.tar.gz", "aimmo"),
+        ("django-migrations-plus-0.1.0.dev5.gdd1abd3.tar.gz", "django-migrations-plus"),
+        ("function_shield.tar.gz", "function_shield"),
+        ("Dtls-0.1.0.sdist_with_openssl.mingw-win32.tar.gz", "Dtls"),
+        ("pytz-2005m.tar.gz", "pytz"),
+        ("python-librsync-0.1-3.tar.gz", "python-librsync"),
+        ("powny-1.4.0-alpha-20141205-1452-f5a2b03.tar.gz", "powny"),
+        ("stp-3pc-batch-0.1.11.tar.gz", "stp-3pc-batch"),
+        ("obedient.powny-3.0.0-alpha-20141027-2102-9e53ebd.tar.gz", "obedient.powny"),
+        ("mojimoji-0.0.9_2.tar.gz", "mojimoji"),
+        ("devpi-theme-16-2.0.0.tar.gz", "devpi-theme-16"),
     ]
 
     for (filename, project) in files: