Page MenuHomeSoftware Heritage

D3998.diff
No OneTemporary

D3998.diff

diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
--- a/swh/storage/migrate_extrinsic_metadata.py
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -115,10 +115,39 @@

 # not reliable, because PyPI allows arbitrary names
 def pypi_project_from_filename(filename):
+    if filename == "mongomotor-0.13.0.n.tar.gz":  # one-off irregular file name
+        return "mongomotor"
+    elif re.match(r"datahaven-rev[0-9]+\.tar\.gz", filename):
+        return "datahaven"
+    elif re.match(r"Dtls-[0-9]\.[0-9]\.[0-9]\.sdist_with_openssl\..*", filename):
+        return "Dtls"
+    elif re.match(r"pytz-20[0-9][0-9][a-z]\.tar\.gz", filename):  # literal dots only
+        return "pytz"
+    elif filename.startswith(("powny-", "obedient.powny-",)):
+        return filename.split("-")[0]
+    elif filename.startswith("devpi-theme-16-"):
+        return "devpi-theme-16"
+    filename = filename.replace(" ", "-")  # e.g. "janis pipelines-0.5.3.tar.gz"
+
     match = re.match(
-        r"^(?P<project_name>[a-zA-Z0-9_.-]+)"
-        r"-[0-9.]+([a-z]+[0-9]+)?(dev|\.dev[0-9]+)?(-[a-z][a-z0-9]*)?\.(tar\.gz|zip)$",
+        r"^(?P<project_name>[a-z_.-]+)"  # project name
+        r"\.(tar\.gz|tar\.bz2|tgz|zip)$",  # extension
         filename,
+        re.I,
+    )
+    if match:  # file name without any version, e.g. "function_shield.tar.gz"
+        return match.group("project_name")
+
+    match = re.match(
+        r"^(?P<project_name>[a-z0-9_.]+(-[a-z0-9_.]*[a-z][a-z0-9_.]*)*)"  # project name
+        r"-v?[0-9_.]+([a-z]+[0-9]+)?(dev|pre|[a-c])?"  # "main" version
+        r"(\.post[0-9]*)?"  # suffix generated by vcversioner/setuptools-scm/...
+        r"([.-]?(alpha|beta|dev|post|pre|rc)(\.?[0-9]+)?)*"  # development status
+        r"([.-]g?[0-9a-f]+)?"  # git commit
+        r"(-[a-z][a-z0-9]*)?"  # extra suffixes
+        r"\.(tar\.gz|tar\.bz2|tgz|zip)$",  # extension
+        filename,
+        re.I,
     )
     assert match, filename
     return match.group("project_name")
@@ -228,10 +257,17 @@


 def assert_origin_exists(storage, origin):
-    assert (
-        hashlib.sha1(origin.encode()).digest() in _origins  # very fast
-        or storage.origin_get([origin])[0] is not None  # slow, but up to date
-    ), origin
+    assert check_origin_exists(storage, origin), origin
+
+
+def check_origin_exists(storage, origin):
+    """Return True if ``origin`` is found in ``_origins`` or in ``storage``."""
+    # Return the bare boolean, not a ``(result, origin)`` tuple: a non-empty
+    # tuple is always truthy, so every origin would appear to exist.
+    return (
+        hashlib.sha1(origin.encode()).digest() in _origins  # very fast
+        or storage.origin_get([origin])[0] is not None  # slow, but up to date
+    )


 def load_metadata(
@@ -764,7 +800,23 @@
# necessarily match the package name on pypi. Therefore, we need
# to check it.
if not _check_revision_in_origin(storage, origin, row["id"]):
- origin = None
+ origin_with_dashes = origin.replace("_", "-")
+ # if the file name contains underscores but we can't find
+ # a matching origin, also try with dashes. It's common for package
+ # names containing underscores to use dashes on pypi.
+ if (
+ "_" in origin
+ and check_origin_exists(storage, origin_with_dashes)
+ and _check_revision_in_origin(
+ storage, origin_with_dashes, row["id"]
+ )
+ ):
+ origin = origin_with_dashes
+ else:
+ print(
+ f"revision {row['id'].hex()} false positive of origin {origin}."
+ )
+ origin = None
if "project" in metadata:
# pypi loader format 2
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
@@ -58,6 +58,33 @@
("fake-factory-0.5.6-proper.tar.gz", "fake-factory"),
("ariane_procos-0.1.2-b05.tar.gz", "ariane_procos"),
("Yelpy-0.2.2dev.tar.gz", "Yelpy"),
+ ("geventhttpclient_c-1.0a-t1.tar.gz", "geventhttpclient_c"),
+ ("codeforlife-portal-1.0.0.post.dev618.tar.gz", "codeforlife-portal"),
+ ("ChecklistDSL-0.0.1.alpha.1.tar.gz", "ChecklistDSL"),
+ ("transifex-1.1.0beta.tar.gz", "transifex"),
+ ("thespian-2.5.10.tar.bz2", "thespian"),
+ ("janis pipelines-0.5.3.tar.gz", "janis-pipelines"),
+ ("pants-1.0.0-beta.2.tar.gz", "pants"),
+ ("uforge_python_sdk-3.8.4-RC15.tar.gz", "uforge_python_sdk"),
+ ("virtuoso-0.11.0.48.b5865c2b46fb.tar.gz", "virtuoso"),
+ ("cloud_ftp-v1.0.0.tar.gz", "cloud_ftp"),
+ ("frozenordereddict-1.0.0.tgz", "frozenordereddict"),
+ ("pywebsite-0.1.2pre.tar.gz", "pywebsite"),
+ ("Flask Unchained-0.2.0.tar.gz", "Flask-Unchained"),
+ ("mongomotor-0.13.0.n.tar.gz", "mongomotor"),
+ ("datahaven-rev8784.tar.gz", "datahaven"),
+ ("geopandas-0.1.0.dev-120d5ee.tar.gz", "geopandas"),
+ ("aimmo-v0.1.1-alpha.post.dev61.tar.gz", "aimmo"),
+ ("django-migrations-plus-0.1.0.dev5.gdd1abd3.tar.gz", "django-migrations-plus"),
+ ("function_shield.tar.gz", "function_shield"),
+ ("Dtls-0.1.0.sdist_with_openssl.mingw-win32.tar.gz", "Dtls"),
+ ("pytz-2005m.tar.gz", "pytz"),
+ ("python-librsync-0.1-3.tar.gz", "python-librsync"),
+ ("powny-1.4.0-alpha-20141205-1452-f5a2b03.tar.gz", "powny"),
+ ("stp-3pc-batch-0.1.11.tar.gz", "stp-3pc-batch"),
+ ("obedient.powny-3.0.0-alpha-20141027-2102-9e53ebd.tar.gz", "obedient.powny"),
+ ("mojimoji-0.0.9_2.tar.gz", "mojimoji"),
+ ("devpi-theme-16-2.0.0.tar.gz", "devpi-theme-16"),
]
for (filename, project) in files:

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 6:52 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222056

Event Timeline