Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124773
D3998.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Subscribers
None
D3998.diff
View Options
diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
--- a/swh/storage/migrate_extrinsic_metadata.py
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -115,10 +115,39 @@
# not reliable, because PyPI allows arbitrary names
def pypi_project_from_filename(filename):
+ if filename == "mongomotor-0.13.0.n.tar.gz":
+ return "mongomotor"
+ elif re.match(r"datahaven-rev[0-9]+\.tar\.gz", filename):
+ return "datahaven"
+ elif re.match(r"Dtls-[0-9]\.[0-9]\.[0-9]\.sdist_with_openssl\..*", filename):
+ return "Dtls"
+ elif re.match(r"pytz-20[0-9][0-9][a-z]\.tar\.gz", filename):
+ return "pytz"
+ elif filename.startswith(("powny-", "obedient.powny-",)):
+ return filename.split("-")[0]
+ elif filename.startswith("devpi-theme-16-"):
+ return "devpi-theme-16"
+ filename = filename.replace(" ", "-")
+
match = re.match(
- r"^(?P<project_name>[a-zA-Z0-9_.-]+)"
- r"-[0-9.]+([a-z]+[0-9]+)?(dev|\.dev[0-9]+)?(-[a-z][a-z0-9]*)?\.(tar\.gz|zip)$",
+ r"^(?P<project_name>[a-z_.-]+)" # project name
+ r"\.(tar\.gz|tar\.bz2|tgz|zip)$", # extension
filename,
+ re.I,
+ )
+ if match:
+ return match.group("project_name")
+
+ match = re.match(
+ r"^(?P<project_name>[a-z0-9_.]+(-[a-z0-9_.]*[a-z][a-z0-9_.]*)*)" # project name
+ r"-v?[0-9_.]+([a-z]+[0-9]+)?(dev|pre|[a-c])?" # "main" version
+ r"(\.post[0-9]*)?" # suffix generated by vcversioner/setuptools-scm/...
+ r"([.-]?(alpha|beta|dev|post|pre|rc)(\.?[0-9]+)?)*" # development status
+ r"([.-]g?[0-9a-f]+)?" # git commit
+ r"(-[a-z][a-z0-9]*)?" # extra suffixes
+ r"\.(tar\.gz|tar\.bz2|tgz|zip)$", # extension
+ filename,
+ re.I,
)
assert match, filename
return match.group("project_name")
@@ -228,10 +257,17 @@
def assert_origin_exists(storage, origin):
- assert (
- hashlib.sha1(origin.encode()).digest() in _origins # very fast
- or storage.origin_get([origin])[0] is not None # slow, but up to date
- ), origin
+ assert check_origin_exists(storage, origin), origin
+
+
+def check_origin_exists(storage, origin):
+    """Return whether ``origin`` is known.
+
+    The SHA1 digest of ``origin`` is first looked up in ``_origins``; only if
+    that misses is ``storage.origin_get`` queried.
+    """
+    # Must return a plain bool, not a ``(bool, origin)`` tuple: a non-empty
+    # tuple is always truthy, which would make
+    # ``assert check_origin_exists(...)`` and
+    # ``... and check_origin_exists(...)`` pass unconditionally.
+    return (
+        hashlib.sha1(origin.encode()).digest() in _origins  # very fast
+        or storage.origin_get([origin])[0] is not None  # slow, but up to date
+    )
def load_metadata(
@@ -764,7 +800,23 @@
# necessarily match the package name on pypi. Therefore, we need
# to check it.
if not _check_revision_in_origin(storage, origin, row["id"]):
- origin = None
+ origin_with_dashes = origin.replace("_", "-")
+ # if the file name contains underscores but we can't find
+ # a matching origin, also try with dashes. It's common for package
+ # names containing underscores to use dashes on pypi.
+ if (
+ "_" in origin
+ and check_origin_exists(storage, origin_with_dashes)
+ and _check_revision_in_origin(
+ storage, origin_with_dashes, row["id"]
+ )
+ ):
+ origin = origin_with_dashes
+ else:
+ print(
+ f"revision {row['id'].hex()} false positive of origin {origin}."
+ )
+ origin = None
if "project" in metadata:
# pypi loader format 2
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
@@ -58,6 +58,33 @@
("fake-factory-0.5.6-proper.tar.gz", "fake-factory"),
("ariane_procos-0.1.2-b05.tar.gz", "ariane_procos"),
("Yelpy-0.2.2dev.tar.gz", "Yelpy"),
+ ("geventhttpclient_c-1.0a-t1.tar.gz", "geventhttpclient_c"),
+ ("codeforlife-portal-1.0.0.post.dev618.tar.gz", "codeforlife-portal"),
+ ("ChecklistDSL-0.0.1.alpha.1.tar.gz", "ChecklistDSL"),
+ ("transifex-1.1.0beta.tar.gz", "transifex"),
+ ("thespian-2.5.10.tar.bz2", "thespian"),
+ ("janis pipelines-0.5.3.tar.gz", "janis-pipelines"),
+ ("pants-1.0.0-beta.2.tar.gz", "pants"),
+ ("uforge_python_sdk-3.8.4-RC15.tar.gz", "uforge_python_sdk"),
+ ("virtuoso-0.11.0.48.b5865c2b46fb.tar.gz", "virtuoso"),
+ ("cloud_ftp-v1.0.0.tar.gz", "cloud_ftp"),
+ ("frozenordereddict-1.0.0.tgz", "frozenordereddict"),
+ ("pywebsite-0.1.2pre.tar.gz", "pywebsite"),
+ ("Flask Unchained-0.2.0.tar.gz", "Flask-Unchained"),
+ ("mongomotor-0.13.0.n.tar.gz", "mongomotor"),
+ ("datahaven-rev8784.tar.gz", "datahaven"),
+ ("geopandas-0.1.0.dev-120d5ee.tar.gz", "geopandas"),
+ ("aimmo-v0.1.1-alpha.post.dev61.tar.gz", "aimmo"),
+ ("django-migrations-plus-0.1.0.dev5.gdd1abd3.tar.gz", "django-migrations-plus"),
+ ("function_shield.tar.gz", "function_shield"),
+ ("Dtls-0.1.0.sdist_with_openssl.mingw-win32.tar.gz", "Dtls"),
+ ("pytz-2005m.tar.gz", "pytz"),
+ ("python-librsync-0.1-3.tar.gz", "python-librsync"),
+ ("powny-1.4.0-alpha-20141205-1452-f5a2b03.tar.gz", "powny"),
+ ("stp-3pc-batch-0.1.11.tar.gz", "stp-3pc-batch"),
+ ("obedient.powny-3.0.0-alpha-20141027-2102-9e53ebd.tar.gz", "obedient.powny"),
+ ("mojimoji-0.0.9_2.tar.gz", "mojimoji"),
+ ("devpi-theme-16-2.0.0.tar.gz", "devpi-theme-16"),
]
for (filename, project) in files:
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 21 2024, 6:52 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222056
Attached To
D3998: migrate_extrinsic_metadata: add support for guessing the origin of more PyPI packages from filenames.
Event Timeline
Log In to Comment