diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py --- a/swh/storage/migrate_extrinsic_metadata.py +++ b/swh/storage/migrate_extrinsic_metadata.py @@ -127,6 +127,29 @@ return filename.split("-")[0] elif filename.startswith("devpi-theme-16-"): return "devpi-theme-16" + elif re.match("[^-]+-[0-9]+.tar.gz", filename): + return filename.split("-")[0] + elif filename == "ohai-1!0.tar.gz": + return "ohai" + elif filename == "collective.topicitemsevent-0.1dvl.tar.gz": + return "collective.topicitemsevent" + elif filename.startswith( + ("SpiNNStorageHandlers-1!", "sPyNNakerExternalDevicesPlugin-1!") + ): + return filename.split("-")[0] + elif filename.startswith("limnoria-201"): + return "limnoria" + elif filename.startswith("pytz-20"): + return "pytz" + elif filename.startswith("youtube_dl_server-alpha."): + return "youtube_dl_server" + elif filename == "json-extensions-b76bc7d.tar.gz": + return "json-extensions" + elif filename == "LitReview-0.6989ev.tar.gz": + # typo of "dev" + return "LitReview" + elif filename.startswith("django_options-r"): + return "django_options" filename = filename.replace(" ", "-") match = re.match( @@ -138,13 +161,38 @@ if match: return match.group("project_name") + # First try with a rather strict format, but that allows accidentally + # matching the version as part of the package name match = re.match( - r"^(?P<project_name>[a-z0-9_.]+(-[a-z0-9_.]*[a-z][a-z0-9_.]*)*)" # project name - r"-v?[0-9_.]+([a-z]+[0-9]+)?(dev|pre|[a-c])?" # "main" version - r"(\.post[0-9]*)?" # suffix generated by vcversioner/setuptools-scm/... + r"^(?P<project_name>[a-z0-9_.]+?([-_][a-z][a-z0-9.]+?)*?)" # project name + r"-v?" + r"([0-9]+!)?" # epoch + r"[0-9_.]+([a-z]+[0-9]+)?" # "main" version r"([.-]?(alpha|beta|dev|post|pre|rc)(\.?[0-9]+)?)*" # development status + r"([.-]?20[012][0-9]{5,9})?" # date r"([.-]g?[0-9a-f]+)?" # git commit - r"(-[a-z][a-z0-9]*)?" # extra suffixes + r"(-py(thon)?[23](\.?[0-9]{1,2})?)?" 
# python version + r"\.(tar\.gz|tar\.bz2|tgz|zip)$", # extension + filename, + re.I, + ) + if match: + return match.group("project_name") + + # If that doesn't work, give up on trying to parse version suffixes, + # and just find the first version-like occurrence in the file name + + match = re.match( + r"^(?P<project_name>[a-z0-9_.-]+?)" # project name + r"[-_.]v?" + r"([0-9]+!)?" # epoch + r"(" # "main" version + r"[0-9_]+\.[0-9_.]+([a-z]+[0-9]+)?" # classic version number + r"|20[012][0-9]{5,9}" # date as integer + r"|20[012][0-9]-[01][0-9]-[0-3][0-9]" # date as ISO 8601 + r")" # end of "main" version + r"[a-z]?(dev|pre)?" # direct version suffix + r"([._-].*)?" # extra suffixes r"\.(tar\.gz|tar\.bz2|tgz|zip)$", # extension filename, re.I, diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py --- a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py @@ -85,6 +85,35 @@ ("obedient.powny-3.0.0-alpha-20141027-2102-9e53ebd.tar.gz", "obedient.powny"), ("mojimoji-0.0.9_2.tar.gz", "mojimoji"), ("devpi-theme-16-2.0.0.tar.gz", "devpi-theme-16"), + ("Orange3-WONDER-1-1.0.7.tar.gz", "Orange3-WONDER-1"), + ("obj-34.tar.gz", "obj"), + ("pytorch-ignite-nightly-20190825.tar.gz", "pytorch-ignite-nightly"), + ("tlds-2019081900.tar.gz", "tlds"), + ("dominator-12.1.2-alpha-20141027-1446-ad46e0f.tar.gz", "dominator"), + ("waferslim-1.0.0-py3.1.zip", "waferslim"), + ("Beaver-21.tar.gz", "Beaver"), + ("aimmo-0.post.dev460.tar.gz", "aimmo"), + ("ohai-1!0.tar.gz", "ohai"), + ("nevolution-risk-139.tar.gz", "nevolution-risk"), + ("collective.topicitemsevent-0.1dvl.tar.gz", "collective.topicitemsevent"), + ("lesscpy-0.9g.tar.gz", "lesscpy"), + ("SpiNNStorageHandlers-1!4.0.0a1.tar.gz", "SpiNNStorageHandlers"), + ("limnoria-2013-03-27T16:32:26+0100.tar.gz", "limnoria"), + ( + "sPyNNakerExternalDevicesPlugin-1!4.0.0a2.tar.gz", + "sPyNNakerExternalDevicesPlugin", 
+ ), + ("django-bootstrap-italia_0.1.tar.gz", "django-bootstrap-italia"), + ("sPyNNaker8-1!4.0.0a1.tar.gz", "sPyNNaker8"), + ("betahaus.openmember-0.1adev-r1651.tar.gz", "betahaus.openmember"), + ("mailer.0.8.0.zip", "mailer"), + ("pytz-2005k.tar.bz2", "pytz"), + ("aha.plugin.microne-0.62bdev.tar.gz", "aha.plugin.microne"), + ("youtube_dl_server-alpha.3.tar.gz", "youtube_dl_server"), + ("json-extensions-b76bc7d.tar.gz", "json-extensions"), + ("LitReview-0.6989ev.tar.gz", "LitReview"), + ("django_options-r5.tar.gz", "django_options"), + ("ddlib-2013-11-07.tar.gz", "ddlib"), ] for (filename, project) in files: