Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/migrate_extrinsic_metadata.py
Show All 37 Lines | |||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||
from swh.model.identifiers import SWHID, parse_swhid | from swh.model.identifiers import SWHID, parse_swhid | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
MetadataAuthority, | MetadataAuthority, | ||||
MetadataAuthorityType, | MetadataAuthorityType, | ||||
MetadataFetcher, | MetadataFetcher, | ||||
MetadataTargetType, | MetadataTargetType, | ||||
RawExtrinsicMetadata, | RawExtrinsicMetadata, | ||||
RevisionType, | |||||
Sha1Git, | Sha1Git, | ||||
) | ) | ||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
from swh.storage.algos.origin import iter_origin_visits, iter_origin_visit_statuses | from swh.storage.algos.origin import iter_origin_visits, iter_origin_visit_statuses | ||||
from swh.storage.algos.snapshot import snapshot_get_all_branches | from swh.storage.algos.snapshot import snapshot_get_all_branches | ||||
# XML namespaces and fields for metadata coming from the deposit: | # XML namespaces and fields for metadata coming from the deposit: | ||||
▲ Show 20 Lines • Show All 136 Lines • ▼ Show 20 Lines | for visit in iter_origin_visits(storage, origin): | ||||
continue | continue | ||||
seen_revisions.add(branch.target) | seen_revisions.add(branch.target) | ||||
revision = storage.revision_get([branch.target])[0] | revision = storage.revision_get([branch.target])[0] | ||||
if revision is None: | if revision is None: | ||||
# https://forge.softwareheritage.org/T997 | # https://forge.softwareheritage.org/T997 | ||||
continue | continue | ||||
# Check it's DSC (we only support those for now) | |||||
assert ( | |||||
revision.type == RevisionType.DSC | |||||
), "non-DSC revisions are not supported" | |||||
# Check it doesn't have parents (else we would have to | # Check it doesn't have parents (else we would have to | ||||
# recurse) | # recurse) | ||||
assert revision.parents == (), "DSC revision with parents" | assert revision.parents == (), "revision with parents" | ||||
return False | return False | ||||
def debian_origins_from_row(row, storage): | def debian_origins_from_row(row, storage): | ||||
"""Guesses a Debian origin from a row. May return an empty list if it | """Guesses a Debian origin from a row. May return an empty list if it | ||||
cannot reliably guess it, but all results are guaranteed to be correct.""" | cannot reliably guess it, but all results are guaranteed to be correct.""" | ||||
filenames = [entry["filename"] for entry in row["metadata"]["original_artifact"]] | filenames = [entry["filename"] for entry in row["metadata"]["original_artifact"]] | ||||
▲ Show 20 Lines • Show All 542 Lines • ▼ Show 20 Lines | elif type_ == "tar": | ||||
.get("url", "") | .get("url", "") | ||||
.startswith("https://files.pythonhosted.org/") | .startswith("https://files.pythonhosted.org/") | ||||
): | ): | ||||
if isinstance(metadata.get("original_artifact"), dict): | if isinstance(metadata.get("original_artifact"), dict): | ||||
metadata["original_artifact"] = [metadata["original_artifact"]] | metadata["original_artifact"] = [metadata["original_artifact"]] | ||||
assert len(metadata["original_artifact"]) == 1 | assert len(metadata["original_artifact"]) == 1 | ||||
# it's tempting here to do this: | project_name = pypi_project_from_filename( | ||||
# | metadata["original_artifact"][0]["filename"] | ||||
# project_name = pypi_project_from_filename( | ) | ||||
# metadata["original_artifact"][0]["filename"] | origin = f"https://pypi.org/project/{project_name}/" | ||||
# ) | # But unfortunately, the filename is user-provided, and doesn't | ||||
# origin = f"https://pypi.org/project/{project_name}/" | # necessarily match the package name on pypi. Therefore, we need | ||||
# assert_origin_exists(storage, origin) | # to check it. | ||||
# | if not _check_revision_in_origin(storage, origin, row["id"]): | ||||
# but unfortunately, the filename is user-provided, and doesn't | |||||
# necessarily match the package name on pypi. | |||||
# TODO: on second thoughts, I think we can use this as a heuristic, | |||||
# then double-check by listing visits and snapshots from the origin; | |||||
# it should work for most packages. | |||||
origin = None | origin = None | ||||
if "project" in metadata: | if "project" in metadata: | ||||
# pypi loader format 2 | # pypi loader format 2 | ||||
# same reason as above, we can't do this: | # same reason as above, we can't do this: | ||||
# if metadata["project"]: | # if metadata["project"]: | ||||
# assert metadata["project"]["name"] == project_name | # assert metadata["project"]["name"] == project_name | ||||
▲ Show 20 Lines • Show All 212 Lines • Show Last 20 Lines |