Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/migrate_extrinsic_metadata.py
Show All 37 Lines | |||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||
from swh.model.identifiers import SWHID, parse_swhid | from swh.model.identifiers import SWHID, parse_swhid | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
MetadataAuthority, | MetadataAuthority, | ||||
MetadataAuthorityType, | MetadataAuthorityType, | ||||
MetadataFetcher, | MetadataFetcher, | ||||
MetadataTargetType, | MetadataTargetType, | ||||
RawExtrinsicMetadata, | RawExtrinsicMetadata, | ||||
RevisionType, | |||||
Sha1Git, | Sha1Git, | ||||
) | ) | ||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
from swh.storage.algos.origin import iter_origin_visits, iter_origin_visit_statuses | |||||
from swh.storage.algos.snapshot import snapshot_get_all_branches | |||||
# XML namespaces and fields for metadata coming from the deposit: | # XML namespaces and fields for metadata coming from the deposit: | ||||
CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" | CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" | ||||
ATOM_NS = "http://www.w3.org/2005/Atom" | ATOM_NS = "http://www.w3.org/2005/Atom" | ||||
ATOM_KEYS = ["id", "author", "external_identifier", "title"] | ATOM_KEYS = ["id", "author", "external_identifier", "title"] | ||||
# columns of the revision table (of the storage DB) | # columns of the revision table (of the storage DB) | ||||
▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Lines | def remove_atom_codemeta_metadata_without_xmlns(metadata): | ||||
"""Removes all known Atom and Codemeta metadata fields from the dict, | """Removes all known Atom and Codemeta metadata fields from the dict, | ||||
assuming this is a dict generated by xmltodict with expanded namespaces. | assuming this is a dict generated by xmltodict with expanded namespaces. | ||||
""" | """ | ||||
for key in list(metadata): | for key in list(metadata): | ||||
if key.startswith(("{%s}" % ATOM_NS, "{%s}" % CODEMETA_NS)): | if key.startswith(("{%s}" % ATOM_NS, "{%s}" % CODEMETA_NS)): | ||||
del metadata[key] | del metadata[key] | ||||
def _check_revision_in_origin(storage, origin, revision_id):
    """Tell whether ``revision_id`` is the target of a snapshot branch in
    any visit of ``origin``.

    Returns True as soon as some branch targets ``revision_id``. Returns
    False only after asserting every other revision reached this way is a
    parent-less DSC revision (the only kind supported here), so a False
    result reliably means the revision is not in the origin.
    """
    seen_snapshots = set()  # no need to visit them again
    seen_revisions = set()
    for visit in iter_origin_visits(storage, origin):
        for status in iter_origin_visit_statuses(storage, origin, visit.visit):
            # Skip statuses with no snapshot, and snapshots already checked
            if status.snapshot is None or status.snapshot in seen_snapshots:
                continue
            seen_snapshots.add(status.snapshot)
            snapshot = snapshot_get_all_branches(storage, status.snapshot)
            for branch in snapshot.branches.values():
                # If it's the revision passed as argument, then it is indeed in the
                # origin
                if branch.target == revision_id:
                    return True
                # Else, let's make sure the branch doesn't have any other revision
                # Get the revision at the top of the branch.
                if branch.target in seen_revisions:
                    continue
                seen_revisions.add(branch.target)
                revision = storage.revision_get([branch.target])[0]
                # Check it's DSC (we only support those for now)
                assert (
                    revision.type == RevisionType.DSC
                ), "non-DSC revisions are not supported"
                # Check it doesn't have parents (else we would have to
                # recurse)
                assert revision.parents == (), "DSC revision with parents"
    return False
Not Done Inline Actionsunclear to me if this is not-yet-supported situation or if it's a unexpected one. douardda: unclear to me if this is not-yet-supported situation or if it's a unexpected one. | |||||
Done Inline Actionsunexpected; or I would have added a TODO vlorentz: unexpected; or I would have added a TODO | |||||
def debian_origins_from_row(row, storage):
    """Guesses a Debian origin from a row. May return an empty list if it
    cannot reliably guess it, but all results are guaranteed to be correct."""
    artifacts = row["metadata"]["original_artifact"]
    # The source package name is the part of each artifact filename before
    # the first underscore; every artifact of the row must agree on it.
    package_names = {artifact["filename"].split("_")[0] for artifact in artifacts}
    assert len(package_names) == 1, package_names
    (package_name,) = package_names

    # Keep only the candidate origins actually containing this revision.
    results = []
    for candidate in (
        f"deb://Debian/packages/{package_name}",
        f"deb://Debian-Security/packages/{package_name}",
        f"http://snapshot.debian.org/package/{package_name}/",
    ):
        if _check_revision_in_origin(storage, candidate, row["id"]):
            results.append(candidate)
    return results
Done Inline Actionswhy not a list comprehension here? douardda: why not a list comprehension here? | |||||
# Cache of origins that are known to exist | # Cache of origins that are known to exist | ||||
_origins = set() | _origins = set() | ||||
def assert_origin_exists(storage, origin): | def assert_origin_exists(storage, origin): | ||||
assert ( | assert ( | ||||
hashlib.sha1(origin.encode()).digest() in _origins # very fast | hashlib.sha1(origin.encode()).digest() in _origins # very fast | ||||
or storage.origin_get([origin])[0] is not None # slow, but up to date | or storage.origin_get([origin])[0] is not None # slow, but up to date | ||||
▲ Show 20 Lines • Show All 202 Lines • ▼ Show 20 Lines | def handle_row(row: Dict[str, Any], storage, deposit_cur, dry_run: bool): | ||||
discovery_date = row["date"] or row["committer_date"] | discovery_date = row["date"] or row["committer_date"] | ||||
metadata = row["metadata"] | metadata = row["metadata"] | ||||
if metadata is None: | if metadata is None: | ||||
return | return | ||||
if type_ == "dsc": | if type_ == "dsc": | ||||
origin = None # TODO: I can't find how to get it reliably | origin = None # it will be defined later, using debian_origins_from_row | ||||
# TODO: the debian loader writes the changelog date as the revision's | # TODO: the debian loader writes the changelog date as the revision's | ||||
# author date and committer date. Instead, we should use the visit's date, | # author date and committer date. Instead, we should use the visit's date | ||||
# but I cannot find a way to reliably get it without the origin | |||||
if "extrinsic" in metadata: | if "extrinsic" in metadata: | ||||
extrinsic_files = metadata["extrinsic"]["raw"]["files"] | extrinsic_files = metadata["extrinsic"]["raw"]["files"] | ||||
for artifact_entry in metadata["original_artifact"]: | for artifact_entry in metadata["original_artifact"]: | ||||
extrinsic_file = extrinsic_files[artifact_entry["filename"]] | extrinsic_file = extrinsic_files[artifact_entry["filename"]] | ||||
for key in ("sha256",): | for key in ("sha256",): | ||||
assert artifact_entry["checksums"][key] == extrinsic_file[key] | assert artifact_entry["checksums"][key] == extrinsic_file[key] | ||||
artifact_entry["url"] = extrinsic_file["uri"] | artifact_entry["url"] = extrinsic_file["uri"] | ||||
▲ Show 20 Lines • Show All 406 Lines • ▼ Show 20 Lines | if "original_artifact" in metadata: | ||||
"checksums", | "checksums", | ||||
"filename", | "filename", | ||||
"length", | "length", | ||||
"url", | "url", | ||||
"archive_type", | "archive_type", | ||||
} | } | ||||
assert set(original_artifact) <= allowed_keys, set(original_artifact) | assert set(original_artifact) <= allowed_keys, set(original_artifact) | ||||
if type_ == "dsc": | |||||
assert origin is None | |||||
origins = debian_origins_from_row(row, storage) | |||||
assert origins, row | |||||
else: | |||||
origins = [origin] | |||||
for origin in origins: | |||||
load_metadata( | load_metadata( | ||||
storage, | storage, | ||||
row["id"], | row["id"], | ||||
discovery_date, | discovery_date, | ||||
metadata["original_artifact"], | metadata["original_artifact"], | ||||
ORIGINAL_ARTIFACT_FORMAT, | ORIGINAL_ARTIFACT_FORMAT, | ||||
authority=AUTHORITIES["swh"], | authority=AUTHORITIES["swh"], | ||||
origin=origin, | origin=origin, | ||||
dry_run=dry_run, | dry_run=dry_run, | ||||
) | ) | ||||
del metadata["original_artifact"] | del metadata["original_artifact"] | ||||
assert metadata == {}, ( | assert metadata == {}, ( | ||||
f"remaining metadata keys for {row['id'].hex()} (type: {row['type']}): " | f"remaining metadata keys for {row['id'].hex()} (type: {row['type']}): " | ||||
f"{metadata}" | f"{metadata}" | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 99 Lines • Show Last 20 Lines |
one could be even be vicious and suggest to put the None snapshot in the seen_snapshots
¯\_(ツ)_/¯ :DDDDDDDD and then