swh/storage/migrate_extrinsic_metadata.py
- This file was added.
#!/usr/bin/env python3 | |||||
# Copyright (C) 2020 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
"""This is an executable script to migrate extrinsic revision metadata from | |||||
the revision table to the new extrinsic metadata storage. | |||||
This is designed to be as conservative as possible, following this principle:
for each revision the script processes (in "handle_row"), it reads some of the
fields, writes them directly to the metadata storage, and removes them.
Then it checks that all the remaining fields are in a hardcoded list of fields
that are known not to require migration.
This means that every field that isn't migrated was explicitly reviewed while | |||||
writing this script. | |||||
Additionally, this script contains many assertions to prevent false positives | |||||
in its heuristics. | |||||
""" | |||||
import datetime | |||||
import hashlib | |||||
import json | |||||
import os | |||||
import re | |||||
import sys | |||||
from typing import Any, Dict, Optional | |||||
from urllib.parse import unquote, urlparse | |||||
import iso8601 | |||||
from swh.core.db import BaseDb | |||||
from swh.model.hashutil import hash_to_hex | |||||
from swh.model.identifiers import SWHID, parse_swhid | |||||
from swh.model.model import ( | |||||
MetadataAuthority, | |||||
MetadataAuthorityType, | |||||
MetadataFetcher, | |||||
MetadataTargetType, | |||||
RawExtrinsicMetadata, | |||||
) | |||||
from swh.storage import get_storage | |||||
# XML namespaces and fields for metadata coming from the deposit: | |||||
CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" | |||||
ATOM_NS = "http://www.w3.org/2005/Atom" | |||||
ATOM_KEYS = ["id", "author", "external_identifier", "title"] | |||||
# columns of the revision table (of the storage DB) | |||||
REVISION_COLS = ["id", "date", "committer_date", "type", "message", "metadata"] | |||||
# columns of the tables of the deposit DB | |||||
DEPOSIT_COLS = [ | |||||
"deposit.id", | |||||
"deposit.external_id", | |||||
"deposit.swh_id_context", | |||||
"deposit.status", | |||||
"deposit_request.metadata", | |||||
"deposit_request.date", | |||||
"deposit_client.provider_url", | |||||
"deposit_collection.name", | |||||
"auth_user.username", | |||||
] | |||||
# Formats we write to the extrinsic metadata storage | |||||
OLD_DEPOSIT_FORMAT = ( | |||||
"sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces" # before february 2018 | |||||
) | |||||
NEW_DEPOSIT_FORMAT = "sword-v2-atom-codemeta-v2-in-json" # after february 2018 | |||||
GNU_FORMAT = "gnu-tree-json" | |||||
NIXGUIX_FORMAT = "nixguix-sources-json" | |||||
NPM_FORMAT = "replicate-npm-package-json" | |||||
ORIGINAL_ARTIFACT_FORMAT = "original-artifacts-json" | |||||
PYPI_FORMAT = "pypi-project-json" | |||||
# Information about this script, for traceability | |||||
FETCHER = MetadataFetcher( | |||||
name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", | |||||
) | |||||
# Authorities that we got the metadata from | |||||
AUTHORITIES = { | |||||
"npmjs": MetadataAuthority( | |||||
type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={} | |||||
), | |||||
"pypi": MetadataAuthority( | |||||
type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={} | |||||
), | |||||
"gnu": MetadataAuthority( | |||||
type=MetadataAuthorityType.FORGE, url="https://ftp.gnu.org/", metadata={} | |||||
), | |||||
"swh": MetadataAuthority( | |||||
type=MetadataAuthorityType.REGISTRY, | |||||
url="https://softwareheritage.org/", | |||||
metadata={}, | |||||
), # for original_artifact (which are checksums computed by SWH) | |||||
} | |||||
# Regular expression for the format of revision messages written by the | |||||
# deposit loader | |||||
deposit_revision_message_re = re.compile( | |||||
b"(?P<client>[a-z]*): " | |||||
b"Deposit (?P<deposit_id>[0-9]+) in collection (?P<collection>[a-z]+).*" | |||||
) | |||||
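# For example, a revision message such as b"hal: Deposit 342 in collection hal"
# yields client="hal", deposit_id="342" and collection="hal".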
# not reliable, because PyPI allows arbitrary names | |||||
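# For instance, a (hypothetical) filename "foo-1.0.0.tar.gz" would yield "foo";
# anything not matching the pattern below trips the assertion.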
def pypi_project_from_filename(filename): | |||||
match = re.match( | |||||
r"^(?P<project_name>[a-zA-Z0-9_.-]+)" | |||||
r"-[0-9.]+([a-z]+[0-9]+)?(\.dev[0-9]+)?\.(tar\.gz|zip)$", | |||||
filename, | |||||
) | |||||
assert match, filename | |||||
return match.group("project_name") | |||||
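# cran_package_from_url below maps, e.g., the (hypothetical) URL
# "https://cran.r-project.org/src/contrib/jsonlite_1.7.0.tar.gz" to "jsonlite".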
def cran_package_from_url(filename): | |||||
match = re.match( | |||||
r"^https://cran\.r-project\.org/src/contrib/" | |||||
r"(?P<package_name>[a-zA-Z0-9.]+)_[0-9.-]+(\.tar\.gz)?$", | |||||
filename, | |||||
) | |||||
assert match, filename | |||||
return match.group("package_name") | |||||
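# npm_package_from_source_url below maps, e.g., the (hypothetical) URL
# "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz" to "left-pad";
# percent-encoded scoped names such as "%40myscope%2Fmypkg" are unquoted
# to "@myscope/mypkg".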
def npm_package_from_source_url(package_source_url): | |||||
match = re.match( | |||||
"^https://registry.npmjs.org/(?P<package_name>.*)/-/[^/]+.tgz$", | |||||
package_source_url, | |||||
) | |||||
assert match, package_source_url | |||||
return unquote(match.group("package_name")) | |||||
def remove_atom_codemeta_metadata_with_xmlns(metadata): | |||||
"""Removes all known Atom and Codemeta metadata fields from the dict, | |||||
assuming this is a dict generated by xmltodict without expanding namespaces. | |||||
""" | |||||
keys_to_remove = ATOM_KEYS + ["@xmlns", "@xmlns:codemeta"] | |||||
for key in list(metadata): | |||||
if key.startswith("codemeta:") or key in keys_to_remove: | |||||
del metadata[key] | |||||
def remove_atom_codemeta_metadata_without_xmlns(metadata): | |||||
"""Removes all known Atom and Codemeta metadata fields from the dict, | |||||
assuming this is a dict generated by xmltodict with expanded namespaces. | |||||
""" | |||||
for key in list(metadata): | |||||
if key.startswith(("{%s}" % ATOM_NS, "{%s}" % CODEMETA_NS)): | |||||
del metadata[key] | |||||
# Cache of origins that are known to exist | |||||
_origins = set() | |||||
def assert_origin_exists(storage, origin): | |||||
assert ( | |||||
hashlib.sha1(origin.encode()).digest() in _origins # very fast | |||||
or storage.origin_get([origin])[0] is not None # slow, but up to date | |||||
), origin | |||||
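# The fast path relies on the _origins cache, optionally pre-populated from
# ./origins.txt in the __main__ block at the bottom of this file; the slow path
# queries swh-storage directly, so recently added origins are still found.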
def load_metadata( | |||||
storage, | |||||
revision_id, | |||||
discovery_date: datetime.datetime, | |||||
metadata: Dict[str, Any], | |||||
format: str, | |||||
authority: MetadataAuthority, | |||||
origin: Optional[str], | |||||
dry_run: bool, | |||||
): | |||||
"""Does the actual loading to swh-storage.""" | |||||
revision_swhid = SWHID(object_type="revision", object_id=hash_to_hex(revision_id)) | |||||
obj = RawExtrinsicMetadata( | |||||
type=MetadataTargetType.REVISION, | |||||
id=revision_swhid, | |||||
discovery_date=discovery_date, | |||||
authority=authority, | |||||
fetcher=FETCHER, | |||||
format=format, | |||||
metadata=json.dumps(metadata).encode(), | |||||
origin=origin, | |||||
) | |||||
if not dry_run: | |||||
storage.raw_extrinsic_metadata_add([obj]) | |||||
def handle_deposit_row( | |||||
row, | |||||
discovery_date: Optional[datetime.datetime], | |||||
origin, | |||||
storage, | |||||
deposit_cur, | |||||
dry_run: bool, | |||||
): | |||||
"""Loads metadata from the deposit database (which is more reliable as the | |||||
metadata on the revision object, as some versions of the deposit loader were | |||||
a bit lossy; and they used very different format for the field in the | |||||
revision table). | |||||
""" | |||||
parsed_message = deposit_revision_message_re.match(row["message"]) | |||||
assert parsed_message is not None, row["message"] | |||||
deposit_id = int(parsed_message.group("deposit_id")) | |||||
collection = parsed_message.group("collection").decode() | |||||
client_name = parsed_message.group("client").decode() | |||||
deposit_cur.execute( | |||||
f"SELECT {', '.join(DEPOSIT_COLS)} FROM deposit " | |||||
f"INNER JOIN deposit_collection " | |||||
f" ON (deposit.collection_id=deposit_collection.id) " | |||||
f"INNER JOIN deposit_client ON (deposit.client_id=deposit_client.user_ptr_id) " | |||||
f"INNER JOIN auth_user ON (deposit.client_id=auth_user.id) " | |||||
f"INNER JOIN deposit_request ON (deposit.id=deposit_request.deposit_id) " | |||||
f"WHERE deposit.id = %s", | |||||
(deposit_id,), | |||||
) | |||||
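# The joins return one row per deposit_request attached to this deposit; the
# loop below aggregates them and checks that they are consistent with each other.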
provider_urls = set() | |||||
swhids = set() | |||||
metadata_entries = [] | |||||
dates = set() | |||||
external_identifiers = set() | |||||
for deposit_request_row in deposit_cur: | |||||
deposit_request = dict(zip(DEPOSIT_COLS, deposit_request_row)) | |||||
# Sanity checks to make sure we selected the right deposit | |||||
assert deposit_request["deposit.id"] == deposit_id | |||||
assert deposit_request["deposit_collection.name"] == collection, deposit_request | |||||
if client_name != "": | |||||
# Sometimes it's missing from the commit message | |||||
assert deposit_request["auth_user.username"] == client_name | |||||
# Date of the deposit request (either the initial request or a subsequent one)
date = deposit_request["deposit_request.date"] | |||||
dates.add(date) | |||||
assert deposit_request["deposit.swh_id_context"], deposit_request | |||||
external_identifiers.add(deposit_request["deposit.external_id"]) | |||||
swhids.add(deposit_request["deposit.swh_id_context"]) | |||||
# Client of the deposit | |||||
provider_urls.add(deposit_request["deposit_client.provider_url"]) | |||||
metadata = deposit_request["deposit_request.metadata"] | |||||
if metadata is not None: | |||||
json.dumps(metadata).encode() # check it's valid | |||||
if "@xmlns" in metadata: | |||||
assert metadata["@xmlns"] == ATOM_NS | |||||
assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) | |||||
format = NEW_DEPOSIT_FORMAT | |||||
else: | |||||
assert "{http://www.w3.org/2005/Atom}id" in metadata | |||||
assert ( | |||||
"{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata | |||||
or "{http://www.w3.org/2005/Atom}author" in metadata | |||||
) | |||||
format = OLD_DEPOSIT_FORMAT | |||||
metadata_entries.append((date, format, metadata)) | |||||
if discovery_date is None: | |||||
discovery_date = max(dates) | |||||
# Sanity checks to make sure deposit requests are consistent with each other | |||||
assert len(metadata_entries) >= 1, deposit_id | |||||
assert len(provider_urls) == 1, f"expected 1 provider url, got {provider_urls}" | |||||
(provider_url,) = provider_urls | |||||
assert len(swhids) == 1 | |||||
(swhid,) = swhids | |||||
assert ( | |||||
len(external_identifiers) == 1 | |||||
), f"expected 1 external identifier, got {external_identifiers}" | |||||
(external_identifier,) = external_identifiers | |||||
# compute the origin from the external_identifier if we don't have one
if origin is None: | |||||
origin = f"{provider_url.strip('/')}/{external_identifier}" | |||||
# explicit list of mistakes that happened in the past, but shouldn't | |||||
# happen again: | |||||
if origin == "https://hal.archives-ouvertes.fr/hal-01588781": | |||||
# deposit id 75 | |||||
origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781" | |||||
elif origin == "https://hal.archives-ouvertes.fr/hal-01588782": | |||||
# deposit id 76 | |||||
origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782" | |||||
elif origin == "https://hal.archives-ouvertes.fr/hal-01592430": | |||||
# deposit id 143 | |||||
origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592430" | |||||
elif origin == "https://hal.archives-ouvertes.fr/hal-01588927": | |||||
origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927" | |||||
elif origin == "https://hal.archives-ouvertes.fr/hal-01593875": | |||||
# deposit id 175 | |||||
origin = "https://hal-preprod.archives-ouvertes.fr/hal-01593875" | |||||
elif deposit_id == 160: | |||||
assert origin == "https://www.softwareheritage.org/je-suis-gpl", origin | |||||
origin = "https://forge.softwareheritage.org/source/jesuisgpl/" | |||||
elif origin == "https://hal.archives-ouvertes.fr/hal-01588942": | |||||
# deposit id 90 | |||||
origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942" | |||||
elif origin == "https://hal.archives-ouvertes.fr/hal-01592499": | |||||
# deposit id 162 | |||||
origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592499" | |||||
elif origin == "https://hal.archives-ouvertes.fr/hal-01588935": | |||||
# deposit id 89 | |||||
origin = "https://hal-preprod.archives-ouvertes.fr/hal-01588935" | |||||
assert_origin_exists(storage, origin) | |||||
# check the origin we computed matches the one in the deposit db | |||||
swhid_origin = parse_swhid(swhid).metadata["origin"] | |||||
if origin is not None: | |||||
# explicit list of mistakes that happened in the past, but shouldn't | |||||
# happen again: | |||||
exceptions = [ | |||||
( | |||||
# deposit id 229 | |||||
"https://hal.archives-ouvertes.fr/hal-01243573", | |||||
"https://hal-test.archives-ouvertes.fr/hal-01243573", | |||||
), | |||||
( | |||||
# deposit id 199 | |||||
"https://hal.archives-ouvertes.fr/hal-01243065", | |||||
"https://hal-test.archives-ouvertes.fr/hal-01243065", | |||||
), | |||||
( | |||||
# deposit id 164 | |||||
"https://hal.archives-ouvertes.fr/hal-01593855", | |||||
"https://hal-preprod.archives-ouvertes.fr/hal-01593855", | |||||
), | |||||
] | |||||
if (origin, swhid_origin) not in exceptions: | |||||
assert origin == swhid_origin, ( | |||||
f"the origin we guessed from the deposit db or revision ({origin}) " | |||||
f"doesn't match the one in the deposit db's SWHID ({swhid})" | |||||
) | |||||
authority = MetadataAuthority( | |||||
type=MetadataAuthorityType.DEPOSIT_CLIENT, url=provider_url, metadata={}, | |||||
) | |||||
for (date, format, metadata) in metadata_entries: | |||||
load_metadata( | |||||
storage, | |||||
row["id"], | |||||
date, | |||||
metadata, | |||||
format, | |||||
authority=authority, | |||||
origin=origin, | |||||
dry_run=dry_run, | |||||
) | |||||
return (origin, discovery_date) | |||||
def handle_row(row: Dict[str, Any], storage, deposit_cur, dry_run: bool): | |||||
type_ = row["type"] | |||||
# default date in case we can't find a better one | |||||
discovery_date = row["date"] or row["committer_date"] | |||||
metadata = row["metadata"] | |||||
if metadata is None: | |||||
return | |||||
if type_ == "dsc": | |||||
origin = None # TODO: I can't find how to get it reliably | |||||
# TODO: the debian loader writes the changelog date as the revision's | |||||
# author date and committer date. Instead, we should use the visit's date, | |||||
# but I cannot find a way to reliably get it without the origin | |||||
if "extrinsic" in metadata: | |||||
extrinsic_files = metadata["extrinsic"]["raw"]["files"] | |||||
for artifact_entry in metadata["original_artifact"]: | |||||
extrinsic_file = extrinsic_files[artifact_entry["filename"]] | |||||
for key in ("sha256",): | |||||
assert artifact_entry["checksums"][key] == extrinsic_file[key] | |||||
artifact_entry["url"] = extrinsic_file["uri"] | |||||
del metadata["extrinsic"] | |||||
elif type_ == "tar": | |||||
provider = metadata.get("extrinsic", {}).get("provider") | |||||
if provider is not None: | |||||
# This is the format all the package loaders currently write, and
# it is the easiest, thanks to the 'provider' and 'when' fields,
# which have all the information we need to tell them apart easily
# and generate accurate metadata
discovery_date = iso8601.parse_date(metadata["extrinsic"]["when"]) | |||||
# New versions of the loaders write the provider; use it. | |||||
if provider.startswith("https://replicate.npmjs.com/"): | |||||
# npm loader format 1 | |||||
parsed_url = urlparse(provider) | |||||
assert re.match("^/[^/]+/?$", parsed_url.path), parsed_url | |||||
package_name = unquote(parsed_url.path.strip("/")) | |||||
origin = "https://www.npmjs.com/package/" + package_name | |||||
assert_origin_exists(storage, origin) | |||||
load_metadata( | |||||
storage, | |||||
row["id"], | |||||
discovery_date, | |||||
metadata["extrinsic"]["raw"], | |||||
NPM_FORMAT, | |||||
authority=AUTHORITIES["npmjs"], | |||||
origin=origin, | |||||
dry_run=dry_run, | |||||
) | |||||
del metadata["extrinsic"] | |||||
elif provider.startswith("https://pypi.org/"): | |||||
# pypi loader format 1 | |||||
match = re.match( | |||||
"https://pypi.org/pypi/(?P<project_name>.*)/json", provider | |||||
) | |||||
assert match, f"unexpected provider URL format: {provider}" | |||||
project_name = match.group("project_name") | |||||
origin = f"https://pypi.org/project/{project_name}/" | |||||
assert_origin_exists(storage, origin) | |||||
load_metadata( | |||||
storage, | |||||
row["id"], | |||||
discovery_date, | |||||
metadata["extrinsic"]["raw"], | |||||
PYPI_FORMAT, | |||||
authority=AUTHORITIES["pypi"], | |||||
origin=origin, | |||||
dry_run=dry_run, | |||||
) | |||||
del metadata["extrinsic"] | |||||
elif provider.startswith("https://cran.r-project.org/"): | |||||
# cran loader | |||||
provider = metadata["extrinsic"]["provider"] | |||||
if provider.startswith("https://cran.r-project.org/package="): | |||||
origin = metadata["extrinsic"]["provider"] | |||||
else: | |||||
ardumont: where is the `load_metadata` call in this conditional?
vlorentz: `raw_extrinsic_metadata == {}`, so there is no metadata to load
package_name = cran_package_from_url(provider) | |||||
origin = f"https://cran.r-project.org/package={package_name}" | |||||
# TODO https://forge.softwareheritage.org/T2536 | |||||
assert origin is not None | |||||
if ( | |||||
hashlib.sha1(origin.encode()).digest() not in _origins | |||||
and storage.origin_get([origin])[0] is None | |||||
): | |||||
print("MISSING CRAN ORIGIN", hash_to_hex(row["id"]), origin) | |||||
return | |||||
raw_extrinsic_metadata = metadata["extrinsic"]["raw"] | |||||
# this is actually intrinsic, ignore it | |||||
del raw_extrinsic_metadata["version"] | |||||
# Copy the URL to the original_artifacts metadata | |||||
assert len(metadata["original_artifact"]) == 1 | |||||
assert "url" not in metadata["original_artifact"][0] | |||||
metadata["original_artifact"][0]["url"] = raw_extrinsic_metadata["url"] | |||||
del raw_extrinsic_metadata["url"] | |||||
assert ( | |||||
raw_extrinsic_metadata == {} | |||||
), f"Unexpected metadata keys: {list(raw_extrinsic_metadata)}" | |||||
del metadata["extrinsic"] | |||||
elif provider.startswith("https://nix-community.github.io/nixpkgs-swh/"): | |||||
# nixguix loader | |||||
origin = provider | |||||
assert_origin_exists(storage, origin) | |||||
authority = MetadataAuthority( | |||||
type=MetadataAuthorityType.FORGE, url=provider, metadata={}, | |||||
) | |||||
assert row["date"] is None # the nixguix loader does not write dates | |||||
load_metadata( | |||||
storage, | |||||
ardumont: same `load_metadata` call?
vlorentz: `metadata["extrinsic"]["raw"] == {}`, so there is no metadata to load either
row["id"], | |||||
discovery_date, | |||||
metadata["extrinsic"]["raw"], | |||||
NIXGUIX_FORMAT, | |||||
authority=authority, | |||||
origin=origin, | |||||
dry_run=dry_run, | |||||
) | |||||
del metadata["extrinsic"] | |||||
elif provider.startswith("https://ftp.gnu.org/"): | |||||
# archive loader format 1 | |||||
origin = provider | |||||
assert_origin_exists(storage, origin) | |||||
assert len(metadata["original_artifact"]) == 1 | |||||
metadata["original_artifact"][0]["url"] = metadata["extrinsic"]["raw"][ | |||||
"url" | |||||
] | |||||
# Remove duplicate keys of original_artifacts | |||||
for key in ("url", "time", "length", "version", "filename"): | |||||
del metadata["extrinsic"]["raw"][key] | |||||
assert metadata["extrinsic"]["raw"] == {} | |||||
del metadata["extrinsic"] | |||||
elif provider.startswith("https://deposit.softwareheritage.org/"): | |||||
origin = metadata["extrinsic"]["raw"]["origin"]["url"] | |||||
assert_origin_exists(storage, origin) | |||||
if "@xmlns" in metadata: | |||||
assert metadata["@xmlns"] == ATOM_NS | |||||
assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) | |||||
assert "intrinsic" not in metadata | |||||
assert "extra_headers" not in metadata | |||||
# deposit loader format 1 | |||||
# in this case, the metadata seems to be both directly in metadata | |||||
# and in metadata["extrinsic"]["raw"]["metadata"] | |||||
(origin, discovery_date) = handle_deposit_row( | |||||
row, discovery_date, origin, storage, deposit_cur, dry_run | |||||
) | |||||
remove_atom_codemeta_metadata_with_xmlns(metadata) | |||||
if "client" in metadata: | |||||
del metadata["client"] | |||||
del metadata["extrinsic"] | |||||
else: | |||||
# deposit loader format 2 | |||||
actual_metadata = metadata["extrinsic"]["raw"]["origin_metadata"][ | |||||
"metadata" | |||||
] | |||||
if "@xmlns" in actual_metadata: | |||||
assert actual_metadata["@xmlns"] == ATOM_NS | |||||
assert actual_metadata["@xmlns:codemeta"] in ( | |||||
CODEMETA_NS, | |||||
[CODEMETA_NS], | |||||
) | |||||
else: | |||||
assert "{http://www.w3.org/2005/Atom}id" in actual_metadata | |||||
assert ( | |||||
"{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" | |||||
in actual_metadata | |||||
) | |||||
(origin, discovery_date) = handle_deposit_row( | |||||
row, discovery_date, origin, storage, deposit_cur, dry_run | |||||
) | |||||
del metadata["extrinsic"] | |||||
else: | |||||
assert False, f"unknown provider {provider}" | |||||
# Older versions don't write the provider; use heuristics instead. | |||||
elif ( | |||||
metadata.get("package_source", {}) | |||||
.get("url", "") | |||||
.startswith("https://registry.npmjs.org/") | |||||
): | |||||
# npm loader format 2 | |||||
package_source_url = metadata["package_source"]["url"] | |||||
package_name = npm_package_from_source_url(package_source_url) | |||||
origin = "https://www.npmjs.com/package/" + package_name | |||||
assert_origin_exists(storage, origin) | |||||
load_metadata( | |||||
storage, | |||||
row["id"], | |||||
discovery_date, | |||||
metadata["package"], | |||||
NPM_FORMAT, | |||||
authority=AUTHORITIES["npmjs"], | |||||
origin=origin, | |||||
dry_run=dry_run, | |||||
) | |||||
del metadata["package"] | |||||
assert "original_artifact" not in metadata | |||||
# rebuild an "original_artifact"-like metadata dict from what we | |||||
# can salvage of "package_source" | |||||
package_source_metadata = metadata["package_source"] | |||||
keep_keys = {"blake2s256", "filename", "sha1", "sha256", "url"} | |||||
discard_keys = { | |||||
"date", # is equal to the revision date | |||||
"name", # was loaded above | |||||
"version", # same | |||||
} | |||||
assert ( | |||||
set(package_source_metadata) == keep_keys | discard_keys | |||||
), package_source_metadata | |||||
# will be loaded below | |||||
metadata["original_artifact"] = [ | |||||
{ | |||||
"filename": package_source_metadata["filename"], | |||||
"checksums": { | |||||
"sha1": package_source_metadata["sha1"], | |||||
"sha256": package_source_metadata["sha256"], | |||||
"blake2s256": package_source_metadata["blake2s256"], | |||||
}, | |||||
"url": package_source_metadata["url"], | |||||
} | |||||
] | |||||
del metadata["package_source"] | |||||
elif "@xmlns" in metadata: | |||||
assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) | |||||
assert "intrinsic" not in metadata | |||||
assert "extra_headers" not in metadata | |||||
# deposit loader format 3 | |||||
if row["message"] == b"swh: Deposit 159 in collection swh": | |||||
# There is no deposit 159 in the deposit DB, for some reason | |||||
assert ( | |||||
hash_to_hex(row["id"]) == "8e9cee14a6ad39bca4347077b87fb5bbd8953bb1" | |||||
) | |||||
return | |||||
elif row["message"] == b"hal: Deposit 342 in collection hal": | |||||
# They have status 'failed' and no swhid | |||||
return | |||||
origin = None # TODO | |||||
discovery_date = None # TODO | |||||
(origin, discovery_date) = handle_deposit_row( | |||||
row, discovery_date, origin, storage, deposit_cur, dry_run | |||||
) | |||||
remove_atom_codemeta_metadata_with_xmlns(metadata) | |||||
if "client" in metadata: | |||||
del metadata["client"] # found in the deposit db | |||||
if "committer" in metadata: | |||||
del metadata["committer"] # found on the revision object | |||||
elif "{http://www.w3.org/2005/Atom}id" in metadata: | |||||
assert ( | |||||
"{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata | |||||
or "{http://www.w3.org/2005/Atom}author" in metadata | |||||
) | |||||
assert "intrinsic" not in metadata | |||||
assert "extra_headers" not in metadata | |||||
# deposit loader format 4 | |||||
origin = None | |||||
discovery_date = None # TODO | |||||
(origin, discovery_date) = handle_deposit_row( | |||||
row, discovery_date, origin, storage, deposit_cur, dry_run | |||||
) | |||||
remove_atom_codemeta_metadata_without_xmlns(metadata) | |||||
elif hash_to_hex(row["id"]) == "a86747d201ab8f8657d145df4376676d5e47cf9f": | |||||
# deposit 91 is missing "{http://www.w3.org/2005/Atom}id" for some
# reason, and has an invalid origin
return | |||||
elif ( | |||||
isinstance(metadata.get("original_artifact"), dict) | |||||
and metadata["original_artifact"]["url"].startswith( | |||||
"https://files.pythonhosted.org/" | |||||
) | |||||
) or ( | |||||
isinstance(metadata.get("original_artifact"), list) | |||||
and len(metadata.get("original_artifact")) == 1 | |||||
and metadata["original_artifact"][0] | |||||
.get("url", "") | |||||
.startswith("https://files.pythonhosted.org/") | |||||
): | |||||
if isinstance(metadata.get("original_artifact"), dict): | |||||
metadata["original_artifact"] = [metadata["original_artifact"]] | |||||
assert len(metadata["original_artifact"]) == 1 | |||||
# it's tempting here to do this: | |||||
# | |||||
# project_name = pypi_project_from_filename( | |||||
# metadata["original_artifact"][0]["filename"] | |||||
# ) | |||||
# origin = f"https://pypi.org/project/{project_name}/" | |||||
# assert_origin_exists(storage, origin) | |||||
# | |||||
# but unfortunately, the filename is user-provided, and doesn't | |||||
# necessarily match the package name on pypi. | |||||
# TODO: on second thoughts, I think we can use this as a heuristic, | |||||
# then double-check by listing visits and snapshots from the origin; | |||||
# it should work for most packages. | |||||
origin = None | |||||
if "project" in metadata: | |||||
# pypi loader format 2 | |||||
# same reason as above, we can't do this: | |||||
# if metadata["project"]: | |||||
# assert metadata["project"]["name"] == project_name | |||||
load_metadata( | |||||
storage, | |||||
row["id"], | |||||
discovery_date, | |||||
metadata["project"], | |||||
PYPI_FORMAT, | |||||
authority=AUTHORITIES["pypi"], | |||||
origin=origin, | |||||
dry_run=dry_run, | |||||
) | |||||
del metadata["project"] | |||||
else: | |||||
assert set(metadata) == {"original_artifact"}, set(metadata) | |||||
# pypi loader format 3 | |||||
pass # nothing to do, there's no metadata | |||||
elif row["message"] == b"synthetic revision message": | |||||
assert isinstance(metadata["original_artifact"], list), metadata | |||||
assert not any("url" in d for d in metadata["original_artifact"]) | |||||
# archive loader format 2 | |||||
origin = None | |||||
elif deposit_revision_message_re.match(row["message"]): | |||||
# deposit without metadata in the revision | |||||
assert set(metadata) == {"original_artifact"}, metadata | |||||
origin = None # TODO | |||||
discovery_date = None | |||||
(origin, discovery_date) = handle_deposit_row( | |||||
row, discovery_date, origin, storage, deposit_cur, dry_run | |||||
) | |||||
else: | |||||
assert False, f"Unable to detect type of metadata for row: {row}" | |||||
# Ignore common intrinsic metadata keys | |||||
for key in ("intrinsic", "extra_headers"): | |||||
if key in metadata: | |||||
del metadata[key] | |||||
# Ignore loader-specific intrinsic metadata keys | |||||
if type_ == "hg": | |||||
del metadata["node"] | |||||
elif type_ == "dsc": | |||||
if "package_info" in metadata: | |||||
del metadata["package_info"] | |||||
if "original_artifact" in metadata: | |||||
for original_artifact in metadata["original_artifact"]: | |||||
# Rename keys to the expected format of original-artifacts-json. | |||||
rename_keys = [ | |||||
("name", "filename"), # eg. from old Debian loader | |||||
("size", "length"), # eg. from old PyPI loader | |||||
] | |||||
for (old_name, new_name) in rename_keys: | |||||
if old_name in original_artifact: | |||||
assert new_name not in original_artifact | |||||
original_artifact[new_name] = original_artifact.pop(old_name) | |||||
# Move the checksums to their own subdict, which is the expected format | |||||
# of original-artifacts-json. | |||||
if "sha1" in original_artifact: | |||||
assert "checksums" not in original_artifact | |||||
original_artifact["checksums"] = {} | |||||
for key in ("sha1", "sha256", "sha1_git", "blake2s256"): | |||||
if key in original_artifact: | |||||
original_artifact["checksums"][key] = original_artifact.pop(key) | |||||
if "date" in original_artifact: | |||||
# The information comes from the package repository rather than SWH, | |||||
# so it shouldn't be in the 'original-artifacts' metadata | |||||
# (which has SWH as authority). | |||||
# Moreover, it's not very useful information, so let's just drop it.
del original_artifact["date"] | |||||
allowed_keys = { | |||||
"checksums", | |||||
"filename", | |||||
"length", | |||||
"url", | |||||
"archive_type", | |||||
} | |||||
assert set(original_artifact) <= allowed_keys, set(original_artifact) | |||||
load_metadata( | |||||
storage, | |||||
row["id"], | |||||
discovery_date, | |||||
metadata["original_artifact"], | |||||
ORIGINAL_ARTIFACT_FORMAT, | |||||
authority=AUTHORITIES["swh"], | |||||
origin=origin, | |||||
dry_run=dry_run, | |||||
) | |||||
del metadata["original_artifact"] | |||||
assert metadata == {}, ( | |||||
f"remaining metadata keys for {row['id'].hex()} (type: {row['type']}): " | |||||
f"{metadata}" | |||||
) | |||||
def create_fetchers(db): | |||||
with db.cursor() as cur: | |||||
cur.execute( | |||||
""" | |||||
INSERT INTO metadata_fetcher (name, version, metadata) | |||||
VALUES (%s, %s, %s) | |||||
ON CONFLICT DO NOTHING | |||||
""", | |||||
(FETCHER.name, FETCHER.version, FETCHER.metadata), | |||||
) | |||||
def main(storage_dbconn, storage_url, deposit_dbconn, first_id, dry_run): | |||||
storage_db = BaseDb.connect(storage_dbconn) | |||||
deposit_db = BaseDb.connect(deposit_dbconn) | |||||
storage = get_storage("remote", url=storage_url) | |||||
if not dry_run: | |||||
create_fetchers(storage_db) | |||||
# Not creating authorities, as the loaders are presumably already running | |||||
# and created them already. | |||||
# This also helps make sure this script doesn't accidentally create | |||||
# authorities that differ from what the loaders use. | |||||
total_rows = 0 | |||||
with storage_db.cursor() as read_cur: | |||||
with deposit_db.cursor() as deposit_cur: | |||||
after_id = first_id | |||||
while True: | |||||
read_cur.execute( | |||||
f"SELECT {', '.join(REVISION_COLS)} FROM revision " | |||||
f"WHERE id > %s AND metadata IS NOT NULL ORDER BY id LIMIT 1000", | |||||
(after_id,), | |||||
) | |||||
new_rows = 0 | |||||
for row in read_cur: | |||||
row_d = dict(zip(REVISION_COLS, row)) | |||||
handle_row(row_d, storage, deposit_cur, dry_run) | |||||
new_rows += 1 | |||||
if new_rows == 0: | |||||
break | |||||
after_id = row_d["id"] | |||||
total_rows += new_rows | |||||
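# Revision ids are sha1_git hashes, hence roughly uniformly distributed, so the
# first four bytes of the last processed id approximate overall progress.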
percents = ( | |||||
int.from_bytes(after_id[0:4], byteorder="big") * 100 / (1 << 32) | |||||
) | |||||
print( | |||||
f"Migrated {total_rows/1000000.:.2f}M rows " | |||||
f"(~{percents:.1f}%, last revision: {after_id.hex()})" | |||||
) | |||||
if __name__ == "__main__": | |||||
if len(sys.argv) == 4: | |||||
(_, storage_dbconn, storage_url, deposit_dbconn) = sys.argv | |||||
first_id = "00" * 20 | |||||
elif len(sys.argv) == 5: | |||||
(_, storage_dbconn, storage_url, deposit_dbconn, first_id) = sys.argv | |||||
else: | |||||
print( | |||||
f"Syntax: {sys.argv[0]} <storage_dbconn> <storage_url> " | |||||
f"<deposit_dbconn> [<first id>]" | |||||
) | |||||
exit(1) | |||||
if os.path.isfile("./origins.txt"): | |||||
# You can generate this file with: | |||||
# psql service=swh-replica \ | |||||
# -c "\copy (select digest(url, 'sha1') from origin) to stdout" \ | |||||
# | pv -l > origins.txt | |||||
print("Loading origins...") | |||||
with open("./origins.txt") as fd: | |||||
for line in fd: | |||||
digest = line.strip()[3:] | |||||
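# Each line presumably looks like "\\x" followed by 40 hex digits (bytea in
# COPY text format escapes the backslash), so [3:] drops that prefix.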
_origins.add(bytes.fromhex(digest)) | |||||
print("Done loading origins.") | |||||
main(storage_dbconn, storage_url, deposit_dbconn, bytes.fromhex(first_id), True) |