#!/usr/bin/env python3

# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Walk the ``revision`` table and check that every metadata key is understood.

For each revision that has metadata, the known extrinsic and intrinsic keys
are stripped from the in-memory metadata dict according to the loader format
that produced it, and the script asserts that nothing unrecognized is left.
Revisions created by the deposit loader are additionally cross-checked against
the deposit database.

As written, nothing is written back to the storage database: the write cursor
is only passed around, and the entry point calls ``main`` with
``dry_run=True``.
"""

import json
import re
import sys
from typing import Any, Dict

from swh.core.db import BaseDb
from swh.model.model import MetadataAuthority, MetadataAuthorityType

CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"

ATOM_NS = "http://www.w3.org/2005/Atom"
ATOM_KEYS = ["id", "author", "external_identifier", "title"]

# Columns selected from the revision table, in the order they are zipped
# back into a dict in main().
REVISION_COLS = ["id", "date", "date_offset", "type", "message", "metadata"]

# Columns selected from the deposit database, in the order they are zipped
# back into a dict in handle_deposit_row().
DEPOSIT_COLS = [
    "deposit.id",
    "deposit_request.metadata",
    "deposit_client.provider_url",
    "deposit_collection.name",
    "auth_user.username",
]

FETCHERS = {}
AUTHORITIES = {
    "npmjs.org": MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}
    ),
    "pypi.org": MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={}
    ),
}

# Parses the message of a revision created by the deposit loader.
# The group names must match the .group(...) lookups in handle_deposit_row().
deposit_revision_message_re = re.compile(
    b"(?P<client>[a-z]*): "
    b"Deposit (?P<deposit_id>[0-9]+) in collection (?P<collection>[a-z]+).*"
)


def remove_atom_codemeta_metadata_with_xmlns(metadata):
    """Delete, in place, Atom/CodeMeta keys written in the prefixed form.

    Removes every key that starts with ``codemeta:``, the keys listed in
    ``ATOM_KEYS``, and the ``@xmlns`` / ``@xmlns:codemeta`` declarations.
    """
    keys_to_remove = ATOM_KEYS + ["@xmlns", "@xmlns:codemeta"]
    # Iterate over a snapshot of the keys, as the dict is mutated in the loop.
    for key in list(metadata):
        if key.startswith("codemeta:") or key in keys_to_remove:
            del metadata[key]


def remove_atom_codemeta_metadata_without_xmlns(metadata):
    """Delete, in place, Atom/CodeMeta keys written in the ``{namespace}name`` form."""
    # Iterate over a snapshot of the keys, as the dict is mutated in the loop.
    for key in list(metadata):
        if key.startswith(("{%s}" % ATOM_NS, "{%s}" % CODEMETA_NS)):
            del metadata[key]


def handle_deposit_row(row, write_cur, deposit_cur, dry_run):
    """Cross-check a deposit-loader revision against the deposit database.

    The deposit id, collection and client are parsed from ``row["message"]``,
    then every matching deposit request is fetched through ``deposit_cur`` and
    compared with the parsed values.

    ``write_cur`` and ``dry_run`` are accepted but not used by this function.

    Raises:
        AssertionError: if the message does not match the expected pattern,
            if a fetched row disagrees with the message, or if no deposit
            request with non-null metadata was found.
    """
    parsed_message = deposit_revision_message_re.match(row["message"])
    assert parsed_message is not None, row["message"]

    deposit_id = int(parsed_message.group("deposit_id"))
    collection = parsed_message.group("collection").decode()
    client_name = parsed_message.group("client").decode()

    deposit_cur.execute(
        f"SELECT {', '.join(DEPOSIT_COLS)} FROM deposit "
        f"INNER JOIN deposit_collection "
        f" ON (deposit.collection_id=deposit_collection.id) "
        f"INNER JOIN deposit_client ON (deposit.client_id=deposit_client.user_ptr_id) "
        f"INNER JOIN auth_user ON (deposit.client_id=auth_user.id) "
        f"INNER JOIN deposit_request ON (deposit.id=deposit_request.deposit_id) "
        f"WHERE deposit.id = %s",
        (deposit_id,),
    )

    nb_metadata = 0
    for deposit_request_row in deposit_cur:
        deposit_request = dict(zip(DEPOSIT_COLS, deposit_request_row))

        # Sanity checks to make sure we selected the right deposit
        assert deposit_request["deposit.id"] == deposit_id
        assert deposit_request["deposit_collection.name"] == collection, deposit_request
        if client_name != "":
            # Sometimes it's missing from the commit message
            assert deposit_request["auth_user.username"] == client_name

        metadata = deposit_request["deposit_request.metadata"]
        if metadata is not None:
            json.dumps(metadata).encode()  # check it's valid
            nb_metadata += 1

    assert nb_metadata >= 1, deposit_id


def handle_row(row: Dict[str, Any], write_cur, deposit_cur, dry_run: bool):
    """Strip every recognized key from ``row["metadata"]`` and check none remain.

    The metadata dict is mutated in place. The branch taken depends on
    ``row["type"]`` and, for ``tar`` revisions, on the provider URL or on
    which keys are present. Rows whose metadata is ``None`` are skipped.

    ``write_cur``, ``deposit_cur`` and ``dry_run`` are only forwarded to
    ``handle_deposit_row``.

    Raises:
        AssertionError: if a sanity check fails, if the provider is unknown,
            or if unrecognized keys remain once all known keys are removed.
        KeyError: if a key that is deleted unconditionally is missing.
    """
    type_ = row["type"]

    metadata = row["metadata"]

    if metadata is None:
        return

    if type_ == "dsc":
        if "extrinsic" in metadata:
            extrinsic_files = metadata["extrinsic"]["raw"]["files"]
            for artifact_entry in metadata["original_artifact"]:
                extrinsic_file = extrinsic_files[artifact_entry["filename"]]
                for key in ("sha256",):
                    assert artifact_entry["checksums"][key] == extrinsic_file[key]
                # Carry the URI over to the artifact before dropping "extrinsic".
                artifact_entry["url"] = extrinsic_file["uri"]
            del metadata["extrinsic"]

    elif type_ == "tar":
        provider = metadata.get("extrinsic", {}).get("provider")
        if provider is not None:
            # New versions of the loaders write the provider; use it.
            if provider.startswith("https://replicate.npmjs.com/"):
                # npm loader format 1
                del metadata["extrinsic"]  # TODO: load

            elif provider.startswith("https://pypi.org/"):
                # pypi loader format 1
                del metadata["extrinsic"]  # TODO: load

            elif provider.startswith("https://cran.r-project.org/"):
                # cran loader
                del metadata["extrinsic"]  # TODO: load

            elif provider.startswith("https://nix-community.github.io/nixpkgs-swh/"):
                # nixguix loader
                del metadata["extrinsic"]  # TODO: load

            elif provider.startswith("https://ftp.gnu.org/"):
                # archive loader
                del metadata["extrinsic"]  # TODO: load

            elif provider.startswith("https://deposit.softwareheritage.org/"):
                if "@xmlns" in metadata:
                    assert metadata["@xmlns"] == ATOM_NS
                    assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
                    assert "intrinsic" not in metadata
                    assert "extra_headers" not in metadata

                    # deposit loader format 1
                    # (pretty rare? In id order, the first revision with this format
                    # is 022310df16fd9e4d4f81fe36a142e82db977c01d)
                    # in this case, the metadata seems to be both directly in metadata
                    # and in metadata["extrinsic"]["raw"]["metadata"]

                    handle_deposit_row(row, write_cur, deposit_cur, dry_run)

                    remove_atom_codemeta_metadata_with_xmlns(metadata)
                    del metadata["client"]
                    del metadata["extrinsic"]
                else:
                    # deposit loader format 2

                    handle_deposit_row(row, write_cur, deposit_cur, dry_run)

                    del metadata["extrinsic"]

            else:
                assert False, f"unknown provider {provider}"

        # Older versions don't write the provider; use heuristics instead.
        elif (
            metadata.get("package_source", {})
            .get("url", "")
            .startswith("https://registry.npmjs.org/")
        ):
            # npm loader format 2
            del metadata["package"]
            del metadata["package_source"]  # TODO: load

        elif "project" in metadata:
            assert metadata["original_artifact"]["url"].startswith(
                "https://files.pythonhosted.org/"
            )

            # pypi loader format 2
            del metadata["project"]  # TODO: load

        elif "@xmlns" in metadata:
            assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
            assert "intrinsic" not in metadata
            assert "extra_headers" not in metadata

            # deposit loader format 3 and 4
            handle_deposit_row(row, write_cur, deposit_cur, dry_run)
            remove_atom_codemeta_metadata_with_xmlns(metadata)
            del metadata["client"]  # found in the deposit db
            if "committer" in metadata:
                del metadata["committer"]  # found on the revision object

        elif "{http://www.w3.org/2005/Atom}id" in metadata:
            assert "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata
            assert "intrinsic" not in metadata
            assert "extra_headers" not in metadata

            # deposit loader format 5
            handle_deposit_row(row, write_cur, deposit_cur, dry_run)
            remove_atom_codemeta_metadata_without_xmlns(metadata)

    # Remove common intrinsic metadata keys
    for key in ("intrinsic", "extra_headers"):
        if key in metadata:
            del metadata[key]

    # Remove loader-specific intrinsic metadata keys
    if type_ == "hg":
        del metadata["node"]
    elif type_ == "dsc":
        if "package_info" in metadata:
            del metadata["package_info"]

    for key in ("original_artifact",):
        # TODO: send them
        if key in metadata:
            del metadata[key]

    # Anything still present is a key this script does not know about.
    assert metadata == {}, (
        f"remaining metadata keys for {row['id'].hex()} (type: {row['type']}): "
        f"{metadata}"
    )


def create_fetchers(db):
    """Insert every fetcher from ``FETCHERS`` into ``metadata_fetcher``.

    Rows that conflict with an existing one are left alone
    (``ON CONFLICT DO NOTHING``). ``FETCHERS`` is empty in this file, so the
    loop currently executes no statement.
    """
    with db.cursor() as cur:
        for fetcher in FETCHERS.values():
            cur.execute(
                """
                INSERT INTO metadata_fetcher (name, version, metadata)
                VALUES (%s, %s, %s)
                ON CONFLICT DO NOTHING
                """,
                (fetcher.name, fetcher.version, fetcher.metadata),
            )


def main(storage_dbconn, deposit_dbconn, first_id, dry_run):
    """Process every revision with metadata whose id is greater than ``first_id``.

    Revisions are read from the storage database in id order, in batches of
    1000, and each one is passed to ``handle_row``. A progress line is printed
    after every batch.

    Args:
        storage_dbconn: connection string passed to ``BaseDb.connect`` for the
            storage database.
        deposit_dbconn: connection string passed to ``BaseDb.connect`` for the
            deposit database.
        first_id: revision id (bytes); only rows with a strictly greater id
            are processed.
        dry_run: when false, ``create_fetchers`` is called before the scan.
    """
    storage_db = BaseDb.connect(storage_dbconn)
    deposit_db = BaseDb.connect(deposit_dbconn)

    if not dry_run:
        create_fetchers(storage_db)
        # Not creating authorities, as the loaders are presumably already running
        # and created them already.
        # This also helps make sure this script doesn't accidentally create
        # authorities that differ from what the loaders use.

    total_rows = 0
    with storage_db.cursor() as read_cur, storage_db.cursor() as write_cur:
        with deposit_db.cursor() as deposit_cur:
            after_id = first_id
            while True:
                read_cur.execute(
                    f"SELECT {', '.join(REVISION_COLS)} FROM revision "
                    f"WHERE id > %s AND metadata IS NOT NULL ORDER BY id LIMIT 1000",
                    (after_id,),
                )
                new_rows = 0
                for row in read_cur:
                    row_d = dict(zip(REVISION_COLS, row))
                    handle_row(row_d, write_cur, deposit_cur, dry_run)
                    new_rows += 1

                if new_rows == 0:
                    break

                # Rows come back ordered by id, so the last one seen is the
                # starting point of the next batch.
                after_id = row_d["id"]

                total_rows += new_rows
                # Progress estimate: position of the first four bytes of the
                # last id within the 32-bit range.
                percents = (
                    int.from_bytes(after_id[0:4], byteorder="big") * 100 / (1 << 32)
                )
                print(
                    f"Migrated {total_rows/1000000.:.2f}M rows "
                    f"(~{percents:.1f}%, last revision: {after_id.hex()})"
                )


if __name__ == "__main__":
    if len(sys.argv) == 3:
        (_, storage_dbconn, deposit_dbconn) = sys.argv
        # Default starting point: the all-zero 20-byte id, in hexadecimal.
        first_id = "00" * 20
    elif len(sys.argv) == 4:
        (_, storage_dbconn, deposit_dbconn, first_id) = sys.argv
    else:
        print(
            f"Syntax: {sys.argv[0]} <storage_dbconn> <deposit_dbconn> [<first_id>]"
        )
        sys.exit(1)
    main(storage_dbconn, deposit_dbconn, bytes.fromhex(first_id), True)