diff --git a/mypy.ini b/mypy.ini index 99c0bcc6..da53e716 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,60 +1,63 @@ [mypy] namespace_packages = True # due to the conditional import logic on swh.journal, in some cases a specific # type: ignore is needed, in other it isn't... warn_unused_ignores = False # support for sqlalchemy magic: see https://github.com/dropbox/sqlalchemy-stubs plugins = sqlmypy # 3rd party libraries without stubs (yet) [mypy-cassandra.*] ignore_missing_imports = True [mypy-confluent_kafka.*] ignore_missing_imports = True [mypy-deprecated.*] ignore_missing_imports = True # only shipped indirectly via hypothesis [mypy-django.*] ignore_missing_imports = True +[mypy-iso8601.*] +ignore_missing_imports = True + [mypy-msgpack.*] ignore_missing_imports = True [mypy-multiprocessing.util] ignore_missing_imports = True [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-psycopg2.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True [mypy-pytest_cov.*] ignore_missing_imports = True [mypy-pytest_kafka.*] ignore_missing_imports = True [mypy-systemd.daemon.*] ignore_missing_imports = True [mypy-tenacity.*] ignore_missing_imports = True # temporary work-around for landing typing support in spite of the current # journal<->storage dependency loop [mypy-swh.journal.*] ignore_missing_imports = True [mypy-pytest_postgresql.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 619b1325..3fdbb845 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,11 @@ click flask psycopg2 vcversioner aiohttp tenacity cassandra-driver >= 3.19.0, != 3.21.0 deprecated typing-extensions mypy_extensions +iso8601 diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py new file mode 100644 index 00000000..78172083 --- /dev/null +++ b/swh/storage/migrate_extrinsic_metadata.py @@ -0,0 +1,903 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2020 The Software Heritage developers 
+# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""This is an executable script to migrate extrinsic revision metadata from +the revision table to the new extrinsic metadata storage. + +This is designed to be as conservative as possible, following this principle: +for each revision the script reads (in "handle_row"), it will read some of the +fields, write them directly to the metadata storage, and remove them. +Then it checks all the remaining fields are in a hardcoded list of fields that +are known not to require migration. + +This means that every field that isn't migrated was explicitly reviewed while +writing this script. + +Additionally, this script contains many assertions to prevent false positives +in its heuristics. +""" + +import datetime +import hashlib +import json +import os +import re +import sys +from typing import Any, Dict, Optional +from urllib.parse import unquote, urlparse + +import iso8601 + +from swh.core.db import BaseDb +from swh.model.hashutil import hash_to_hex +from swh.model.identifiers import SWHID, parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + RawExtrinsicMetadata, +) +from swh.storage import get_storage + +# XML namespaces and fields for metadata coming from the deposit: + +CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" +ATOM_NS = "http://www.w3.org/2005/Atom" +ATOM_KEYS = ["id", "author", "external_identifier", "title"] + +# columns of the revision table (of the storage DB) +REVISION_COLS = ["id", "date", "committer_date", "type", "message", "metadata"] + +# columns of the tables of the deposit DB +DEPOSIT_COLS = [ + "deposit.id", + "deposit.external_id", + "deposit.swh_id_context", + "deposit.status", + "deposit_request.metadata", + "deposit_request.date", + "deposit_client.provider_url", 
+ "deposit_collection.name", + "auth_user.username", +] + +# Formats we write to the extrinsic metadata storage +OLD_DEPOSIT_FORMAT = ( + "sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces" # before february 2018 +) +NEW_DEPOSIT_FORMAT = "sword-v2-atom-codemeta-v2-in-json" # after february 2018 +GNU_FORMAT = "gnu-tree-json" +NIXGUIX_FORMAT = "nixguix-sources-json" +NPM_FORMAT = "replicate-npm-package-json" +ORIGINAL_ARTIFACT_FORMAT = "original-artifacts-json" +PYPI_FORMAT = "pypi-project-json" + +# Information about this script, for traceability +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", +) + +# Authorities that we got the metadata from +AUTHORITIES = { + "npmjs": MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={} + ), + "pypi": MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={} + ), + "gnu": MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://ftp.gnu.org/", metadata={} + ), + "swh": MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, + ), # for original_artifact (which are checksums computed by SWH) +} + +# Regular expression for the format of revision messages written by the +# deposit loader +deposit_revision_message_re = re.compile( + b"(?P[a-z]*): " + b"Deposit (?P[0-9]+) in collection (?P[a-z]+).*" +) + + +# not reliable, because PyPI allows arbitrary names +def pypi_project_from_filename(filename): + match = re.match( + r"^(?P[a-zA-Z0-9_.-]+)" + r"-[0-9.]+([a-z]+[0-9]+)?(\.dev[0-9]+)?\.(tar\.gz|zip)$", + filename, + ) + assert match, filename + return match.group("project_name") + + +def cran_package_from_url(filename): + match = re.match( + r"^https://cran\.r-project\.org/src/contrib/" + r"(?P[a-zA-Z0-9.]+)_[0-9.-]+(\.tar\.gz)?$", + filename, + ) + assert match, filename + return match.group("package_name") + + +def 
npm_package_from_source_url(package_source_url): + match = re.match( + "^https://registry.npmjs.org/(?P.*)/-/[^/]+.tgz$", + package_source_url, + ) + assert match, package_source_url + return unquote(match.group("package_name")) + + +def remove_atom_codemeta_metadata_with_xmlns(metadata): + """Removes all known Atom and Codemeta metadata fields from the dict, + assuming this is a dict generated by xmltodict without expanding namespaces. + """ + keys_to_remove = ATOM_KEYS + ["@xmlns", "@xmlns:codemeta"] + for key in list(metadata): + if key.startswith("codemeta:") or key in keys_to_remove: + del metadata[key] + + +def remove_atom_codemeta_metadata_without_xmlns(metadata): + """Removes all known Atom and Codemeta metadata fields from the dict, + assuming this is a dict generated by xmltodict with expanded namespaces. + """ + for key in list(metadata): + if key.startswith(("{%s}" % ATOM_NS, "{%s}" % CODEMETA_NS)): + del metadata[key] + + +# Cache of origins that are known to exist +_origins = set() + + +def assert_origin_exists(storage, origin): + assert ( + hashlib.sha1(origin.encode()).digest() in _origins # very fast + or storage.origin_get([origin])[0] is not None # slow, but up to date + ), origin + + +def load_metadata( + storage, + revision_id, + discovery_date: datetime.datetime, + metadata: Dict[str, Any], + format: str, + authority: MetadataAuthority, + origin: Optional[str], + dry_run: bool, +): + """Does the actual loading to swh-storage.""" + revision_swhid = SWHID(object_type="revision", object_id=hash_to_hex(revision_id)) + obj = RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=revision_swhid, + discovery_date=discovery_date, + authority=authority, + fetcher=FETCHER, + format=format, + metadata=json.dumps(metadata).encode(), + origin=origin, + ) + if not dry_run: + storage.raw_extrinsic_metadata_add([obj]) + + +def handle_deposit_row( + row, + discovery_date: Optional[datetime.datetime], + origin, + storage, + deposit_cur, + dry_run: bool, 
+): + """Loads metadata from the deposit database (which is more reliable as the + metadata on the revision object, as some versions of the deposit loader were + a bit lossy; and they used very different format for the field in the + revision table). + """ + parsed_message = deposit_revision_message_re.match(row["message"]) + assert parsed_message is not None, row["message"] + + deposit_id = int(parsed_message.group("deposit_id")) + collection = parsed_message.group("collection").decode() + client_name = parsed_message.group("client").decode() + + deposit_cur.execute( + f"SELECT {', '.join(DEPOSIT_COLS)} FROM deposit " + f"INNER JOIN deposit_collection " + f" ON (deposit.collection_id=deposit_collection.id) " + f"INNER JOIN deposit_client ON (deposit.client_id=deposit_client.user_ptr_id) " + f"INNER JOIN auth_user ON (deposit.client_id=auth_user.id) " + f"INNER JOIN deposit_request ON (deposit.id=deposit_request.deposit_id) " + f"WHERE deposit.id = %s", + (deposit_id,), + ) + + provider_urls = set() + swhids = set() + metadata_entries = [] + dates = set() + external_identifiers = set() + for deposit_request_row in deposit_cur: + deposit_request = dict(zip(DEPOSIT_COLS, deposit_request_row)) + + # Sanity checks to make sure we selected the right deposit + assert deposit_request["deposit.id"] == deposit_id + assert deposit_request["deposit_collection.name"] == collection, deposit_request + if client_name != "": + # Sometimes it's missing from the commit message + assert deposit_request["auth_user.username"] == client_name + + # Date of the deposit request (either the initial request, of subsequent ones) + date = deposit_request["deposit_request.date"] + dates.add(date) + + assert deposit_request["deposit.swh_id_context"], deposit_request + external_identifiers.add(deposit_request["deposit.external_id"]) + swhids.add(deposit_request["deposit.swh_id_context"]) + + # Client of the deposit + provider_urls.add(deposit_request["deposit_client.provider_url"]) + + metadata = 
deposit_request["deposit_request.metadata"] + if metadata is not None: + json.dumps(metadata).encode() # check it's valid + if "@xmlns" in metadata: + assert metadata["@xmlns"] == ATOM_NS + assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) + format = NEW_DEPOSIT_FORMAT + else: + assert "{http://www.w3.org/2005/Atom}id" in metadata + assert ( + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata + or "{http://www.w3.org/2005/Atom}author" in metadata + ) + format = OLD_DEPOSIT_FORMAT + metadata_entries.append((date, format, metadata)) + + if discovery_date is None: + discovery_date = max(dates) + + # Sanity checks to make sure deposit requests are consistent with each other + assert len(metadata_entries) >= 1, deposit_id + assert len(provider_urls) == 1, f"expected 1 provider url, got {provider_urls}" + (provider_url,) = provider_urls + assert len(swhids) == 1 + (swhid,) = swhids + assert ( + len(external_identifiers) == 1 + ), f"expected 1 external identifier, got {external_identifiers}" + (external_identifier,) = external_identifiers + + # computed the origin from the external_identifier if we don't have one + if origin is None: + origin = f"{provider_url.strip('/')}/{external_identifier}" + + # explicit list of mistakes that happened in the past, but shouldn't + # happen again: + if origin == "https://hal.archives-ouvertes.fr/hal-01588781": + # deposit id 75 + origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781" + elif origin == "https://hal.archives-ouvertes.fr/hal-01588782": + # deposit id 76 + origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782" + elif origin == "https://hal.archives-ouvertes.fr/hal-01592430": + # deposit id 143 + origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592430" + elif origin == "https://hal.archives-ouvertes.fr/hal-01588927": + origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927" + elif origin == "https://hal.archives-ouvertes.fr/hal-01593875": + 
# deposit id 175 + origin = "https://hal-preprod.archives-ouvertes.fr/hal-01593875" + elif deposit_id == 160: + assert origin == "https://www.softwareheritage.org/je-suis-gpl", origin + origin = "https://forge.softwareheritage.org/source/jesuisgpl/" + elif origin == "https://hal.archives-ouvertes.fr/hal-01588942": + # deposit id 90 + origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942" + elif origin == "https://hal.archives-ouvertes.fr/hal-01592499": + # deposit id 162 + origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592499" + elif origin == "https://hal.archives-ouvertes.fr/hal-01588935": + # deposit id 89 + origin = "https://hal-preprod.archives-ouvertes.fr/hal-01588935" + + assert_origin_exists(storage, origin) + + # check the origin we computed matches the one in the deposit db + swhid_origin = parse_swhid(swhid).metadata["origin"] + if origin is not None: + # explicit list of mistakes that happened in the past, but shouldn't + # happen again: + exceptions = [ + ( + # deposit id 229 + "https://hal.archives-ouvertes.fr/hal-01243573", + "https://hal-test.archives-ouvertes.fr/hal-01243573", + ), + ( + # deposit id 199 + "https://hal.archives-ouvertes.fr/hal-01243065", + "https://hal-test.archives-ouvertes.fr/hal-01243065", + ), + ( + # deposit id 164 + "https://hal.archives-ouvertes.fr/hal-01593855", + "https://hal-preprod.archives-ouvertes.fr/hal-01593855", + ), + ] + if (origin, swhid_origin) not in exceptions: + assert origin == swhid_origin, ( + f"the origin we guessed from the deposit db or revision ({origin}) " + f"doesn't match the one in the deposit db's SWHID ({swhid})" + ) + + authority = MetadataAuthority( + type=MetadataAuthorityType.DEPOSIT_CLIENT, url=provider_url, metadata={}, + ) + + for (date, format, metadata) in metadata_entries: + load_metadata( + storage, + row["id"], + date, + metadata, + format, + authority=authority, + origin=origin, + dry_run=dry_run, + ) + + return (origin, discovery_date) + + +def 
handle_row(row: Dict[str, Any], storage, deposit_cur, dry_run: bool): + type_ = row["type"] + + # default date in case we can't find a better one + discovery_date = row["date"] or row["committer_date"] + + metadata = row["metadata"] + + if metadata is None: + return + + if type_ == "dsc": + origin = None # TODO: I can't find how to get it reliably + + # TODO: the debian loader writes the changelog date as the revision's + # author date and committer date. Instead, we should use the visit's date, + # but I cannot find a way to reliably get it without the origin + + if "extrinsic" in metadata: + extrinsic_files = metadata["extrinsic"]["raw"]["files"] + for artifact_entry in metadata["original_artifact"]: + extrinsic_file = extrinsic_files[artifact_entry["filename"]] + for key in ("sha256",): + assert artifact_entry["checksums"][key] == extrinsic_file[key] + artifact_entry["url"] = extrinsic_file["uri"] + del metadata["extrinsic"] + + elif type_ == "tar": + provider = metadata.get("extrinsic", {}).get("provider") + if provider is not None: + # This is the format all the package loaders currently write, and + # it is the easiest, thanks to the 'provider' and 'when' fields, + # which have all the information we need to tell them easily + # and generate accurate metadata + + discovery_date = iso8601.parse_date(metadata["extrinsic"]["when"]) + + # New versions of the loaders write the provider; use it. 
+ if provider.startswith("https://replicate.npmjs.com/"): + # npm loader format 1 + + parsed_url = urlparse(provider) + assert re.match("^/[^/]+/?$", parsed_url.path), parsed_url + package_name = unquote(parsed_url.path.strip("/")) + origin = "https://www.npmjs.com/package/" + package_name + assert_origin_exists(storage, origin) + + load_metadata( + storage, + row["id"], + discovery_date, + metadata["extrinsic"]["raw"], + NPM_FORMAT, + authority=AUTHORITIES["npmjs"], + origin=origin, + dry_run=dry_run, + ) + del metadata["extrinsic"] + + elif provider.startswith("https://pypi.org/"): + # pypi loader format 1 + + match = re.match( + "https://pypi.org/pypi/(?P.*)/json", provider + ) + assert match, f"unexpected provider URL format: {provider}" + project_name = match.group("project_name") + origin = f"https://pypi.org/project/{project_name}/" + assert_origin_exists(storage, origin) + + load_metadata( + storage, + row["id"], + discovery_date, + metadata["extrinsic"]["raw"], + PYPI_FORMAT, + authority=AUTHORITIES["pypi"], + origin=origin, + dry_run=dry_run, + ) + del metadata["extrinsic"] + + elif provider.startswith("https://cran.r-project.org/"): + # cran loader + + provider = metadata["extrinsic"]["provider"] + if provider.startswith("https://cran.r-project.org/package="): + origin = metadata["extrinsic"]["provider"] + else: + package_name = cran_package_from_url(provider) + origin = f"https://cran.r-project.org/package={package_name}" + # TODO https://forge.softwareheritage.org/T2536 + assert origin is not None + if ( + hashlib.sha1(origin.encode()).digest() not in _origins + and storage.origin_get([origin])[0] is None + ): + print("MISSING CRAN ORIGIN", hash_to_hex(row["id"]), origin) + return + + raw_extrinsic_metadata = metadata["extrinsic"]["raw"] + + # this is actually intrinsic, ignore it + del raw_extrinsic_metadata["version"] + + # Copy the URL to the original_artifacts metadata + assert len(metadata["original_artifact"]) == 1 + assert "url" not in 
metadata["original_artifact"][0] + metadata["original_artifact"][0]["url"] = raw_extrinsic_metadata["url"] + del raw_extrinsic_metadata["url"] + + assert ( + raw_extrinsic_metadata == {} + ), f"Unexpected metadata keys: {list(raw_extrinsic_metadata)}" + + del metadata["extrinsic"] + + elif provider.startswith("https://nix-community.github.io/nixpkgs-swh/"): + # nixguix loader + origin = provider + assert_origin_exists(storage, origin) + + authority = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url=provider, metadata={}, + ) + assert row["date"] is None # the nixguix loader does not write dates + + load_metadata( + storage, + row["id"], + discovery_date, + metadata["extrinsic"]["raw"], + NIXGUIX_FORMAT, + authority=authority, + origin=origin, + dry_run=dry_run, + ) + del metadata["extrinsic"] + + elif provider.startswith("https://ftp.gnu.org/"): + # archive loader format 1 + + origin = provider + assert_origin_exists(storage, origin) + + assert len(metadata["original_artifact"]) == 1 + metadata["original_artifact"][0]["url"] = metadata["extrinsic"]["raw"][ + "url" + ] + + # Remove duplicate keys of original_artifacts + for key in ("url", "time", "length", "version", "filename"): + del metadata["extrinsic"]["raw"][key] + + assert metadata["extrinsic"]["raw"] == {} + del metadata["extrinsic"] + + elif provider.startswith("https://deposit.softwareheritage.org/"): + origin = metadata["extrinsic"]["raw"]["origin"]["url"] + assert_origin_exists(storage, origin) + + if "@xmlns" in metadata: + assert metadata["@xmlns"] == ATOM_NS + assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) + assert "intrinsic" not in metadata + assert "extra_headers" not in metadata + + # deposit loader format 1 + # in this case, the metadata seems to be both directly in metadata + # and in metadata["extrinsic"]["raw"]["metadata"] + + (origin, discovery_date) = handle_deposit_row( + row, discovery_date, origin, storage, deposit_cur, dry_run + ) + + 
remove_atom_codemeta_metadata_with_xmlns(metadata) + if "client" in metadata: + del metadata["client"] + del metadata["extrinsic"] + else: + # deposit loader format 2 + actual_metadata = metadata["extrinsic"]["raw"]["origin_metadata"][ + "metadata" + ] + if "@xmlns" in actual_metadata: + assert actual_metadata["@xmlns"] == ATOM_NS + assert actual_metadata["@xmlns:codemeta"] in ( + CODEMETA_NS, + [CODEMETA_NS], + ) + else: + assert "{http://www.w3.org/2005/Atom}id" in actual_metadata + assert ( + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" + in actual_metadata + ) + + (origin, discovery_date) = handle_deposit_row( + row, discovery_date, origin, storage, deposit_cur, dry_run + ) + + del metadata["extrinsic"] + else: + assert False, f"unknown provider {provider}" + + # Older versions don't write the provider; use heuristics instead. + elif ( + metadata.get("package_source", {}) + .get("url", "") + .startswith("https://registry.npmjs.org/") + ): + # npm loader format 2 + + package_source_url = metadata["package_source"]["url"] + package_name = npm_package_from_source_url(package_source_url) + origin = "https://www.npmjs.com/package/" + package_name + assert_origin_exists(storage, origin) + + load_metadata( + storage, + row["id"], + discovery_date, + metadata["package"], + NPM_FORMAT, + authority=AUTHORITIES["npmjs"], + origin=origin, + dry_run=dry_run, + ) + del metadata["package"] + + assert "original_artifact" not in metadata + + # rebuild an "original_artifact"-like metadata dict from what we + # can salvage of "package_source" + package_source_metadata = metadata["package_source"] + keep_keys = {"blake2s256", "filename", "sha1", "sha256", "url"} + discard_keys = { + "date", # is equal to the revision date + "name", # was loaded above + "version", # same + } + assert ( + set(package_source_metadata) == keep_keys | discard_keys + ), package_source_metadata + + # will be loaded below + metadata["original_artifact"] = [ + { + "filename": 
package_source_metadata["filename"], + "checksums": { + "sha1": package_source_metadata["sha1"], + "sha256": package_source_metadata["sha256"], + "blake2s256": package_source_metadata["blake2s256"], + }, + "url": package_source_metadata["url"], + } + ] + del metadata["package_source"] + + elif "@xmlns" in metadata: + assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) + assert "intrinsic" not in metadata + assert "extra_headers" not in metadata + + # deposit loader format 3 + + if row["message"] == b"swh: Deposit 159 in collection swh": + # There is no deposit 159 in the deposit DB, for some reason + assert ( + hash_to_hex(row["id"]) == "8e9cee14a6ad39bca4347077b87fb5bbd8953bb1" + ) + return + elif row["message"] == b"hal: Deposit 342 in collection hal": + # They have status 'failed' and no swhid + return + + origin = None # TODO + discovery_date = None # TODO + + (origin, discovery_date) = handle_deposit_row( + row, discovery_date, origin, storage, deposit_cur, dry_run + ) + remove_atom_codemeta_metadata_with_xmlns(metadata) + if "client" in metadata: + del metadata["client"] # found in the deposit db + if "committer" in metadata: + del metadata["committer"] # found on the revision object + + elif "{http://www.w3.org/2005/Atom}id" in metadata: + assert ( + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata + or "{http://www.w3.org/2005/Atom}author" in metadata + ) + assert "intrinsic" not in metadata + assert "extra_headers" not in metadata + + # deposit loader format 4 + + origin = None + discovery_date = None # TODO + + (origin, discovery_date) = handle_deposit_row( + row, discovery_date, origin, storage, deposit_cur, dry_run + ) + remove_atom_codemeta_metadata_without_xmlns(metadata) + + elif hash_to_hex(row["id"]) == "a86747d201ab8f8657d145df4376676d5e47cf9f": + # deposit 91, is missing "{http://www.w3.org/2005/Atom}id" for some + # reason, and has an invalid oririn + return + + elif ( + 
isinstance(metadata.get("original_artifact"), dict) + and metadata["original_artifact"]["url"].startswith( + "https://files.pythonhosted.org/" + ) + ) or ( + isinstance(metadata.get("original_artifact"), list) + and len(metadata.get("original_artifact")) == 1 + and metadata["original_artifact"][0] + .get("url", "") + .startswith("https://files.pythonhosted.org/") + ): + if isinstance(metadata.get("original_artifact"), dict): + metadata["original_artifact"] = [metadata["original_artifact"]] + + assert len(metadata["original_artifact"]) == 1 + + # it's tempting here to do this: + # + # project_name = pypi_project_from_filename( + # metadata["original_artifact"][0]["filename"] + # ) + # origin = f"https://pypi.org/project/{project_name}/" + # assert_origin_exists(storage, origin) + # + # but unfortunately, the filename is user-provided, and doesn't + # necessarily match the package name on pypi. + + # TODO: on second thoughts, I think we can use this as a heuristic, + # then double-check by listing visits and snapshots from the origin; + # it should work for most packages. 
+ + origin = None + + if "project" in metadata: + # pypi loader format 2 + + # same reason as above, we can't do this: + # if metadata["project"]: + # assert metadata["project"]["name"] == project_name + + load_metadata( + storage, + row["id"], + discovery_date, + metadata["project"], + PYPI_FORMAT, + authority=AUTHORITIES["pypi"], + origin=origin, + dry_run=dry_run, + ) + del metadata["project"] + else: + assert set(metadata) == {"original_artifact"}, set(metadata) + # pypi loader format 3 + pass # nothing to do, there's no metadata + + elif row["message"] == b"synthetic revision message": + assert isinstance(metadata["original_artifact"], list), metadata + assert not any("url" in d for d in metadata["original_artifact"]) + + # archive loader format 2 + + origin = None + + elif deposit_revision_message_re.match(row["message"]): + # deposit without metadata in the revision + + assert set(metadata) == {"original_artifact"}, metadata + + origin = None # TODO + discovery_date = None + + (origin, discovery_date) = handle_deposit_row( + row, discovery_date, origin, storage, deposit_cur, dry_run + ) + else: + assert False, f"Unable to detect type of metadata for row: {row}" + + # Ignore common intrinsic metadata keys + for key in ("intrinsic", "extra_headers"): + if key in metadata: + del metadata[key] + + # Ignore loader-specific intrinsic metadata keys + if type_ == "hg": + del metadata["node"] + elif type_ == "dsc": + if "package_info" in metadata: + del metadata["package_info"] + + if "original_artifact" in metadata: + for original_artifact in metadata["original_artifact"]: + # Rename keys to the expected format of original-artifacts-json. + rename_keys = [ + ("name", "filename"), # eg. from old Debian loader + ("size", "length"), # eg. 
def create_fetchers(db):
    """Insert this script's fetcher into the ``metadata_fetcher`` table.

    Args:
        db: database handle whose ``cursor()`` returns a context-managed
            cursor.

    The statement uses ``ON CONFLICT DO NOTHING``, so running it again when a
    conflicting fetcher row already exists changes nothing.
    """
    # NOTE(review): no commit is issued here; presumably the connection
    # commits elsewhere or runs in autocommit mode -- confirm.
    with db.cursor() as cur:
        cur.execute(
            """
            INSERT INTO metadata_fetcher (name, version, metadata)
            VALUES (%s, %s, %s)
            ON CONFLICT DO NOTHING
            """,
            (FETCHER.name, FETCHER.version, FETCHER.metadata),
        )


def main(storage_dbconn, storage_url, deposit_dbconn, first_id, dry_run):
    """Walk the ``revision`` table in id order and migrate each row's metadata.

    Rows are fetched in pages of 1000, restricted to revisions whose
    ``metadata`` column is not NULL and whose id is strictly greater than the
    last id handled. Each row is passed to ``handle_row``; a progress line is
    printed after every page.

    Args:
        storage_dbconn: connection string for the storage database.
        storage_url: URL of the remote storage API.
        deposit_dbconn: connection string for the deposit database.
        first_id: revision id (bytes) to resume after; rows with this exact id
            are not processed because the query uses ``id > first_id``.
        dry_run: forwarded to ``handle_row``; when true the fetcher row is not
            created either.
    """
    storage_db = BaseDb.connect(storage_dbconn)
    deposit_db = BaseDb.connect(deposit_dbconn)
    storage = get_storage("remote", url=storage_url)

    if not dry_run:
        create_fetchers(storage_db)
        # Not creating authorities, as the loaders are presumably already running
        # and created them already.
        # This also helps make sure this script doesn't accidentally create
        # authorities that differ from what the loaders use.

    # The query text never changes between pages, so build it once.
    page_query = (
        f"SELECT {', '.join(REVISION_COLS)} FROM revision "
        f"WHERE id > %s AND metadata IS NOT NULL ORDER BY id LIMIT 1000"
    )

    total_rows = 0
    after_id = first_id
    with storage_db.cursor() as read_cur, deposit_db.cursor() as deposit_cur:
        while True:
            read_cur.execute(page_query, (after_id,))

            new_rows = 0
            for row in read_cur:
                row_d = dict(zip(REVISION_COLS, row))
                handle_row(row_d, storage, deposit_cur, dry_run)
                # Track the pagination key explicitly rather than relying on
                # the loop variable still being bound after the loop.
                after_id = row_d["id"]
                new_rows += 1

            if new_rows == 0:
                # An empty page means every matching revision has been seen.
                break

            total_rows += new_rows
            # Ids are walked in increasing order, so the position of the first
            # four bytes within the 32-bit range serves as a rough progress
            # estimate.
            percents = int.from_bytes(after_id[0:4], byteorder="big") * 100 / (1 << 32)
            print(
                f"Migrated {total_rows/1000000.:.2f}M rows "
                f"(~{percents:.1f}%, last revision: {after_id.hex()})"
            )


if __name__ == "__main__":
    if len(sys.argv) == 4:
        (_, storage_dbconn, storage_url, deposit_dbconn) = sys.argv
        # Start before every possible 20-byte revision id.
        first_id = "00" * 20
    elif len(sys.argv) == 5:
        (_, storage_dbconn, storage_url, deposit_dbconn, first_id) = sys.argv
    else:
        print(
            f"Syntax: {sys.argv[0]} "
            f"<storage_dbconn> <storage_url> <deposit_dbconn> [<first_id>]",
            file=sys.stderr,
        )
        # sys.exit() instead of the interactive-only ``exit`` helper, which the
        # site module does not always install.
        sys.exit(1)

    if os.path.isfile("./origins.txt"):
        # You can generate this file with:
        # psql service=swh-replica \
        #   -c "\copy (select digest(url, 'sha1') from origin) to stdout" \
        #   | pv -l > origins.txt
        print("Loading origins...")
        with open("./origins.txt") as fd:
            for line in fd:
                # The first three characters are dropped before hex-decoding;
                # presumably they are the ``\\x`` prefix psql writes for bytea
                # values -- confirm against a real dump.
                digest = line.strip()[3:]
                if not digest:
                    # Blank (or prefix-only) line: nothing to record.
                    continue
                _origins.add(bytes.fromhex(digest))
        print("Done loading origins.")

    # NOTE(review): dry_run is hard-coded to True here, so this entry point
    # never writes anything -- confirm this is still intended.
    main(storage_dbconn, storage_url, deposit_dbconn, bytes.fromhex(first_id), True)
def test_cran_package_from_url():
    """Check that the CRAN package name is extracted from tarball URLs.

    Covers a name followed by a dashed version and a name that itself
    contains a dot.
    """
    expected_package_by_url = {
        "https://cran.r-project.org/src/contrib/shapeR_0.1-5.tar.gz": "shapeR",
        "https://cran.r-project.org/src/contrib/hot.deck_1.1.tar.gz": "hot.deck",
    }

    for url in expected_package_by_url:
        package = expected_package_by_url[url]
        assert cran_package_from_url(url) == package
def test_cran_without_revision_date():
    """Migrate a CRAN revision whose ``date`` and ``committer_date`` are None.

    The only timestamp available is ``metadata.extrinsic.when``; the test
    expects it to be used as the discovery date of the emitted
    original-artifacts metadata, and the tarball URL to be added to the
    artifact description.
    """
    tarball_url = "https://cran.r-project.org/src/contrib/gofgamma_1.0.tar.gz"
    origin_url = "https://cran.r-project.org/package=gofgamma"
    revision_hex = "0000d4ef5e166122aee6862ad38a18ce5386cc3e"

    source_original_artifacts = [
        {
            "length": 8018,
            "filename": "gofgamma_1.0.tar.gz",
            "checksums": {
                "sha1": "58f2993140f9e9e1a136554f0af0174a252f2c7b",
                "sha256": "55408f004642b5043bb01de831a7e7a0b9f24a30cb0151e70c2d37abdc508d03",
            },
        }
    ]
    # Same artifacts, with the download URL appended as the last key.
    dest_original_artifacts = [
        {**artifact, "url": tarball_url} for artifact in source_original_artifacts
    ]

    extrinsic = {
        "raw": {"url": tarball_url, "version": "1.0"},
        "when": "2020-04-30T11:01:57.832481+00:00",
        "provider": origin_url,
    }
    intrinsic = {
        "raw": {
            "Type": "Package",
            "Title": "Goodness-of-Fit Tests for the Gamma Distribution",
            "Author": "Lucas Butsch [aut],\n Bruno Ebner [aut, cre],\n Steffen Betsch [aut]",
            # ...
        },
        "tool": "DESCRIPTION",
    }
    row = {
        "id": bytes.fromhex(revision_hex),
        "date": None,
        "committer_date": None,
        "type": "tar",
        "message": b"1.0",
        "metadata": {
            "extrinsic": extrinsic,
            "intrinsic": intrinsic,
            "original_artifact": source_original_artifacts,
        },
    }

    def fake_origin_get(urls):
        # Only the package origin may be looked up.
        assert urls == [origin_url]
        return [Origin(url=origin_url)]

    storage = Mock()
    storage.origin_get.side_effect = fake_origin_get

    # Hand over a copy so the fixtures above stay untouched.
    handle_row(copy.deepcopy(row), storage, None, dry_run=False)

    expected_metadata = RawExtrinsicMetadata(
        type=MetadataTargetType.REVISION,
        id=parse_swhid(f"swh:1:rev:{revision_hex}"),
        discovery_date=datetime.datetime(
            2020, 4, 30, 11, 1, 57, 832481, tzinfo=datetime.timezone.utc,
        ),
        authority=SWH_AUTHORITY,
        fetcher=FETCHER,
        format="original-artifacts-json",
        metadata=json.dumps(dest_original_artifacts).encode(),
        origin=origin_url,
    )
    expected_calls = [
        call.origin_get([origin_url]),
        call.raw_extrinsic_metadata_add([expected_metadata]),
    ]
    assert storage.method_calls == expected_calls
def test_debian_with_extrinsic():
    """Migrate a Debian (``dsc``) revision that carries an ``extrinsic`` section.

    The stored original artifacts have no URL. The test expects one
    ``raw_extrinsic_metadata_add`` call in the original-artifacts format, with
    each artifact completed by the URL listed for it under
    ``extrinsic.raw.files``, and with the revision date as discovery date.
    """
    pool_url = "http://deb.debian.org/debian//pool/main/k/kalgebra/"
    revision_hex = "0000036c311ef33a281b05688f6eadcfc0943aee"
    revision_date = datetime.datetime(
        2020, 1, 26, 22, 3, 24, tzinfo=datetime.timezone.utc,
    )

    # One entry per package file: (filename, size, md5sum, sha1, sha256).
    package_files = [
        (
            "kalgebra_19.12.1-1.dsc",
            2936,
            "fd28f604d4cc31a0a305543230f1622a",
            "f869e9f1155b1ee6d28ae3b40060570152a358cd",
            "75f77150aefdaa4bcf8bc5b1e9b8b90b5cb1651b76a068c5e58e5b83658d5d11",
        ),
        (
            "kalgebra_19.12.1.orig.tar.xz",
            1156408,
            "34e09ed152da762d53101ea33634712b",
            "e496032962212983a5359aebadfe13c4026fd45c",
            "49d623186800eb8f6fbb91eb43fb14dff78e112624c9cda6b331d494d610b16a",
        ),
        (
            "kalgebra_19.12.1-1.debian.tar.xz",
            10044,
            "4f639f36143898d97d044f273f038e58",
            "b518bfc2ac708b40577c595bd539faa8b84572db",
            "1a30acd2699c3769da302f7a0c63a7d7b060f80925b38c8c43ce3bec92744d67",
        ),
        (
            "kalgebra_19.12.1.orig.tar.xz.asc",
            488,
            "3c29291e4e6f0c294de80feb8e9fce4c",
            "ff53a5c21c1aef2b9caa38a02fa3488f43df4c20",
            "a37e0b95bb1f16b19b0587bc5d3b99ba63a195d7f6335c4a359122ad96d682dd",
        ),
    ]

    # Expected output: key order matters because the result is compared as
    # serialized JSON.
    dest_original_artifacts = [
        {
            "length": size,
            "filename": filename,
            "checksums": {"sha1": sha1, "sha256": sha256},
            "url": pool_url + filename,
        }
        for (filename, size, _md5sum, sha1, sha256) in package_files
    ]

    # Input: the same artifacts as stored in the revision, i.e. without URL.
    source_original_artifacts = []
    for artifact in dest_original_artifacts:
        without_url = dict(artifact)
        del without_url["url"]
        source_original_artifacts.append(without_url)

    extrinsic_files = {
        filename: {
            "uri": pool_url + filename,
            "name": filename,
            "size": size,
            "md5sum": md5sum,
            "sha256": sha256,
        }
        for (filename, size, md5sum, _sha1, sha256) in package_files
    }

    row = {
        "id": bytes.fromhex(revision_hex),
        "date": revision_date,
        "date_offset": 60,
        "type": "dsc",
        "message": b"Synthetic revision for Debian source package kalgebra version 4:19.12.1-1",
        "metadata": {
            "extrinsic": {
                "raw": {
                    "id": 2718802,
                    "name": "kalgebra",
                    "files": extrinsic_files,
                    "version": "4:19.12.1-1",
                    "revision_id": None,
                },
                "when": "2020-01-27T19:32:03.925498+00:00",
                "provider": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc",
            },
            "intrinsic": {
                "raw": {
                    "name": "kalgebra",
                    "version": "4:19.12.1-1",
                    # ...
                },
                "tool": "dsc",
            },
            "original_artifact": source_original_artifacts,
        },
    }

    storage = Mock()
    handle_row(copy.deepcopy(row), storage, None, dry_run=False)

    expected_metadata = RawExtrinsicMetadata(
        type=MetadataTargetType.REVISION,
        id=parse_swhid(f"swh:1:rev:{revision_hex}"),
        discovery_date=revision_date,
        authority=SWH_AUTHORITY,
        fetcher=FETCHER,
        format="original-artifacts-json",
        metadata=json.dumps(dest_original_artifacts).encode(),
    )
    assert storage.method_calls == [
        call.raw_extrinsic_metadata_add([expected_metadata])
    ]
"pymongo_1.10-1.dsc", + "checksums": { + "sha1": "81877c1ae4406c2519b9cc9c4557cf6b0775a241", + "sha256": "40269a73f38ee4c2f9cc021f1d5d091cc59ca6e778c339684b7be030e29e282f", + "sha1_git": "0ac7bdb8e4d10926c5d3e51baa2be7bb29a3966b", + }, + }, + { + "length": 99, + "filename": "pymongo_1.10.orig.tar.gz", + "checksums": { + "sha1": "4f4c97641b86ac8f21396281bd1a7369236693c3", + "sha256": "0b6bffb310782ffaeb3916c75790742ec5830c63a758fc711cd1f557eb5a4b5f", + "sha1_git": "19ef0adda8868520d1ef9d4164b3ace4df1d62ad", + }, + }, + { + "length": 99, + "filename": "pymongo_1.10-1.debian.tar.gz", + "checksums": { + "sha1": "fbf378296613c8d55e043aec98896b3e50a94971", + "sha256": "3970cc70fe3ba6499a9c56ba4b4c6c3782f56433d0d17d72b7a0e2ceae31b513", + "sha1_git": "2eea9904806050a8fda95edd5d4fa60d29c1fdec", + }, + }, + ] + + row = { + "id": b"\x00\x00\x01\xc2\x8c\x8f\xca\x01\xb9\x04\xde\x92\xa2d\n\x86l\xe0<\xb7", + "date": datetime.datetime( + 2011, 3, 31, 20, 17, 41, tzinfo=datetime.timezone.utc + ), + "date_offset": 0, + "type": "dsc", + "message": b"Synthetic revision for Debian source package pymongo version 1.10-1", + "metadata": { + "package_info": { + "name": "pymongo", + "version": "1.10-1", + "changelog": { + # ... 
+ }, + "maintainers": [ + {"name": "Federico Ceratto", "email": "federico.ceratto@gmail.com"}, + {"name": "Janos Guljas", "email": "janos@resenje.org"}, + ], + "pgp_signature": { + "date": "2011-03-31T21:02:44+00:00", + "keyid": "2BABC6254E66E7B8450AC3E1E6AA90171392B174", + "person": {"name": "David Paleino", "email": "d.paleino@gmail.com"}, + }, + "lister_metadata": {"id": 244296, "lister": "snapshot.debian.org"}, + }, + "original_artifact": source_original_artifacts, + }, + } + + storage = Mock() + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000001c28c8fca01b904de92a2640a866ce03cb7" + ), + discovery_date=datetime.datetime( + 2011, 3, 31, 20, 17, 41, tzinfo=datetime.timezone.utc + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(dest_original_artifacts).encode(), + ), + ] + ) + ] diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py new file mode 100644 index 00000000..1c018ac8 --- /dev/null +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py @@ -0,0 +1,1167 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa +# because of long lines + +import copy +import datetime +import json +from unittest.mock import call, Mock, MagicMock + +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + +from swh.storage.migrate_extrinsic_metadata import ( + 
def get_mock_deposit_cur(row_dicts):
    """Build a mock deposit cursor that yields ``row_dicts`` as tuples.

    Each dict is flattened into a tuple following the column order of
    ``DEPOSIT_COLS``. The returned ``MagicMock`` can be iterated exactly once:
    its ``__iter__`` side effect holds a single iterator.
    """
    result_tuples = []
    for row_dict in row_dicts:
        result_tuples.append(tuple(row_dict[column] for column in DEPOSIT_COLS))

    cursor = MagicMock()
    cursor.__iter__.side_effect = [iter(result_tuples)]
    return cursor
+ } + original_artifacts = [ + { + "length": 80880, + "filename": "archive.zip", + "checksums": { + "sha1": "bad32a47a359e0e16ebdca2ad2dc6a771dac8f71", + "sha256": "182b7ee3b7b5b550e83d3bcfed029bb2f625ee760ebfe9557d5fd072bd4e22e4", + }, + } + ] + + row = { + "id": b"\x02#\x10\xdf\x16\xfd\x9eMO\x81\xfe6\xa1B\xe8-\xb9w\xc0\x1d", + "date": datetime.datetime(2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc), + "committer_date": datetime.datetime( + 2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"swh: Deposit 467 in collection swh", + "metadata": { + "client": "swh", + "extrinsic": { + "raw": { + "origin": { + "url": "https://www.softwareheritage.org/check-deposit-2020-03-11T11:07:18.424476", + "type": "deposit", + }, + "branch_name": "master", + "origin_metadata": { + "tool": { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": {"sword_version": 2}, + }, + "metadata": extrinsic_metadata, + }, + }, + "when": "2020-03-11T11:11:36.336283+00:00", + "provider": "https://deposit.softwareheritage.org/1/private/467/meta/", + }, + "original_artifact": original_artifacts, + **extrinsic_metadata, + }, + } + + origin_url = ( + "https://www.softwareheritage.org/check-deposit-2020-03-11T11:07:18.424476" + ) + + swhid = ( + f"swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea" + f";origin={origin_url}" + f";visit=swh:1:snp:14433c19dbb03ad57c86b58b53a800d6a0e32dd3" + f";anchor=swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d" + f";path=/" + ) + + deposit_rows = [ + { + "deposit.id": 467, + "deposit.external_id": "check-deposit-2020-03-11T11:07:18.424476", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": extrinsic_metadata, + "deposit_request.date": datetime.datetime( + 2020, 3, 11, 11, 7, 18, 688410, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://www.softwareheritage.org", + "deposit_collection.name": "swh", + "auth_user.username": "swh", + }, + { + 
"deposit.id": 467, + "deposit.external_id": "check-deposit-2020-03-11T11:07:18.424476", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": None, + "deposit_request.date": datetime.datetime( + 2020, 3, 11, 11, 7, 18, 669428, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://www.softwareheritage.org", + "deposit_collection.name": "swh", + "auth_user.username": "swh", + }, + ] + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = get_mock_deposit_cur(deposit_rows) + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + deposit_cur.execute.assert_called_once() + deposit_cur.__iter__.assert_called_once() + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d" + ), + discovery_date=datetime.datetime( + 2020, 3, 11, 11, 7, 18, 688410, tzinfo=datetime.timezone.utc + ), + authority=SWH_DEPOSIT_AUTHORITY, + fetcher=FETCHER, + format="sword-v2-atom-codemeta-v2-in-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d" + ), + discovery_date=datetime.datetime( + 2020, 3, 11, 11, 11, 36, 336283, tzinfo=datetime.timezone.utc + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(original_artifacts).encode(), + origin=origin_url, + ), + ] + ), + ] + + +def test_deposit_2_without_xmlns(): + """Has a provider, no xmlns, and the metadata is only in + metadata->extrinsic->raw->origin_metadata)""" + extrinsic_metadata = { + 
"{http://www.w3.org/2005/Atom}id": "hal-01243573", + "{http://www.w3.org/2005/Atom}author": { + "{http://www.w3.org/2005/Atom}name": "HAL", + "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr", + }, + "{http://www.w3.org/2005/Atom}client": "hal", + "{http://www.w3.org/2005/Atom}external_identifier": "hal-01243573", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://hal-test.archives-ouvertes.fr/hal-01243573", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": { + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter" + }, + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}version": 1, + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifier": "10.5281/zenodo.438684", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}dateCreated": "2017-11-16T14:54:23+01:00", + } + original_artifacts = [ + { + "length": 208357, + "filename": "archive.zip", + "checksums": { + "sha1": "fa0aec08e8a44ea144dba7ce366c8b5d66c14453", + "sha256": "f53c05fe947e88ce83751a93bd522b1f88478ea2e7b984c07fc7a7c68128bf87", + }, + } + ] + + row = { + "id": b"\x01\x16\xca\xb7\x19d\xd5\x9c\x85p\xb4\xc5r\x9b(\xbd\xd6<\x9bF", + "date": datetime.datetime( + 2018, 1, 17, 12, 54, 0, 723882, tzinfo=datetime.timezone.utc + ), + "committer_date": datetime.datetime( + 2018, 1, 17, 12, 54, 0, 723882, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"hal: Deposit 82 in collection hal", + "metadata": { + "extrinsic": { + "raw": { + "origin": { + "url": "https://hal.archives-ouvertes.fr/hal-01243573", + "type": "deposit", + }, + "origin_metadata": { + "tool": { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": {"sword_version": 2}, + }, + "metadata": extrinsic_metadata, + "provider": { + "metadata": {}, + "provider_url": "https://hal.archives-ouvertes.fr/", + "provider_name": "hal", + "provider_type": "deposit_client", + }, + }, + }, + "when": 
"2020-05-15T14:27:21.462270+00:00", + "provider": "https://deposit.softwareheritage.org/1/private/82/meta/", + }, + "original_artifact": original_artifacts, + }, + } + + swhid = ( + "swh:1:dir:e04b2a7b8a8838da0693e9fd992a10d6fd211b50" + ";origin=https://hal.archives-ouvertes.fr/hal-01243573" + ";visit=swh:1:snp:abc9ae594245a740235b6c039f044352a5f723ec" + ";anchor=swh:1:rev:0116cab71964d59c8570b4c5729b28bdd63c9b46" + ";path=/" + ) + + deposit_rows = [ + { + "deposit.id": 82, + "deposit.external_id": "hal-01243573", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": None, + "deposit_request.date": datetime.datetime( + 2018, 1, 17, 12, 54, 1, 533972, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/", + "deposit_collection.name": "hal", + "auth_user.username": "hal", + }, + { + "deposit.id": 82, + "deposit.external_id": "hal-01243573", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": extrinsic_metadata, + "deposit_request.date": datetime.datetime( + 2018, 1, 17, 12, 54, 0, 413748, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/", + "deposit_collection.name": "hal", + "auth_user.username": "hal", + }, + ] + + origin_url = "https://hal.archives-ouvertes.fr/hal-01243573" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = get_mock_deposit_cur(deposit_rows) + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + deposit_cur.execute.assert_called_once() + deposit_cur.__iter__.assert_called_once() + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:0116cab71964d59c8570b4c5729b28bdd63c9b46" + ), + 
def test_deposit_2_with_xmlns():
    """Migrate a deposit revision whose metadata uses ``@xmlns`` prefixes.

    The revision has a provider, and the deposited metadata appears only under
    metadata->extrinsic->raw->origin_metadata. Two metadata objects are
    expected: the deposited document (dated by the deposit request that
    carried it) and the original artifacts (dated by ``extrinsic.when``).
    """
    utc = datetime.timezone.utc
    origin_url = (
        "https://www.softwareheritage.org/check-deposit-2020-06-26T13:50:07.564420"
    )
    revision_swhid = "swh:1:rev:0122966e509317aece6a41d0f088da733cc09d0f"
    authoring_date = datetime.datetime(2018, 1, 5, 0, 0, tzinfo=utc)

    extrinsic_metadata = {
        "title": "Je suis GPL",
        "@xmlns": "http://www.w3.org/2005/Atom",
        "client": "swh",
        "codemeta:url": "https://forge.softwareheritage.org/source/jesuisgpl/",
        "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
        "codemeta:author": {
            "codemeta:name": "Stefano Zacchiroli",
            "codemeta:jobTitle": "Maintainer",
        },
        "codemeta:license": {
            "codemeta:url": "https://spdx.org/licenses/GPL-3.0-or-later.html",
            "codemeta:name": "GNU General Public License v3.0 or later",
        },
        "external_identifier": "je-suis-gpl",
        "codemeta:dateCreated": "2018-01-05",
    }
    original_artifacts = [
        {
            "length": 80880,
            "filename": "archive.zip",
            "checksums": {
                "sha1": "bad32a47a359e0e16ebdca2ad2dc6a771dac8f71",
                "sha256": "182b7ee3b7b5b550e83d3bcfed029bb2f625ee760ebfe9557d5fd072bd4e22e4",
            },
        }
    ]

    origin_metadata = {
        "tool": {
            "name": "swh-deposit",
            "version": "0.0.1",
            "configuration": {"sword_version": 2},
        },
        "metadata": extrinsic_metadata,
        "provider": {
            "metadata": {},
            "provider_url": "https://www.softwareheritage.org",
            "provider_name": "swh",
            "provider_type": "deposit_client",
        },
    }
    row = {
        "id": b'\x01"\x96nP\x93\x17\xae\xcejA\xd0\xf0\x88\xdas<\xc0\x9d\x0f',
        "date": authoring_date,
        "committer_date": authoring_date,
        "type": "tar",
        "message": b"swh: Deposit 687 in collection swh",
        "metadata": {
            "extrinsic": {
                "raw": {
                    "origin": {"url": origin_url, "type": "deposit"},
                    "origin_metadata": origin_metadata,
                },
                "when": "2020-06-26T13:50:22.640625+00:00",
                "provider": "https://deposit.softwareheritage.org/1/private/687/meta/",
            },
            "original_artifact": original_artifacts,
        },
    }

    swhid = (
        "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea"
        f";origin={origin_url}"
        ";visit=swh:1:snp:8fd469e280fb0724175c64906627f619143d5bdb"
        f";anchor={revision_swhid}"
        ";path=/"
    )

    # Both deposit requests belong to the same deposit; only the metadata
    # payload and the request date differ.
    shared_columns = {
        "deposit.id": 687,
        "deposit.external_id": "check-deposit-2020-06-26T13:50:07.564420",
        "deposit.swh_id_context": swhid,
        "deposit.status": "success",
        "deposit_client.provider_url": "https://www.softwareheritage.org",
        "deposit_collection.name": "swh",
        "auth_user.username": "swh",
    }
    metadata_request_date = datetime.datetime(
        2020, 6, 26, 13, 50, 8, 216113, tzinfo=utc
    )
    deposit_rows = [
        {
            **shared_columns,
            "deposit_request.metadata": extrinsic_metadata,
            "deposit_request.date": metadata_request_date,
        },
        {
            **shared_columns,
            "deposit_request.metadata": None,
            "deposit_request.date": datetime.datetime(
                2020, 6, 26, 13, 50, 8, 150498, tzinfo=utc
            ),
        },
    ]

    def fake_origin_get(urls):
        # Only the deposit's origin may be looked up.
        assert urls == [origin_url]
        return [Origin(url=origin_url)]

    storage = Mock()
    storage.origin_get.side_effect = fake_origin_get
    deposit_cur = get_mock_deposit_cur(deposit_rows)

    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    # The deposit database must be queried and read exactly once.
    deposit_cur.execute.assert_called_once()
    deposit_cur.__iter__.assert_called_once()

    deposited_metadata = RawExtrinsicMetadata(
        type=MetadataTargetType.REVISION,
        id=parse_swhid(revision_swhid),
        discovery_date=metadata_request_date,
        authority=SWH_DEPOSIT_AUTHORITY,
        fetcher=FETCHER,
        format="sword-v2-atom-codemeta-v2-in-json",
        metadata=json.dumps(extrinsic_metadata).encode(),
        origin=origin_url,
    )
    artifacts_metadata = RawExtrinsicMetadata(
        type=MetadataTargetType.REVISION,
        id=parse_swhid(revision_swhid),
        discovery_date=datetime.datetime(
            2020, 6, 26, 13, 50, 22, 640625, tzinfo=utc
        ),
        authority=SWH_AUTHORITY,
        fetcher=FETCHER,
        format="original-artifacts-json",
        metadata=json.dumps(original_artifacts).encode(),
        origin=origin_url,
    )
    assert storage.method_calls == [
        call.origin_get([origin_url]),
        call.raw_extrinsic_metadata_add([deposited_metadata]),
        call.raw_extrinsic_metadata_add([artifacts_metadata]),
    ]
Perf tool which is used by Intel VTune Amplifier", + } + source_original_artifacts = [ + { + "name": "archive.zip", + "sha1": "07251dbb1d904d143fd7da9935701f17670d4d9b", + "length": 4350528, + "sha256": "1f7d111ac79e468002f3edf4b7b2487538d41f6bea362d49b2eb08a537efafb6", + "sha1_git": "e2d894efcaad4ff36f09eda3b3c0096416b03429", + "blake2s256": "e2c08b82efbc361fbb2d28aa8352668cd71217f165f63de16b61ed61ace7509d", + "archive_type": "zip", + } + ] + dest_original_artifacts = [ + { + "length": 4350528, + "archive_type": "zip", + "filename": "archive.zip", + "checksums": { + "sha1": "07251dbb1d904d143fd7da9935701f17670d4d9b", + "sha256": "1f7d111ac79e468002f3edf4b7b2487538d41f6bea362d49b2eb08a537efafb6", + "sha1_git": "e2d894efcaad4ff36f09eda3b3c0096416b03429", + "blake2s256": "e2c08b82efbc361fbb2d28aa8352668cd71217f165f63de16b61ed61ace7509d", + }, + } + ] + + row = { + "id": b"\t5`S\xc4\x9a\xd0\xf9\xe6.Q\xc2\x9d>a|y\x11@\xdf", + "date": datetime.datetime(2019, 5, 14, 0, 0, tzinfo=datetime.timezone.utc), + "committer_date": datetime.datetime( + 2019, 5, 14, 0, 0, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"intel: Deposit 268 in collection intel", + "metadata": { + **extrinsic_metadata, + "original_artifact": source_original_artifacts, + }, + } + + swhid = ( + "swh:1:dir:527c8e4a67d391f2bf1bbc86dd94af5d5cfc8ef7" + ";origin=https://software.intel.com/f80482de-90a8-4c32-bce4-6f6918d492ff" + ";visit=swh:1:snp:49d60943d9c061da1aba6266a811412f9db8de2e" + ";anchor=swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df" + ";path=/" + ) + deposit_rows = [ + { + "deposit.id": 268, + "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": extrinsic_metadata, + "deposit_request.date": datetime.datetime( + 2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://software.intel.com", + "deposit_collection.name": 
"intel", + "auth_user.username": "intel", + }, + { + "deposit.id": 268, + "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": None, + "deposit_request.date": datetime.datetime( + 2019, 5, 14, 7, 49, 36, 477061, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://software.intel.com", + "deposit_collection.name": "intel", + "auth_user.username": "intel", + }, + { + "deposit.id": 268, + "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": extrinsic_metadata, + "deposit_request.date": datetime.datetime( + 2019, 5, 14, 7, 28, 33, 210100, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://software.intel.com", + "deposit_collection.name": "intel", + "auth_user.username": "intel", + }, + { + "deposit.id": 268, + "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": None, + "deposit_request.date": datetime.datetime( + 2019, 5, 14, 7, 28, 33, 41454, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://software.intel.com", + "deposit_collection.name": "intel", + "auth_user.username": "intel", + }, + ] + + origin_url = "https://software.intel.com/f80482de-90a8-4c32-bce4-6f6918d492ff" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = get_mock_deposit_cur(deposit_rows) + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + deposit_cur.execute.assert_called_once() + deposit_cur.__iter__.assert_called_once() + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + 
type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df" + ), + discovery_date=datetime.datetime( + 2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc + ), + authority=INTEL_AUTHORITY, + fetcher=FETCHER, + format="sword-v2-atom-codemeta-v2-in-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df" + ), + discovery_date=datetime.datetime( + 2019, 5, 14, 7, 28, 33, 210100, tzinfo=datetime.timezone.utc + ), + authority=INTEL_AUTHORITY, + fetcher=FETCHER, + format="sword-v2-atom-codemeta-v2-in-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df" + ), + discovery_date=datetime.datetime( + 2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(dest_original_artifacts).encode(), + origin=origin_url, + ), + ] + ), + ] + + +def test_deposit_3_and_no_swhid(): + extrinsic_metadata = { + "id": "hal-02337300", + "@xmlns": "http://www.w3.org/2005/Atom", + "author": {"name": "HAL", "email": "hal@ccsd.cnrs.fr"}, + "client": "hal", + "codemeta:url": "https://hal.archives-ouvertes.fr/hal-02337300", + "codemeta:name": "R package SMM, Simulation and Estimation of Multi-State Discrete-Time Semi-Markov and Markov Models", + "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", + "codemeta:author": [ + # ... + ], + # ... + } + original_artifacts = [ + # ... 
+ ] + + row = { + "id": b"\x91\xe5\xca\x8b'K\xf1\xa8cFd2\xd7Q\xf7A\xbc\x94\xba&", + "date": datetime.datetime(2017, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + "committer_date": datetime.datetime( + 2019, 11, 6, 14, 47, 30, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"hal: Deposit 342 in collection hal", + "metadata": {**extrinsic_metadata, "original_artifact": original_artifacts,}, + } + storage = Mock() + + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [] + + +def test_deposit_3_and_unknown_deposit(): + extrinsic_metadata = { + "title": "Je suis GPL", + "@xmlns": "http://www.w3.org/2005/Atom", + "client": "swh", + "codemeta:url": "https://forge.softwareheritage.org/source/jesuisgpl/", + "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", + "codemeta:author": { + "codemeta:name": "Stefano Zacchiroli", + "codemeta:jobTitle": "Maintainer", + }, + # ... + } + + row = { + "id": b"\x8e\x9c\xee\x14\xa6\xad9\xbc\xa44pw\xb8\x7f\xb5\xbb\xd8\x95;\xb1", + "date": datetime.datetime( + 2018, 7, 23, 12, 25, 45, 907132, tzinfo=datetime.timezone.utc + ), + "committer_date": datetime.datetime( + 2018, 7, 23, 12, 25, 45, 907132, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"swh: Deposit 159 in collection swh", + "metadata": extrinsic_metadata, + } + + # no origin is looked up here: handle_row is expected not to call storage at all + + storage = Mock() + + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [] + + +def test_deposit_4_without_xmlns(): + extrinsic_metadata = { + "{http://www.w3.org/2005/Atom}id": "hal-01243573", + "{http://www.w3.org/2005/Atom}author": { + "{http://www.w3.org/2005/Atom}name": "HAL", + "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr", + }, + "{http://www.w3.org/2005/Atom}client": "hal", + "{http://www.w3.org/2005/Atom}external_identifier": 
"hal-01243573", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://hal-test.archives-ouvertes.fr/hal-01243573", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": { + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter" + }, + # ... + } + + row = { + "id": b"\x03\x98\x7f\x05n\xafE\x96\xcd \xd7\xb2\xee\x01\xc9\xb8L\xed\xdf\xa8", + "date": datetime.datetime( + 2018, 1, 17, 12, 49, 30, 902891, tzinfo=datetime.timezone.utc + ), + "committer_date": datetime.datetime( + 2018, 1, 17, 12, 49, 30, 902891, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b": Deposit 79 in collection hal", + "metadata": extrinsic_metadata, + } + + swhid = ( + "swh:1:dir:e04b2a7b8a8838da0693e9fd992a10d6fd211b50" + ";origin=https://hal.archives-ouvertes.fr/hal-01243573" + ";visit=swh:1:snp:c31851534c86676a040fb10f438728c90f1c9d55" + ";anchor=swh:1:rev:43549ebbe70c9cdf0be1647e6319392eaa06f3a3" + ";path=/" + ) + deposit_rows = [ + { + "deposit.id": 79, + "deposit.external_id": "hal-01243573", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": None, + "deposit_request.date": datetime.datetime( + 2018, 1, 17, 12, 49, 31, 208347, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/", + "deposit_collection.name": "hal", + "auth_user.username": "hal", + }, + { + "deposit.id": 79, + "deposit.external_id": "hal-01243573", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": extrinsic_metadata, + "deposit_request.date": datetime.datetime( + 2018, 1, 17, 12, 49, 30, 645576, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/", + "deposit_collection.name": "hal", + "auth_user.username": "hal", + }, + ] + + origin_url = "https://hal.archives-ouvertes.fr/hal-01243573" + + storage = Mock() + + 
def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = get_mock_deposit_cur(deposit_rows) + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + deposit_cur.execute.assert_called_once() + deposit_cur.__iter__.assert_called_once() + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:03987f056eaf4596cd20d7b2ee01c9b84ceddfa8" + ), + discovery_date=datetime.datetime( + 2018, 1, 17, 12, 49, 30, 645576, tzinfo=datetime.timezone.utc + ), + authority=HAL_AUTHORITY, + fetcher=FETCHER, + format="sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + # note: no original artifacts + ] + + +def test_deposit_4_wrong_origin(): + extrinsic_metadata = { + "{http://www.w3.org/2005/Atom}id": "hal-01588781", + "{http://www.w3.org/2005/Atom}author": { + "{http://www.w3.org/2005/Atom}name": "HAL", + "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr", + }, + "{http://www.w3.org/2005/Atom}client": "hal", + "{http://www.w3.org/2005/Atom}external_identifier": "hal-01588781", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem ", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": { + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter", + "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation": "Initiative pour la Recherche et l'Innovation sur le Logiciel Libre", + }, + # ... 
+ } + + row = { + "id": b"-{\xcec\x1f\xc7\x91\x08\x03\x11\xeb\x83\\GB\x8eXjn\xa4", + "date": datetime.datetime( + 2018, 1, 10, 13, 14, 51, 77033, tzinfo=datetime.timezone.utc + ), + "committer_date": datetime.datetime( + 2018, 1, 10, 13, 14, 51, 77033, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b": Deposit 75 in collection hal", + "metadata": extrinsic_metadata, + } + + swhid = ( + "swh:1:dir:d8971c651fe256942aa4499a3ccdbaa305d3bade" + ";origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01588781" + ";visit=swh:1:snp:7c70cc8ea5b79e376605fd6e9b3b04d98861ffc0" + ";anchor=swh:1:rev:2d7bce631fc791080311eb835c47428e586a6ea4" + ";path=/" + ) + deposit_rows = [ + { + "deposit.id": 75, + "deposit.external_id": "hal-01588781", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": None, + "deposit_request.date": datetime.datetime( + 2018, 1, 10, 13, 14, 51, 523963, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/", + "deposit_collection.name": "hal", + "auth_user.username": "hal", + }, + { + "deposit.id": 75, + "deposit.external_id": "hal-01588781", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": extrinsic_metadata, + "deposit_request.date": datetime.datetime( + 2018, 1, 10, 13, 14, 50, 555143, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/", + "deposit_collection.name": "hal", + "auth_user.username": "hal", + }, + ] + + origin_url = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = get_mock_deposit_cur(deposit_rows) + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + deposit_cur.execute.assert_called_once() + 
deposit_cur.__iter__.assert_called_once() + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:2d7bce631fc791080311eb835c47428e586a6ea4" + ), + discovery_date=datetime.datetime( + 2018, 1, 10, 13, 14, 50, 555143, tzinfo=datetime.timezone.utc + ), + authority=HAL_AUTHORITY, + fetcher=FETCHER, + format="sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + # note: no original artifacts + ] + + +def test_deposit_missing_metadata_in_revision(): + extrinsic_metadata = { + "id": "hal-01243573", + "@xmlns": "http://www.w3.org/2005/Atom", + "author": {"name": "HAL", "email": "hal@ccsd.cnrs.fr"}, + "client": "hal", + "committer": "Administrateur Du Ccsd", + "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243573", + "codemeta:name": "The assignment problem", + "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", + "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, + "codemeta:version": "1", + "codemeta:identifier": {"#text": "10.5281/zenodo.438684", "@name": "doi",}, + "external_identifier": "hal-01243573", + "codemeta:dateCreated": "2017-11-16T14:54:23+01:00", + } + source_original_artifacts = [ + { + "name": "archive.zip", + "sha1": "e8e46324970cd5af7f98c5a86f33f47fa4a41b4a", + "length": 118650, + "sha256": "fec81b63d666c43524f966bbd3263da5bee55051d2b48c1659cca5f56fd953e5", + "sha1_git": "9da2bbd08bec590b36ede2ed43d74cd510b10a79", + "blake2s256": "5d0973ba3644cc2bcfdb41ff1891744337d6aa9547a7e59fe466f684b027f295", + "archive_type": "zip", + } + ] + dest_original_artifacts = [ + { + "length": 118650, + "archive_type": "zip", + "filename": "archive.zip", + "checksums": { + "sha1": "e8e46324970cd5af7f98c5a86f33f47fa4a41b4a", + "sha256": "fec81b63d666c43524f966bbd3263da5bee55051d2b48c1659cca5f56fd953e5", + 
"sha1_git": "9da2bbd08bec590b36ede2ed43d74cd510b10a79", + "blake2s256": "5d0973ba3644cc2bcfdb41ff1891744337d6aa9547a7e59fe466f684b027f295", + }, + } + ] + + row = { + "id": b"\x03@v\xf3\xf4\x1e\xe1 N\xb9\xf6@\x82\xcb\xe6\xe9P\xd7\xbb\x8a", + "date": datetime.datetime( + 2019, 2, 25, 15, 49, 16, 594536, tzinfo=datetime.timezone.utc + ), + "committer_date": datetime.datetime( + 2019, 2, 25, 15, 49, 16, 594536, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"hal: Deposit 229 in collection hal", + "metadata": {"original_artifact": source_original_artifacts}, + } + + swhid = ( + "swh:1:dir:3d65b6f065118cb856272829b459f0dfa55549aa" + ";origin=https://hal-test.archives-ouvertes.fr/hal-01243573" + ";visit=swh:1:snp:322c54ff4023d3216a994bc9ff9ee524ed80ee1f" + ";anchor=swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a" + ";path=/" + ) + deposit_rows = [ + { + "deposit.id": 229, + "deposit.external_id": "hal-01243573", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": None, + "deposit_request.date": datetime.datetime( + 2019, 2, 25, 15, 54, 30, 102072, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/", + "deposit_collection.name": "hal", + "auth_user.username": "hal", + }, + { + "deposit.id": 229, + "deposit.external_id": "hal-01243573", + "deposit.swh_id_context": swhid, + "deposit.status": "success", + "deposit_request.metadata": extrinsic_metadata, + "deposit_request.date": datetime.datetime( + 2019, 2, 25, 15, 49, 12, 302745, tzinfo=datetime.timezone.utc + ), + "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/", + "deposit_collection.name": "hal", + "auth_user.username": "hal", + }, + ] + + origin_url = "https://hal.archives-ouvertes.fr/hal-01243573" + # /!\ not https://hal-test.archives-ouvertes.fr/hal-01243573 + # do not trust the metadata! 
+ + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = get_mock_deposit_cur(deposit_rows) + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + deposit_cur.execute.assert_called_once() + deposit_cur.__iter__.assert_called_once() + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a" + ), + discovery_date=datetime.datetime( + 2019, 2, 25, 15, 49, 12, 302745, tzinfo=datetime.timezone.utc + ), + authority=HAL_AUTHORITY, + fetcher=FETCHER, + format="sword-v2-atom-codemeta-v2-in-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a" + ), + discovery_date=datetime.datetime( + 2019, 2, 25, 15, 54, 30, 102072, tzinfo=datetime.timezone.utc + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(dest_original_artifacts).encode(), + origin=origin_url, + ), + ] + ), + ] diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py b/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py new file mode 100644 index 00000000..a0479357 --- /dev/null +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py @@ -0,0 +1,108 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa +# because of long lines + +import copy +import datetime +import json +from unittest.mock import call, 
Mock + +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + +from swh.storage.migrate_extrinsic_metadata import handle_row, cran_package_from_url + + +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", +) +SWH_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, +) + + +def test_gnu(): + original_artifacts = [ + { + "length": 842501, + "filename": "gperf-3.0.1.tar.gz", + "checksums": { + "sha1": "c4453ee492032b369006ee464f4dd4e2c0c0e650", + "sha256": "5be283ef62e1bd26abdaaf88b416dbea4b14c360b09befcda2f055656dc43f87", + "sha1_git": "bf1d5bb57d571101dd7b6acab2b78ae11bb861de", + "blake2s256": "661f84afeb1e0b914defe2b249d424af1dfe380a96016b3282ae758c70e19a70", + }, + } + ] + + row = { + "id": b"\x00\x1cqE\x8e@[%\xba\xcc\xc8\x0b\x99\xf6cM\xff\x9d+\x18", + "date": datetime.datetime(2003, 6, 13, 0, 11, tzinfo=datetime.timezone.utc), + "committer_date": datetime.datetime( + 2003, 6, 13, 0, 11, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"swh-loader-package: synthetic revision message", + "metadata": { + "extrinsic": { + "raw": { + "url": "https://ftp.gnu.org/gnu/gperf/gperf-3.0.1.tar.gz", + "time": "2003-06-13T00:11:00+00:00", + "length": 842501, + "version": "3.0.1", + "filename": "gperf-3.0.1.tar.gz", + }, + "when": "2019-11-27T11:17:38.318997+00:00", + "provider": "https://ftp.gnu.org/gnu/gperf/", + }, + "intrinsic": {}, + "original_artifact": original_artifacts, + }, + } + + origin_url = "https://ftp.gnu.org/gnu/gperf/" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ 
+ call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:001c71458e405b25baccc80b99f6634dff9d2b18" + ), + discovery_date=datetime.datetime( + 2019, 11, 27, 11, 17, 38, 318997, tzinfo=datetime.timezone.utc + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(original_artifacts).encode(), + origin=origin_url, + ), + ] + ), + ] diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py b/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py new file mode 100644 index 00000000..58541e67 --- /dev/null +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py @@ -0,0 +1,124 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa +# because of long lines + +import copy +import datetime +import json +from unittest.mock import call, Mock + +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + +from swh.storage.migrate_extrinsic_metadata import handle_row, cran_package_from_url + + +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", +) +SWH_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, +) +NIX_UNSTABLE_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.FORGE, + url="https://nix-community.github.io/nixpkgs-swh/sources-unstable.json", + metadata={}, +) + + +def test_nixguix(): + extrinsic_metadata = { + "url": "https://files.pythonhosted.org/packages/source/a/alerta/alerta-7.4.5.tar.gz", + 
"integrity": "sha256-km8RAaG1ep+tYR8eHVr3UWk+/MNEqdsBr1Di/g02LYQ=", + } + original_artifacts = [ + { + "length": 34903, + "filename": "alerta-7.4.5.tar.gz", + "checksums": { + "sha1": "66db4398b664de272fd5aa6610caa776b5e64651", + "sha256": "926f1101a1b57a9fad611f1e1d5af751693efcc344a9db01af50e2fe0d362d84", + }, + } + ] + + row = { + "id": b"\x00\x01\xbaM\xd0S\x94\x85\x02\x11\xd7\xb3\x85M\x99\x13\xd2:\xe3y", + "date": None, + "committer_date": None, + "type": "tar", + "message": b"", + "metadata": { + "extrinsic": { + "raw": extrinsic_metadata, + "when": "2020-06-03T11:25:05.259341+00:00", + "provider": "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json", + }, + "original_artifact": original_artifacts, + }, + } + + origin_url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:0001ba4dd05394850211d7b3854d9913d23ae379" + ), + discovery_date=datetime.datetime( + 2020, 6, 3, 11, 25, 5, 259341, tzinfo=datetime.timezone.utc + ), + authority=NIX_UNSTABLE_AUTHORITY, + fetcher=FETCHER, + format="nixguix-sources-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:0001ba4dd05394850211d7b3854d9913d23ae379" + ), + discovery_date=datetime.datetime( + 2020, 6, 3, 11, 25, 5, 259341, tzinfo=datetime.timezone.utc + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(original_artifacts).encode(), + 
origin=origin_url, + ), + ] + ), + ] diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py b/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py new file mode 100644 index 00000000..ae82d559 --- /dev/null +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py @@ -0,0 +1,376 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa +# because of long lines + +import copy +import datetime +import json +from unittest.mock import call, Mock + +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + +from swh.storage.migrate_extrinsic_metadata import ( + handle_row, + npm_package_from_source_url, +) + + +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", +) +NPM_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}, +) +SWH_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, +) + + +def test_npm_package_from_source_url(): + package_urls = [ + ( + "@l3ilkojr/jdinsults", + "https://registry.npmjs.org/@l3ilkojr/jdinsults/-/jdinsults-3.0.0.tgz", + ), + ("simplemaps", "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz"), + ( + "@piximi/components", + "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz", + ), + ( + "@chappa'ai/get-next-rc", + "https://registry.npmjs.org/@chappa%27ai/get-next-rc/-/get-next-rc-1.0.0.tgz", + ), + ] + + for (package_name, source_url) in package_urls: + assert npm_package_from_source_url(source_url) == package_name + + +def test_npm_1(): + """Tests loading a revision 
generated by a new NPM loader that + has a provider.""" + + extrinsic_metadata = { + "_id": "@l3ilkojr/jdinsults@3.0.0", + "dist": { + "shasum": "b7f0d66090e0285f4e95d082d39bcb0c1b8f4ec8", + "tarball": "https://registry.npmjs.org/@l3ilkojr/jdinsults/-/jdinsults-3.0.0.tgz", + "fileCount": 4, + "integrity": "sha512-qpv8Zg51g0l51VjODEooMUGSGanGUuQpzX5msfR7ZzbgTsgPbpDNyTIsQ0wQzI9RzCCUjS84Ii2VhMISEQcEUA==", + "unpackedSize": 1583, + "npm-signature": "-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v3.0.4\r\nComment: https://openpgpjs.org\r\n\r\nwsFcBAEBCAAQBQJeUMS5CRA9TVsSAnZWagAAXpgP/0YgNOWN0U/Fz2RGeQhR\nVIKPvfGqZ2UfFxxUXWIc4QHvwyLCNUedCctpVdqnqmGJ9m/hj3K2zbRPD7Tm\n3nPl0HfzE7v3T8TDZfGhzW3c9mWxig+syr+sjo0EKyAgZVJ0mxbjOl4KHt+U\nQEwl/4falBsyYtK/pkCXWmmuC606QmPn/c6ZRD1Fw4vJjT9i5qi1KaBkIf6M\nnFmpOFxTcwxGGltOk3s3TKDtr8CIeWmdm3VkgsP2ErkPKAOcu12AT4/5tkg0\nDU+m1XmJb67rskb4Ncjvic/VutnPkEfNrk1IRXrmjDZBQbHtCJ7hd5ETmb9S\nE5WmMV8cpaGiW7AZvGTmkn5WETwQQU7po914zYiMg9+ozdwc7yC8cpGj/UoF\niKxsc1uxdfwWk/p3dShegEYM7sveloIXYsPaxbd84WRIfnwkWFZV82op96E3\neX+FRkhMfsHlK8OjZsBPXkppaB48jnZdm3GOOzT9YgyphV33j3J9GnNcDMDe\nriyCLV1BNSKDHElCDrvl1cBGg+C5qn/cTYjQdfEPPY2Hl2MgW9s4UV2s+YSx\n0BBd2A3j80wncP+Y7HFeC4Pv0SM0Pdq6xJaf3ELhj6j0rVZeTW1O3E/PFLXK\nnn/DZcsFXgIzjY+eBIMQgAhqyeJve8LeQNnGt3iNW10E2nZMpfc+dn0ESiwV\n2Gw4\r\n=8uqZ\r\n-----END PGP SIGNATURE-----\r\n", + }, + "name": "@l3ilkojr/jdinsults", + "version": "3.0.0", + "_npmUser": {"name": "l3ilkojr", "email": "l3ilkojr@example.com"}, + "_npmVersion": "6.13.6", + "description": "Generates insults", + "directories": {}, + "maintainers": [{"name": "l3ilkojr", "email": "l3ilkojr@example.com"}], + "_nodeVersion": "10.14.0", + "_hasShrinkwrap": False, + "_npmOperationalInternal": { + "tmp": "tmp/jdinsults_3.0.0_1582351545285_0.2614827716102821", + "host": "s3://npm-registry-packages", + }, + } + + original_artifacts = [ + { + "length": 1033, + "filename": "jdinsults-3.0.0.tgz", + "checksums": { + "sha1": "b7f0d66090e0285f4e95d082d39bcb0c1b8f4ec8", + 
"sha256": "42f22795ac883b02fded0b2bf3d8a77f6507d40bc67f28eea6b1b73eb59c515f", + }, + } + ] + + row = { + "id": b"\x00\x00\x02\xa4\x9b\xba\x17\xca\x8c\xf3\x7f_=\x16\xaa\xac\xf9S`\xfc", + "date": datetime.datetime(2020, 2, 22, 6, 5, 45, tzinfo=datetime.timezone.utc), + "committer_date": datetime.datetime( + 2020, 2, 22, 6, 5, 45, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"3.0.0", + "metadata": { + "extrinsic": { + "raw": extrinsic_metadata, + "when": "2020-02-27T01:35:47.965375+00:00", + "provider": "https://replicate.npmjs.com/%40l3ilkojr%2Fjdinsults/", + }, + "intrinsic": { + "raw": {"name": "@l3ilkojr/jdinsults", "version": "3.0.0"}, + "tool": "package.json", + }, + "original_artifact": original_artifacts, + }, + } + + origin_url = "https://www.npmjs.com/package/@l3ilkojr/jdinsults" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000002a49bba17ca8cf37f5f3d16aaacf95360fc" + ), + discovery_date=datetime.datetime( + 2020, 2, 27, 1, 35, 47, 965375, tzinfo=datetime.timezone.utc, + ), + authority=NPM_AUTHORITY, + fetcher=FETCHER, + format="replicate-npm-package-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000002a49bba17ca8cf37f5f3d16aaacf95360fc" + ), + discovery_date=datetime.datetime( + 2020, 2, 27, 1, 35, 47, 965375, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + 
metadata=json.dumps(original_artifacts).encode(), + origin=origin_url, + ), + ] + ), + ] + + +def test_npm_2_unscoped(): + """Tests loading a revision generated by an old NPM loader that doesn't + have a provider; and the package name is unscoped (ie. doesn't contain a + slash).""" + + extrinsic_metadata = { + "bugs": {"url": "https://github.com/niwasawa/simplemaps/issues"}, + "name": "simplemaps", + "author": "Naoki Iwasawa", + "license": "MIT", + # ... + } + + package_source = { + "url": "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz", + "date": "2016-12-23T07:21:29.733Z", + "name": "simplemaps", + "sha1": "e2b8222930196def764527f5c61048c5b28fe3c4", + "sha256": "3ce94927bab5feafea5695d1fa4c2b8131413e53e249b32f9ac2ccff4d865a0b", + "version": "0.0.6", + "filename": "simplemaps-0.0.6.tgz", + "blake2s256": "6769b4009f8162be2e745604b153443d4907a85781d31a724217a3e2d42a7462", + } + + original_artifacts = [ + { + "filename": "simplemaps-0.0.6.tgz", + "checksums": { + "sha1": "e2b8222930196def764527f5c61048c5b28fe3c4", + "sha256": "3ce94927bab5feafea5695d1fa4c2b8131413e53e249b32f9ac2ccff4d865a0b", + "blake2s256": "6769b4009f8162be2e745604b153443d4907a85781d31a724217a3e2d42a7462", + }, + "url": "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz", + } + ] + + row = { + "id": b"\x00\x00\x04\xae\xed\t\xee\x08\x9cx\x12d\xc0M%d\xfdX\xfe\xb5", + "date": datetime.datetime( + 2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc + ), + "committer_date": datetime.datetime( + 2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"0.0.6", + "metadata": {"package": extrinsic_metadata, "package_source": package_source,}, + } + + origin_url = "https://www.npmjs.com/package/simplemaps" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, 
dry_run=False) + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000004aeed09ee089c781264c04d2564fd58feb5" + ), + discovery_date=datetime.datetime( + 2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc, + ), + authority=NPM_AUTHORITY, + fetcher=FETCHER, + format="replicate-npm-package-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000004aeed09ee089c781264c04d2564fd58feb5" + ), + discovery_date=datetime.datetime( + 2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(original_artifacts).encode(), + origin=origin_url, + ), + ] + ), + ] + + +def test_npm_2_scoped(): + """Tests loading a revision generated by an old NPM loader that doesn't + have a provider; and the package name is scoped (ie. in the format + @org/name).""" + + extrinsic_metadata = { + "bugs": {"url": "https://github.com/piximi/components/issues"}, + "name": "@piximi/components", + # ... 
+ } + + package_source = { + "url": "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz", + "date": "2019-06-07T19:56:04.753Z", + "name": "@piximi/components", + "sha1": "4ab74e563cb61bb5b2022601a5133a2dd19d19ec", + "sha256": "69bb980bd6de3277b6bca86fd79c91f1c28db6910c8d03ecd05b32b78a35188f", + "version": "0.1.11", + "filename": "components-0.1.11.tgz", + "blake2s256": "ce33181d5eff25b70ffdd6f1a18acd472a1707ede23cd2adc6af272dfc40dbfd", + } + + original_artifacts = [ + { + "filename": "components-0.1.11.tgz", + "checksums": { + "sha1": "4ab74e563cb61bb5b2022601a5133a2dd19d19ec", + "sha256": "69bb980bd6de3277b6bca86fd79c91f1c28db6910c8d03ecd05b32b78a35188f", + "blake2s256": "ce33181d5eff25b70ffdd6f1a18acd472a1707ede23cd2adc6af272dfc40dbfd", + }, + "url": "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz", + } + ] + + row = { + "id": b"\x00\x00 \x19\xc5wXt\xbc\xed\x00zR\x9b\xd3\xb7\x8b\xf6\x04W", + "date": datetime.datetime(2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc), + "committer_date": datetime.datetime( + 2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"0.1.11", + "metadata": {"package": extrinsic_metadata, "package_source": package_source,}, + } + + origin_url = "https://www.npmjs.com/package/@piximi/components" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:00002019c5775874bced007a529bd3b78bf60457" + ), + discovery_date=datetime.datetime( + 2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc, + ), + authority=NPM_AUTHORITY, + fetcher=FETCHER, + format="replicate-npm-package-json", 
+ metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:00002019c5775874bced007a529bd3b78bf60457" + ), + discovery_date=datetime.datetime( + 2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(original_artifacts).encode(), + origin=origin_url, + ), + ] + ), + ] diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py new file mode 100644 index 00000000..a84b042d --- /dev/null +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py @@ -0,0 +1,356 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa +# because of long lines + +import copy +import datetime +import json +from unittest.mock import call, Mock + +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + +from swh.storage.migrate_extrinsic_metadata import ( + handle_row, + pypi_project_from_filename, +) + + +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", +) +PYPI_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={}, +) +SWH_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, +) + + +def test_pypi_project_from_filename(): + files = [ + ("django-agent-trust-0.1.8.tar.gz", "django-agent-trust"), + ("python_test-1.0.1.zip", "python_test"), 
+ ("py-evm-0.2.0a9.tar.gz", "py-evm"), + ("collective.texttospeech-1.0rc1.tar.gz", "collective.texttospeech"), + ("flatland-fork-0.4.post1.dev40550160.zip", "flatland-fork"), + ] + + for (filename, project) in files: + assert pypi_project_from_filename(filename) == project + + +def test_pypi_1(): + """Tests loading a revision generated by a new PyPI loader that + has a provider.""" + + extrinsic_metadata = { + "url": "https://files.pythonhosted.org/packages/70/89/a498245baf1bf3dde73d3da00b4b067a8aa7c7378ad83472078803ea3e43/m3-ui-2.2.73.tar.gz", + "size": 3933168, + "digests": { + "md5": "a374ac3f655e97df5db5335e2142d344", + "sha256": "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d", + }, + "has_sig": False, + "filename": "m3-ui-2.2.73.tar.gz", + "downloads": -1, + "md5_digest": "a374ac3f655e97df5db5335e2142d344", + "packagetype": "sdist", + "upload_time": "2019-11-11T06:21:20", + "comment_text": "", + "python_version": "source", + "requires_python": None, + "upload_time_iso_8601": "2019-11-11T06:21:20.073082Z", + } + + original_artifacts = [ + { + "length": 3933168, + "filename": "m3-ui-2.2.73.tar.gz", + "checksums": { + "sha1": "9f4ec7ce64b7fea4b122e85d47ea31146c367b03", + "sha256": "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d", + }, + } + ] + + row = { + "id": b"\x00\x00\x07a{S\xe7\xb1E\x8fi]\xd0}\xe4\xceU\xaf\x15\x17", + "date": datetime.datetime( + 2019, 11, 11, 6, 21, 20, tzinfo=datetime.timezone.utc, + ), + "committer_date": datetime.datetime( + 2019, 11, 11, 6, 21, 20, tzinfo=datetime.timezone.utc, + ), + "type": "tar", + "message": b"2.2.73", + "metadata": { + "extrinsic": { + "raw": extrinsic_metadata, + "when": "2020-01-23T18:43:09.109407+00:00", + "provider": "https://pypi.org/pypi/m3-ui/json", + }, + "intrinsic": { + "raw": { + "name": "m3-ui", + "summary": "======", + "version": "2.2.73", + # ... 
+ "metadata_version": "1.1", + }, + "tool": "PKG-INFO", + }, + "original_artifact": original_artifacts, + }, + } + + origin_url = "https://pypi.org/project/m3-ui/" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517" + ), + discovery_date=datetime.datetime( + 2020, 1, 23, 18, 43, 9, 109407, tzinfo=datetime.timezone.utc, + ), + authority=PYPI_AUTHORITY, + fetcher=FETCHER, + format="pypi-project-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517" + ), + discovery_date=datetime.datetime( + 2020, 1, 23, 18, 43, 9, 109407, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(original_artifacts).encode(), + origin=origin_url, + ), + ] + ), + ] + + +def test_pypi_2(): + """Tests loading a revision generated by an old PyPI loader that + does not have a provider, but has 'project' metadata.""" + + extrinsic_metadata = { + "name": "jupyterhub-simx", + "author": "Jupyter Development Team", + "license": "BSD", + "summary": "JupyterHub: A multi-user server for Jupyter notebooks", + "version": "1.0.5", + # ... 
+ } + + source_original_artifacts = [ + { + "url": "https://files.pythonhosted.org/packages/72/28/a8098763d78e2c4607cb67602c0d726a97ac38d4c1f531aac28f49de2e1a/jupyterhub-simx-1.0.5.tar.gz", + "date": "2019-01-23T22:10:55", + "sha1": "ede3eadd5a06e70912e3ba7cfccef789c4ad3168", + "size": 2346538, + "sha256": "0399d7f5f0d90c525d369f0507ad0e8ef8729c1c7fa63aadfc46a27514d14a46", + "filename": "jupyterhub-simx-1.0.5.tar.gz", + "sha1_git": "734301124712182eb30fc90e97cc18cef5432f02", + "blake2s256": "bb4aa82ffb5891a05dcf6d4dce3ad56fd2c18e9abdba9d20972910649d869322", + "archive_type": "tar", + } + ] + + dest_original_artifacts = [ + { + "url": "https://files.pythonhosted.org/packages/72/28/a8098763d78e2c4607cb67602c0d726a97ac38d4c1f531aac28f49de2e1a/jupyterhub-simx-1.0.5.tar.gz", + "filename": "jupyterhub-simx-1.0.5.tar.gz", + "archive_type": "tar", + "length": 2346538, + "checksums": { + "sha1": "ede3eadd5a06e70912e3ba7cfccef789c4ad3168", + "sha256": "0399d7f5f0d90c525d369f0507ad0e8ef8729c1c7fa63aadfc46a27514d14a46", + "sha1_git": "734301124712182eb30fc90e97cc18cef5432f02", + "blake2s256": "bb4aa82ffb5891a05dcf6d4dce3ad56fd2c18e9abdba9d20972910649d869322", + }, + } + ] + + row = { + "id": b"\x00\x00\x04\xd68,J\xd4\xc0Q\x92fbl6U\x1f\x0eQ\xca", + "date": datetime.datetime( + 2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc + ), + "committer_date": datetime.datetime( + 2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"1.0.5", + "metadata": { + "project": extrinsic_metadata, + "original_artifact": source_original_artifacts, + }, + } + + origin_url = "https://pypi.org/project/jupyterhub-simx/" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.raw_extrinsic_metadata_add( + [ + 
RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca" + ), + discovery_date=datetime.datetime( + 2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc, + ), + authority=PYPI_AUTHORITY, + fetcher=FETCHER, + format="pypi-project-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=None, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca" + ), + discovery_date=datetime.datetime( + 2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(dest_original_artifacts).encode(), + origin=None, + ), + ] + ), + ] + + +def test_pypi_3(): + """Tests loading a revision generated by a very old PyPI loader that + does not have a provider or has 'project' metadata.""" + + source_original_artifact = { + "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz", + "date": "2014-05-07T22:03:00", + "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12", + "size": 46644, + "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824", + "filename": "PyPDFLite-0.1.32.tar.gz", + "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5", + "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385", + "archive_type": "tar", + } + + dest_original_artifacts = [ + { + "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz", + "filename": "PyPDFLite-0.1.32.tar.gz", + "archive_type": "tar", + "length": 46644, + "checksums": { + "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12", + "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824", + "sha1_git": 
"1e5c38014731242cfa8594839bcba8a0c4e158c5", + "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385", + }, + } + ] + + row = { + "id": b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2", + "date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc), + "committer_date": datetime.datetime( + 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc + ), + "type": "tar", + "message": b"0.1.32", + "metadata": {"original_artifact": source_original_artifact}, + } + + origin_url = "https://pypi.org/project/PyPDFLite/" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2" + ), + discovery_date=datetime.datetime( + 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifacts-json", + metadata=json.dumps(dest_original_artifacts).encode(), + origin=None, + ), + ] + ), + ]