Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7122863
D3820.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
135 KB
Subscribers
None
D3820.diff
View Options
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -24,6 +24,9 @@
[mypy-django.*]
ignore_missing_imports = True
+[mypy-iso8601.*]
+ignore_missing_imports = True
+
[mypy-msgpack.*]
ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@
deprecated
typing-extensions
mypy_extensions
+iso8601
diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -0,0 +1,903 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""This is an executable script to migrate extrinsic revision metadata from
+the revision table to the new extrinsic metadata storage.
+
+This is designed to be as conservative as possible, following this principle:
+for each revision the script reads (in "handle_row"), it will read some of the
+fields, write them directly to the metadata storage, and remove them.
+Then it checks all the remaining fields are in a hardcoded list of fields that
+are known not to require migration.
+
+This means that every field that isn't migrated was explicitly reviewed while
+writing this script.
+
+Additionally, this script contains many assertions to prevent false positives
+in its heuristics.
+"""
+
+import datetime
+import hashlib
+import json
+import os
+import re
+import sys
+from typing import Any, Dict, Optional
+from urllib.parse import unquote, urlparse
+
+import iso8601
+
+from swh.core.db import BaseDb
+from swh.model.hashutil import hash_to_hex
+from swh.model.identifiers import SWHID, parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
+)
+from swh.storage import get_storage
+
+# XML namespaces and fields for metadata coming from the deposit:
+
+CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"
+ATOM_NS = "http://www.w3.org/2005/Atom"
+ATOM_KEYS = ["id", "author", "external_identifier", "title"]
+
+# columns of the revision table (of the storage DB)
+REVISION_COLS = ["id", "date", "committer_date", "type", "message", "metadata"]
+
+# columns of the tables of the deposit DB
+DEPOSIT_COLS = [
+ "deposit.id",
+ "deposit.external_id",
+ "deposit.swh_id_context",
+ "deposit.status",
+ "deposit_request.metadata",
+ "deposit_request.date",
+ "deposit_client.provider_url",
+ "deposit_collection.name",
+ "auth_user.username",
+]
+
+# Formats we write to the extrinsic metadata storage
+OLD_DEPOSIT_FORMAT = (
+ "sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces" # before february 2018
+)
+NEW_DEPOSIT_FORMAT = "sword-v2-atom-codemeta-v2-in-json" # after february 2018
+GNU_FORMAT = "gnu-tree-json"
+NIXGUIX_FORMAT = "nixguix-sources-json"
+NPM_FORMAT = "replicate-npm-package-json"
+ORIGINAL_ARTIFACT_FORMAT = "original-artifacts-json"
+PYPI_FORMAT = "pypi-project-json"
+
+# Information about this script, for traceability
+FETCHER = MetadataFetcher(
+ name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+
+# Authorities that we got the metadata from
+AUTHORITIES = {
+ "npmjs": MetadataAuthority(
+ type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}
+ ),
+ "pypi": MetadataAuthority(
+ type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={}
+ ),
+ "gnu": MetadataAuthority(
+ type=MetadataAuthorityType.FORGE, url="https://ftp.gnu.org/", metadata={}
+ ),
+ "swh": MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+ ), # for original_artifact (which are checksums computed by SWH)
+}
+
+# Regular expression for the format of revision messages written by the
+# deposit loader
+deposit_revision_message_re = re.compile(
+ b"(?P<client>[a-z]*): "
+ b"Deposit (?P<deposit_id>[0-9]+) in collection (?P<collection>[a-z]+).*"
+)
+
+
+# not reliable, because PyPI allows arbitrary names
+def pypi_project_from_filename(filename):
+ match = re.match(
+ r"^(?P<project_name>[a-zA-Z0-9_.-]+)"
+ r"-[0-9.]+([a-z]+[0-9]+)?(\.dev[0-9]+)?\.(tar\.gz|zip)$",
+ filename,
+ )
+ assert match, filename
+ return match.group("project_name")
+
+
+def cran_package_from_url(filename):
+ match = re.match(
+ r"^https://cran\.r-project\.org/src/contrib/"
+ r"(?P<package_name>[a-zA-Z0-9.]+)_[0-9.-]+(\.tar\.gz)?$",
+ filename,
+ )
+ assert match, filename
+ return match.group("package_name")
+
+
+def npm_package_from_source_url(package_source_url):
+ match = re.match(
+ "^https://registry.npmjs.org/(?P<package_name>.*)/-/[^/]+.tgz$",
+ package_source_url,
+ )
+ assert match, package_source_url
+ return unquote(match.group("package_name"))
+
+
+def remove_atom_codemeta_metadata_with_xmlns(metadata):
+ """Removes all known Atom and Codemeta metadata fields from the dict,
+ assuming this is a dict generated by xmltodict without expanding namespaces.
+ """
+ keys_to_remove = ATOM_KEYS + ["@xmlns", "@xmlns:codemeta"]
+ for key in list(metadata):
+ if key.startswith("codemeta:") or key in keys_to_remove:
+ del metadata[key]
+
+
+def remove_atom_codemeta_metadata_without_xmlns(metadata):
+ """Removes all known Atom and Codemeta metadata fields from the dict,
+ assuming this is a dict generated by xmltodict with expanded namespaces.
+ """
+ for key in list(metadata):
+ if key.startswith(("{%s}" % ATOM_NS, "{%s}" % CODEMETA_NS)):
+ del metadata[key]
+
+
+# Cache of origins that are known to exist
+_origins = set()
+
+
+def assert_origin_exists(storage, origin):
+ assert (
+ hashlib.sha1(origin.encode()).digest() in _origins # very fast
+ or storage.origin_get([origin])[0] is not None # slow, but up to date
+ ), origin
+
+
+def load_metadata(
+ storage,
+ revision_id,
+ discovery_date: datetime.datetime,
+ metadata: Dict[str, Any],
+ format: str,
+ authority: MetadataAuthority,
+ origin: Optional[str],
+ dry_run: bool,
+):
+ """Does the actual loading to swh-storage."""
+ revision_swhid = SWHID(object_type="revision", object_id=hash_to_hex(revision_id))
+ obj = RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=revision_swhid,
+ discovery_date=discovery_date,
+ authority=authority,
+ fetcher=FETCHER,
+ format=format,
+ metadata=json.dumps(metadata).encode(),
+ origin=origin,
+ )
+ if not dry_run:
+ storage.raw_extrinsic_metadata_add([obj])
+
+
+def handle_deposit_row(
+ row,
+ discovery_date: Optional[datetime.datetime],
+ origin,
+ storage,
+ deposit_cur,
+ dry_run: bool,
+):
+ """Loads metadata from the deposit database (which is more reliable than the
+ metadata on the revision object, as some versions of the deposit loader were
+ a bit lossy; and they used very different formats for the field in the
+ revision table).
+ """
+ parsed_message = deposit_revision_message_re.match(row["message"])
+ assert parsed_message is not None, row["message"]
+
+ deposit_id = int(parsed_message.group("deposit_id"))
+ collection = parsed_message.group("collection").decode()
+ client_name = parsed_message.group("client").decode()
+
+ deposit_cur.execute(
+ f"SELECT {', '.join(DEPOSIT_COLS)} FROM deposit "
+ f"INNER JOIN deposit_collection "
+ f" ON (deposit.collection_id=deposit_collection.id) "
+ f"INNER JOIN deposit_client ON (deposit.client_id=deposit_client.user_ptr_id) "
+ f"INNER JOIN auth_user ON (deposit.client_id=auth_user.id) "
+ f"INNER JOIN deposit_request ON (deposit.id=deposit_request.deposit_id) "
+ f"WHERE deposit.id = %s",
+ (deposit_id,),
+ )
+
+ provider_urls = set()
+ swhids = set()
+ metadata_entries = []
+ dates = set()
+ external_identifiers = set()
+ for deposit_request_row in deposit_cur:
+ deposit_request = dict(zip(DEPOSIT_COLS, deposit_request_row))
+
+ # Sanity checks to make sure we selected the right deposit
+ assert deposit_request["deposit.id"] == deposit_id
+ assert deposit_request["deposit_collection.name"] == collection, deposit_request
+ if client_name != "":
+ # Sometimes it's missing from the commit message
+ assert deposit_request["auth_user.username"] == client_name
+
+ # Date of the deposit request (either the initial request, or subsequent ones)
+ date = deposit_request["deposit_request.date"]
+ dates.add(date)
+
+ assert deposit_request["deposit.swh_id_context"], deposit_request
+ external_identifiers.add(deposit_request["deposit.external_id"])
+ swhids.add(deposit_request["deposit.swh_id_context"])
+
+ # Client of the deposit
+ provider_urls.add(deposit_request["deposit_client.provider_url"])
+
+ metadata = deposit_request["deposit_request.metadata"]
+ if metadata is not None:
+ json.dumps(metadata).encode() # check it's valid
+ if "@xmlns" in metadata:
+ assert metadata["@xmlns"] == ATOM_NS
+ assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
+ format = NEW_DEPOSIT_FORMAT
+ else:
+ assert "{http://www.w3.org/2005/Atom}id" in metadata
+ assert (
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata
+ or "{http://www.w3.org/2005/Atom}author" in metadata
+ )
+ format = OLD_DEPOSIT_FORMAT
+ metadata_entries.append((date, format, metadata))
+
+ if discovery_date is None:
+ discovery_date = max(dates)
+
+ # Sanity checks to make sure deposit requests are consistent with each other
+ assert len(metadata_entries) >= 1, deposit_id
+ assert len(provider_urls) == 1, f"expected 1 provider url, got {provider_urls}"
+ (provider_url,) = provider_urls
+ assert len(swhids) == 1
+ (swhid,) = swhids
+ assert (
+ len(external_identifiers) == 1
+ ), f"expected 1 external identifier, got {external_identifiers}"
+ (external_identifier,) = external_identifiers
+
+ # compute the origin from the external_identifier if we don't have one
+ if origin is None:
+ origin = f"{provider_url.strip('/')}/{external_identifier}"
+
+ # explicit list of mistakes that happened in the past, but shouldn't
+ # happen again:
+ if origin == "https://hal.archives-ouvertes.fr/hal-01588781":
+ # deposit id 75
+ origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781"
+ elif origin == "https://hal.archives-ouvertes.fr/hal-01588782":
+ # deposit id 76
+ origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782"
+ elif origin == "https://hal.archives-ouvertes.fr/hal-01592430":
+ # deposit id 143
+ origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592430"
+ elif origin == "https://hal.archives-ouvertes.fr/hal-01588927":
+ origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927"
+ elif origin == "https://hal.archives-ouvertes.fr/hal-01593875":
+ # deposit id 175
+ origin = "https://hal-preprod.archives-ouvertes.fr/hal-01593875"
+ elif deposit_id == 160:
+ assert origin == "https://www.softwareheritage.org/je-suis-gpl", origin
+ origin = "https://forge.softwareheritage.org/source/jesuisgpl/"
+ elif origin == "https://hal.archives-ouvertes.fr/hal-01588942":
+ # deposit id 90
+ origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942"
+ elif origin == "https://hal.archives-ouvertes.fr/hal-01592499":
+ # deposit id 162
+ origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592499"
+ elif origin == "https://hal.archives-ouvertes.fr/hal-01588935":
+ # deposit id 89
+ origin = "https://hal-preprod.archives-ouvertes.fr/hal-01588935"
+
+ assert_origin_exists(storage, origin)
+
+ # check the origin we computed matches the one in the deposit db
+ swhid_origin = parse_swhid(swhid).metadata["origin"]
+ if origin is not None:
+ # explicit list of mistakes that happened in the past, but shouldn't
+ # happen again:
+ exceptions = [
+ (
+ # deposit id 229
+ "https://hal.archives-ouvertes.fr/hal-01243573",
+ "https://hal-test.archives-ouvertes.fr/hal-01243573",
+ ),
+ (
+ # deposit id 199
+ "https://hal.archives-ouvertes.fr/hal-01243065",
+ "https://hal-test.archives-ouvertes.fr/hal-01243065",
+ ),
+ (
+ # deposit id 164
+ "https://hal.archives-ouvertes.fr/hal-01593855",
+ "https://hal-preprod.archives-ouvertes.fr/hal-01593855",
+ ),
+ ]
+ if (origin, swhid_origin) not in exceptions:
+ assert origin == swhid_origin, (
+ f"the origin we guessed from the deposit db or revision ({origin}) "
+ f"doesn't match the one in the deposit db's SWHID ({swhid})"
+ )
+
+ authority = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT, url=provider_url, metadata={},
+ )
+
+ for (date, format, metadata) in metadata_entries:
+ load_metadata(
+ storage,
+ row["id"],
+ date,
+ metadata,
+ format,
+ authority=authority,
+ origin=origin,
+ dry_run=dry_run,
+ )
+
+ return (origin, discovery_date)
+
+
+def handle_row(row: Dict[str, Any], storage, deposit_cur, dry_run: bool):
+ type_ = row["type"]
+
+ # default date in case we can't find a better one
+ discovery_date = row["date"] or row["committer_date"]
+
+ metadata = row["metadata"]
+
+ if metadata is None:
+ return
+
+ if type_ == "dsc":
+ origin = None # TODO: I can't find how to get it reliably
+
+ # TODO: the debian loader writes the changelog date as the revision's
+ # author date and committer date. Instead, we should use the visit's date,
+ # but I cannot find a way to reliably get it without the origin
+
+ if "extrinsic" in metadata:
+ extrinsic_files = metadata["extrinsic"]["raw"]["files"]
+ for artifact_entry in metadata["original_artifact"]:
+ extrinsic_file = extrinsic_files[artifact_entry["filename"]]
+ for key in ("sha256",):
+ assert artifact_entry["checksums"][key] == extrinsic_file[key]
+ artifact_entry["url"] = extrinsic_file["uri"]
+ del metadata["extrinsic"]
+
+ elif type_ == "tar":
+ provider = metadata.get("extrinsic", {}).get("provider")
+ if provider is not None:
+ # This is the format all the package loaders currently write, and
+ # it is the easiest, thanks to the 'provider' and 'when' fields,
+ # which have all the information we need to tell them apart easily
+ # and generate accurate metadata
+
+ discovery_date = iso8601.parse_date(metadata["extrinsic"]["when"])
+
+ # New versions of the loaders write the provider; use it.
+ if provider.startswith("https://replicate.npmjs.com/"):
+ # npm loader format 1
+
+ parsed_url = urlparse(provider)
+ assert re.match("^/[^/]+/?$", parsed_url.path), parsed_url
+ package_name = unquote(parsed_url.path.strip("/"))
+ origin = "https://www.npmjs.com/package/" + package_name
+ assert_origin_exists(storage, origin)
+
+ load_metadata(
+ storage,
+ row["id"],
+ discovery_date,
+ metadata["extrinsic"]["raw"],
+ NPM_FORMAT,
+ authority=AUTHORITIES["npmjs"],
+ origin=origin,
+ dry_run=dry_run,
+ )
+ del metadata["extrinsic"]
+
+ elif provider.startswith("https://pypi.org/"):
+ # pypi loader format 1
+
+ match = re.match(
+ "https://pypi.org/pypi/(?P<project_name>.*)/json", provider
+ )
+ assert match, f"unexpected provider URL format: {provider}"
+ project_name = match.group("project_name")
+ origin = f"https://pypi.org/project/{project_name}/"
+ assert_origin_exists(storage, origin)
+
+ load_metadata(
+ storage,
+ row["id"],
+ discovery_date,
+ metadata["extrinsic"]["raw"],
+ PYPI_FORMAT,
+ authority=AUTHORITIES["pypi"],
+ origin=origin,
+ dry_run=dry_run,
+ )
+ del metadata["extrinsic"]
+
+ elif provider.startswith("https://cran.r-project.org/"):
+ # cran loader
+
+ provider = metadata["extrinsic"]["provider"]
+ if provider.startswith("https://cran.r-project.org/package="):
+ origin = metadata["extrinsic"]["provider"]
+ else:
+ package_name = cran_package_from_url(provider)
+ origin = f"https://cran.r-project.org/package={package_name}"
+ # TODO https://forge.softwareheritage.org/T2536
+ assert origin is not None
+ if (
+ hashlib.sha1(origin.encode()).digest() not in _origins
+ and storage.origin_get([origin])[0] is None
+ ):
+ print("MISSING CRAN ORIGIN", hash_to_hex(row["id"]), origin)
+ return
+
+ raw_extrinsic_metadata = metadata["extrinsic"]["raw"]
+
+ # this is actually intrinsic, ignore it
+ del raw_extrinsic_metadata["version"]
+
+ # Copy the URL to the original_artifacts metadata
+ assert len(metadata["original_artifact"]) == 1
+ assert "url" not in metadata["original_artifact"][0]
+ metadata["original_artifact"][0]["url"] = raw_extrinsic_metadata["url"]
+ del raw_extrinsic_metadata["url"]
+
+ assert (
+ raw_extrinsic_metadata == {}
+ ), f"Unexpected metadata keys: {list(raw_extrinsic_metadata)}"
+
+ del metadata["extrinsic"]
+
+ elif provider.startswith("https://nix-community.github.io/nixpkgs-swh/"):
+ # nixguix loader
+ origin = provider
+ assert_origin_exists(storage, origin)
+
+ authority = MetadataAuthority(
+ type=MetadataAuthorityType.FORGE, url=provider, metadata={},
+ )
+ assert row["date"] is None # the nixguix loader does not write dates
+
+ load_metadata(
+ storage,
+ row["id"],
+ discovery_date,
+ metadata["extrinsic"]["raw"],
+ NIXGUIX_FORMAT,
+ authority=authority,
+ origin=origin,
+ dry_run=dry_run,
+ )
+ del metadata["extrinsic"]
+
+ elif provider.startswith("https://ftp.gnu.org/"):
+ # archive loader format 1
+
+ origin = provider
+ assert_origin_exists(storage, origin)
+
+ assert len(metadata["original_artifact"]) == 1
+ metadata["original_artifact"][0]["url"] = metadata["extrinsic"]["raw"][
+ "url"
+ ]
+
+ # Remove duplicate keys of original_artifacts
+ for key in ("url", "time", "length", "version", "filename"):
+ del metadata["extrinsic"]["raw"][key]
+
+ assert metadata["extrinsic"]["raw"] == {}
+ del metadata["extrinsic"]
+
+ elif provider.startswith("https://deposit.softwareheritage.org/"):
+ origin = metadata["extrinsic"]["raw"]["origin"]["url"]
+ assert_origin_exists(storage, origin)
+
+ if "@xmlns" in metadata:
+ assert metadata["@xmlns"] == ATOM_NS
+ assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
+ assert "intrinsic" not in metadata
+ assert "extra_headers" not in metadata
+
+ # deposit loader format 1
+ # in this case, the metadata seems to be both directly in metadata
+ # and in metadata["extrinsic"]["raw"]["metadata"]
+
+ (origin, discovery_date) = handle_deposit_row(
+ row, discovery_date, origin, storage, deposit_cur, dry_run
+ )
+
+ remove_atom_codemeta_metadata_with_xmlns(metadata)
+ if "client" in metadata:
+ del metadata["client"]
+ del metadata["extrinsic"]
+ else:
+ # deposit loader format 2
+ actual_metadata = metadata["extrinsic"]["raw"]["origin_metadata"][
+ "metadata"
+ ]
+ if "@xmlns" in actual_metadata:
+ assert actual_metadata["@xmlns"] == ATOM_NS
+ assert actual_metadata["@xmlns:codemeta"] in (
+ CODEMETA_NS,
+ [CODEMETA_NS],
+ )
+ else:
+ assert "{http://www.w3.org/2005/Atom}id" in actual_metadata
+ assert (
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author"
+ in actual_metadata
+ )
+
+ (origin, discovery_date) = handle_deposit_row(
+ row, discovery_date, origin, storage, deposit_cur, dry_run
+ )
+
+ del metadata["extrinsic"]
+ else:
+ assert False, f"unknown provider {provider}"
+
+ # Older versions don't write the provider; use heuristics instead.
+ elif (
+ metadata.get("package_source", {})
+ .get("url", "")
+ .startswith("https://registry.npmjs.org/")
+ ):
+ # npm loader format 2
+
+ package_source_url = metadata["package_source"]["url"]
+ package_name = npm_package_from_source_url(package_source_url)
+ origin = "https://www.npmjs.com/package/" + package_name
+ assert_origin_exists(storage, origin)
+
+ load_metadata(
+ storage,
+ row["id"],
+ discovery_date,
+ metadata["package"],
+ NPM_FORMAT,
+ authority=AUTHORITIES["npmjs"],
+ origin=origin,
+ dry_run=dry_run,
+ )
+ del metadata["package"]
+
+ assert "original_artifact" not in metadata
+
+ # rebuild an "original_artifact"-like metadata dict from what we
+ # can salvage of "package_source"
+ package_source_metadata = metadata["package_source"]
+ keep_keys = {"blake2s256", "filename", "sha1", "sha256", "url"}
+ discard_keys = {
+ "date", # is equal to the revision date
+ "name", # was loaded above
+ "version", # same
+ }
+ assert (
+ set(package_source_metadata) == keep_keys | discard_keys
+ ), package_source_metadata
+
+ # will be loaded below
+ metadata["original_artifact"] = [
+ {
+ "filename": package_source_metadata["filename"],
+ "checksums": {
+ "sha1": package_source_metadata["sha1"],
+ "sha256": package_source_metadata["sha256"],
+ "blake2s256": package_source_metadata["blake2s256"],
+ },
+ "url": package_source_metadata["url"],
+ }
+ ]
+ del metadata["package_source"]
+
+ elif "@xmlns" in metadata:
+ assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
+ assert "intrinsic" not in metadata
+ assert "extra_headers" not in metadata
+
+ # deposit loader format 3
+
+ if row["message"] == b"swh: Deposit 159 in collection swh":
+ # There is no deposit 159 in the deposit DB, for some reason
+ assert (
+ hash_to_hex(row["id"]) == "8e9cee14a6ad39bca4347077b87fb5bbd8953bb1"
+ )
+ return
+ elif row["message"] == b"hal: Deposit 342 in collection hal":
+ # They have status 'failed' and no swhid
+ return
+
+ origin = None # TODO
+ discovery_date = None # TODO
+
+ (origin, discovery_date) = handle_deposit_row(
+ row, discovery_date, origin, storage, deposit_cur, dry_run
+ )
+ remove_atom_codemeta_metadata_with_xmlns(metadata)
+ if "client" in metadata:
+ del metadata["client"] # found in the deposit db
+ if "committer" in metadata:
+ del metadata["committer"] # found on the revision object
+
+ elif "{http://www.w3.org/2005/Atom}id" in metadata:
+ assert (
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata
+ or "{http://www.w3.org/2005/Atom}author" in metadata
+ )
+ assert "intrinsic" not in metadata
+ assert "extra_headers" not in metadata
+
+ # deposit loader format 4
+
+ origin = None
+ discovery_date = None # TODO
+
+ (origin, discovery_date) = handle_deposit_row(
+ row, discovery_date, origin, storage, deposit_cur, dry_run
+ )
+ remove_atom_codemeta_metadata_without_xmlns(metadata)
+
+ elif hash_to_hex(row["id"]) == "a86747d201ab8f8657d145df4376676d5e47cf9f":
+ # deposit 91, is missing "{http://www.w3.org/2005/Atom}id" for some
+ # reason, and has an invalid origin
+ return
+
+ elif (
+ isinstance(metadata.get("original_artifact"), dict)
+ and metadata["original_artifact"]["url"].startswith(
+ "https://files.pythonhosted.org/"
+ )
+ ) or (
+ isinstance(metadata.get("original_artifact"), list)
+ and len(metadata.get("original_artifact")) == 1
+ and metadata["original_artifact"][0]
+ .get("url", "")
+ .startswith("https://files.pythonhosted.org/")
+ ):
+ if isinstance(metadata.get("original_artifact"), dict):
+ metadata["original_artifact"] = [metadata["original_artifact"]]
+
+ assert len(metadata["original_artifact"]) == 1
+
+ # it's tempting here to do this:
+ #
+ # project_name = pypi_project_from_filename(
+ # metadata["original_artifact"][0]["filename"]
+ # )
+ # origin = f"https://pypi.org/project/{project_name}/"
+ # assert_origin_exists(storage, origin)
+ #
+ # but unfortunately, the filename is user-provided, and doesn't
+ # necessarily match the package name on pypi.
+
+ # TODO: on second thoughts, I think we can use this as a heuristic,
+ # then double-check by listing visits and snapshots from the origin;
+ # it should work for most packages.
+
+ origin = None
+
+ if "project" in metadata:
+ # pypi loader format 2
+
+ # same reason as above, we can't do this:
+ # if metadata["project"]:
+ # assert metadata["project"]["name"] == project_name
+
+ load_metadata(
+ storage,
+ row["id"],
+ discovery_date,
+ metadata["project"],
+ PYPI_FORMAT,
+ authority=AUTHORITIES["pypi"],
+ origin=origin,
+ dry_run=dry_run,
+ )
+ del metadata["project"]
+ else:
+ assert set(metadata) == {"original_artifact"}, set(metadata)
+ # pypi loader format 3
+ pass # nothing to do, there's no metadata
+
+ elif row["message"] == b"synthetic revision message":
+ assert isinstance(metadata["original_artifact"], list), metadata
+ assert not any("url" in d for d in metadata["original_artifact"])
+
+ # archive loader format 2
+
+ origin = None
+
+ elif deposit_revision_message_re.match(row["message"]):
+ # deposit without metadata in the revision
+
+ assert set(metadata) == {"original_artifact"}, metadata
+
+ origin = None # TODO
+ discovery_date = None
+
+ (origin, discovery_date) = handle_deposit_row(
+ row, discovery_date, origin, storage, deposit_cur, dry_run
+ )
+ else:
+ assert False, f"Unable to detect type of metadata for row: {row}"
+
+ # Ignore common intrinsic metadata keys
+ for key in ("intrinsic", "extra_headers"):
+ if key in metadata:
+ del metadata[key]
+
+ # Ignore loader-specific intrinsic metadata keys
+ if type_ == "hg":
+ del metadata["node"]
+ elif type_ == "dsc":
+ if "package_info" in metadata:
+ del metadata["package_info"]
+
+ if "original_artifact" in metadata:
+ for original_artifact in metadata["original_artifact"]:
+ # Rename keys to the expected format of original-artifacts-json.
+ rename_keys = [
+ ("name", "filename"), # eg. from old Debian loader
+ ("size", "length"), # eg. from old PyPI loader
+ ]
+ for (old_name, new_name) in rename_keys:
+ if old_name in original_artifact:
+ assert new_name not in original_artifact
+ original_artifact[new_name] = original_artifact.pop(old_name)
+
+ # Move the checksums to their own subdict, which is the expected format
+ # of original-artifacts-json.
+ if "sha1" in original_artifact:
+ assert "checksums" not in original_artifact
+ original_artifact["checksums"] = {}
+ for key in ("sha1", "sha256", "sha1_git", "blake2s256"):
+ if key in original_artifact:
+ original_artifact["checksums"][key] = original_artifact.pop(key)
+
+ if "date" in original_artifact:
+ # The information comes from the package repository rather than SWH,
+ # so it shouldn't be in the 'original-artifacts' metadata
+ # (which has SWH as authority).
+ # Moreover, it's not very useful information, so let's just drop it.
+ del original_artifact["date"]
+
+ allowed_keys = {
+ "checksums",
+ "filename",
+ "length",
+ "url",
+ "archive_type",
+ }
+ assert set(original_artifact) <= allowed_keys, set(original_artifact)
+
+ load_metadata(
+ storage,
+ row["id"],
+ discovery_date,
+ metadata["original_artifact"],
+ ORIGINAL_ARTIFACT_FORMAT,
+ authority=AUTHORITIES["swh"],
+ origin=origin,
+ dry_run=dry_run,
+ )
+ del metadata["original_artifact"]
+
+ assert metadata == {}, (
+ f"remaining metadata keys for {row['id'].hex()} (type: {row['type']}): "
+ f"{metadata}"
+ )
+
+
+def create_fetchers(db):
+ with db.cursor() as cur:
+ cur.execute(
+ """
+ INSERT INTO metadata_fetcher (name, version, metadata)
+ VALUES (%s, %s, %s)
+ ON CONFLICT DO NOTHING
+ """,
+ (FETCHER.name, FETCHER.version, FETCHER.metadata),
+ )
+
+
+def main(storage_dbconn, storage_url, deposit_dbconn, first_id, dry_run):
+ storage_db = BaseDb.connect(storage_dbconn)
+ deposit_db = BaseDb.connect(deposit_dbconn)
+ storage = get_storage("remote", url=storage_url)
+
+ if not dry_run:
+ create_fetchers(storage_db)
+ # Not creating authorities, as the loaders are presumably already running
+ # and created them already.
+ # This also helps make sure this script doesn't accidentally create
+ # authorities that differ from what the loaders use.
+
+ total_rows = 0
+ with storage_db.cursor() as read_cur:
+ with deposit_db.cursor() as deposit_cur:
+ after_id = first_id
+ while True:
+ read_cur.execute(
+ f"SELECT {', '.join(REVISION_COLS)} FROM revision "
+ f"WHERE id > %s AND metadata IS NOT NULL ORDER BY id LIMIT 1000",
+ (after_id,),
+ )
+ new_rows = 0
+ for row in read_cur:
+ row_d = dict(zip(REVISION_COLS, row))
+ handle_row(row_d, storage, deposit_cur, dry_run)
+ new_rows += 1
+
+ if new_rows == 0:
+ break
+
+ after_id = row_d["id"]
+
+ total_rows += new_rows
+ percents = (
+ int.from_bytes(after_id[0:4], byteorder="big") * 100 / (1 << 32)
+ )
+ print(
+ f"Migrated {total_rows/1000000.:.2f}M rows "
+ f"(~{percents:.1f}%, last revision: {after_id.hex()})"
+ )
+
+
+if __name__ == "__main__":
+ if len(sys.argv) == 4:
+ (_, storage_dbconn, storage_url, deposit_dbconn) = sys.argv
+ first_id = "00" * 20
+ elif len(sys.argv) == 5:
+ (_, storage_dbconn, storage_url, deposit_dbconn, first_id) = sys.argv
+ else:
+ print(
+ f"Syntax: {sys.argv[0]} <storage_dbconn> <storage_url> "
+ f"<deposit_dbconn> [<first id>]"
+ )
+ exit(1)
+
+ if os.path.isfile("./origins.txt"):
+ # You can generate this file with:
+ # psql service=swh-replica \
+ # -c "\copy (select digest(url, 'sha1') from origin) to stdout" \
+ # | pv -l > origins.txt
+ print("Loading origins...")
+ with open("./origins.txt") as fd:
+ for line in fd:
+ digest = line.strip()[3:]
+ _origins.add(bytes.fromhex(digest))
+ print("Done loading origins.")
+
+ main(storage_dbconn, storage_url, deposit_dbconn, bytes.fromhex(first_id), True)
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py b/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py
@@ -0,0 +1,221 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ Origin,
+ RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import handle_row, cran_package_from_url
+
+
+# Fetcher the tests below expect on every metadata object passed to
+# storage.raw_extrinsic_metadata_add.
+FETCHER = MetadataFetcher(
+ name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+# Authority the tests below expect on "original-artifacts-json" metadata objects.
+SWH_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+
+
+def test_cran_package_from_url():
+    """Check the package name extracted from CRAN source tarball URLs."""
+    # Maps each tarball URL to the package name expected for it; the second
+    # entry has a dot inside the package name itself.
+    expected_packages = {
+        "https://cran.r-project.org/src/contrib/shapeR_0.1-5.tar.gz": "shapeR",
+        "https://cran.r-project.org/src/contrib/hot.deck_1.1.tar.gz": "hot.deck",
+    }
+
+    for url, package in expected_packages.items():
+        assert cran_package_from_url(url) == package
+
+
+def test_cran():
+    """Tests a CRAN revision that has both a revision date and extrinsic metadata.
+
+    The only expected write is one "original-artifacts-json" object: the
+    artifact entry of the revision, extended with the tarball URL found in the
+    extrinsic metadata, and dated with the extrinsic "when" timestamp.
+    """
+    source_original_artifacts = [
+        {
+            "length": 170623,
+            "filename": "ExtremeRisks_0.0.3.tar.gz",
+            "checksums": {
+                "sha1": "f2f19fc0f24b66b5ea9413366c632f3c229f7f3f",
+                "sha256": "6f232556313019809dde3554149a1399bb1901a366b4965af49dc007d01945c9",
+            },
+        }
+    ]
+    dest_original_artifacts = [
+        {
+            "length": 170623,
+            "filename": "ExtremeRisks_0.0.3.tar.gz",
+            "checksums": {
+                "sha1": "f2f19fc0f24b66b5ea9413366c632f3c229f7f3f",
+                "sha256": "6f232556313019809dde3554149a1399bb1901a366b4965af49dc007d01945c9",
+            },
+            "url": "https://cran.r-project.org/src/contrib/ExtremeRisks_0.0.3.tar.gz",
+        }
+    ]
+
+    row = {
+        "id": b"\x00\x03a\xaa3\x84,\xbd\xea_\xa6\xe7}\xb6\x96\xb97\xeb\xd2i",
+        "date": datetime.datetime(2020, 5, 5, 0, 0, tzinfo=datetime.timezone.utc,),
+        "committer_date": datetime.datetime(
+            2020, 5, 5, 0, 0, tzinfo=datetime.timezone.utc,
+        ),
+        "type": "tar",
+        "message": b"0.0.3",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "url": "https://cran.r-project.org/src/contrib/ExtremeRisks_0.0.3.tar.gz",
+                    "version": "0.0.3",
+                },
+                "when": "2020-05-07T15:27:38.652281+00:00",
+                "provider": "https://cran.r-project.org/package=ExtremeRisks",
+            },
+            "intrinsic": {
+                "raw": {
+                    "URL": "mypage.unibocconi.it/simonepadoan/",
+                    "Date": "2020-05-05",
+                    "Title": "Extreme Risk Measures",
+                    "Author": "Simone Padoan [cre, aut],\n Gilles Stupfler [aut]",
+                    # ...
+                    "Date/Publication": "2020-05-07 10:20:02 UTC",
+                },
+                "tool": "DESCRIPTION",
+            },
+            "original_artifact": source_original_artifacts,
+        },
+    }
+
+    origin_url = "https://cran.r-project.org/package=ExtremeRisks"
+
+    storage = Mock()
+
+    def origin_get(urls):
+        # The origin lookup must be made for this package's origin only.
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    # handle_row gets a deep copy, as in the other tests of this module, so the
+    # fixtures declared above cannot be altered by whatever it does to the row.
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:000361aa33842cbdea5fa6e77db696b937ebd269"
+                    ),
+                    # Same instant as the extrinsic "when" field of the row.
+                    discovery_date=datetime.datetime(
+                        2020, 5, 7, 15, 27, 38, 652281, tzinfo=datetime.timezone.utc,
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(dest_original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
+
+
+def test_cran_without_revision_date():
+    """Tests a CRAN revision with a date in the metadata but not as revision date"""
+    origin_url = "https://cran.r-project.org/package=gofgamma"
+    tarball_url = "https://cran.r-project.org/src/contrib/gofgamma_1.0.tar.gz"
+
+    # Artifact list as stored in the revision (no URL yet).
+    artifacts_in_revision = [
+        {
+            "length": 8018,
+            "filename": "gofgamma_1.0.tar.gz",
+            "checksums": {
+                "sha1": "58f2993140f9e9e1a136554f0af0174a252f2c7b",
+                "sha256": "55408f004642b5043bb01de831a7e7a0b9f24a30cb0151e70c2d37abdc508d03",
+            },
+        }
+    ]
+    # Artifact list expected in the metadata storage (URL appended).
+    migrated_artifacts = [
+        {
+            "length": 8018,
+            "filename": "gofgamma_1.0.tar.gz",
+            "checksums": {
+                "sha1": "58f2993140f9e9e1a136554f0af0174a252f2c7b",
+                "sha256": "55408f004642b5043bb01de831a7e7a0b9f24a30cb0151e70c2d37abdc508d03",
+            },
+            "url": tarball_url,
+        }
+    ]
+
+    revision_row = {
+        "id": b'\x00\x00\xd4\xef^\x16a"\xae\xe6\x86*\xd3\x8a\x18\xceS\x86\xcc>',
+        "date": None,
+        "committer_date": None,
+        "type": "tar",
+        "message": b"1.0",
+        "metadata": {
+            "extrinsic": {
+                "raw": {"url": tarball_url, "version": "1.0",},
+                "when": "2020-04-30T11:01:57.832481+00:00",
+                "provider": origin_url,
+            },
+            "intrinsic": {
+                "raw": {
+                    "Type": "Package",
+                    "Title": "Goodness-of-Fit Tests for the Gamma Distribution",
+                    "Author": "Lucas Butsch [aut],\n Bruno Ebner [aut, cre],\n Steffen Betsch [aut]",
+                    # ...
+                },
+                "tool": "DESCRIPTION",
+            },
+            "original_artifact": artifacts_in_revision,
+        },
+    }
+
+    def fake_origin_get(urls):
+        # Only the package origin may be looked up.
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage = Mock()
+    storage.origin_get.side_effect = fake_origin_get
+
+    handle_row(copy.deepcopy(revision_row), storage, None, dry_run=False)
+
+    # The discovery date matches the extrinsic "when" field of the row.
+    expected_metadata = RawExtrinsicMetadata(
+        type=MetadataTargetType.REVISION,
+        id=parse_swhid("swh:1:rev:0000d4ef5e166122aee6862ad38a18ce5386cc3e"),
+        discovery_date=datetime.datetime(
+            2020, 4, 30, 11, 1, 57, 832481, tzinfo=datetime.timezone.utc,
+        ),
+        authority=SWH_AUTHORITY,
+        fetcher=FETCHER,
+        format="original-artifacts-json",
+        metadata=json.dumps(migrated_artifacts).encode(),
+        origin=origin_url,
+    )
+    expected_calls = [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add([expected_metadata]),
+    ]
+    assert storage.method_calls == expected_calls
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py
@@ -0,0 +1,273 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import handle_row
+
+
+# Fetcher the tests below expect on every metadata object passed to
+# storage.raw_extrinsic_metadata_add.
+FETCHER = MetadataFetcher(
+ name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+# Authority the tests below expect on "original-artifacts-json" metadata objects.
+SWH_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+
+
+def test_debian_with_extrinsic():
+    """Tests a Debian revision carrying "extrinsic" metadata with per-file URIs.
+
+    The migrated "original-artifacts-json" entries are expected to carry a
+    "url" key, and the metadata object to be dated with the revision date.
+    """
+    revision_date = datetime.datetime(
+        2020, 1, 26, 22, 3, 24, tzinfo=datetime.timezone.utc,
+    )
+
+    # Artifact entries expected in the metadata storage.
+    artifacts_with_urls = [
+        {
+            "length": 2936,
+            "filename": "kalgebra_19.12.1-1.dsc",
+            "checksums": {
+                "sha1": "f869e9f1155b1ee6d28ae3b40060570152a358cd",
+                "sha256": "75f77150aefdaa4bcf8bc5b1e9b8b90b5cb1651b76a068c5e58e5b83658d5d11",
+            },
+            "url": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc",
+        },
+        {
+            "length": 1156408,
+            "filename": "kalgebra_19.12.1.orig.tar.xz",
+            "checksums": {
+                "sha1": "e496032962212983a5359aebadfe13c4026fd45c",
+                "sha256": "49d623186800eb8f6fbb91eb43fb14dff78e112624c9cda6b331d494d610b16a",
+            },
+            "url": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz",
+        },
+        {
+            "length": 10044,
+            "filename": "kalgebra_19.12.1-1.debian.tar.xz",
+            "checksums": {
+                "sha1": "b518bfc2ac708b40577c595bd539faa8b84572db",
+                "sha256": "1a30acd2699c3769da302f7a0c63a7d7b060f80925b38c8c43ce3bec92744d67",
+            },
+            "url": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.debian.tar.xz",
+        },
+        {
+            "length": 488,
+            "filename": "kalgebra_19.12.1.orig.tar.xz.asc",
+            "checksums": {
+                "sha1": "ff53a5c21c1aef2b9caa38a02fa3488f43df4c20",
+                "sha256": "a37e0b95bb1f16b19b0587bc5d3b99ba63a195d7f6335c4a359122ad96d682dd",
+            },
+            "url": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz.asc",
+        },
+    ]
+
+    # What the revision stores: the same entries, minus their "url" key.
+    artifacts_in_revision = []
+    for artifact in artifacts_with_urls:
+        without_url = dict(artifact)
+        without_url.pop("url", None)
+        artifacts_in_revision.append(without_url)
+
+    revision_row = {
+        "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
+        "date": revision_date,
+        "date_offset": 60,
+        "type": "dsc",
+        "message": b"Synthetic revision for Debian source package kalgebra version 4:19.12.1-1",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "id": 2718802,
+                    "name": "kalgebra",
+                    "files": {
+                        "kalgebra_19.12.1-1.dsc": {
+                            "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc",
+                            "name": "kalgebra_19.12.1-1.dsc",
+                            "size": 2936,
+                            "md5sum": "fd28f604d4cc31a0a305543230f1622a",
+                            "sha256": "75f77150aefdaa4bcf8bc5b1e9b8b90b5cb1651b76a068c5e58e5b83658d5d11",
+                        },
+                        "kalgebra_19.12.1.orig.tar.xz": {
+                            "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz",
+                            "name": "kalgebra_19.12.1.orig.tar.xz",
+                            "size": 1156408,
+                            "md5sum": "34e09ed152da762d53101ea33634712b",
+                            "sha256": "49d623186800eb8f6fbb91eb43fb14dff78e112624c9cda6b331d494d610b16a",
+                        },
+                        "kalgebra_19.12.1-1.debian.tar.xz": {
+                            "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.debian.tar.xz",
+                            "name": "kalgebra_19.12.1-1.debian.tar.xz",
+                            "size": 10044,
+                            "md5sum": "4f639f36143898d97d044f273f038e58",
+                            "sha256": "1a30acd2699c3769da302f7a0c63a7d7b060f80925b38c8c43ce3bec92744d67",
+                        },
+                        "kalgebra_19.12.1.orig.tar.xz.asc": {
+                            "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz.asc",
+                            "name": "kalgebra_19.12.1.orig.tar.xz.asc",
+                            "size": 488,
+                            "md5sum": "3c29291e4e6f0c294de80feb8e9fce4c",
+                            "sha256": "a37e0b95bb1f16b19b0587bc5d3b99ba63a195d7f6335c4a359122ad96d682dd",
+                        },
+                    },
+                    "version": "4:19.12.1-1",
+                    "revision_id": None,
+                },
+                "when": "2020-01-27T19:32:03.925498+00:00",
+                "provider": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc",
+            },
+            "intrinsic": {
+                "raw": {
+                    "name": "kalgebra",
+                    "version": "4:19.12.1-1",
+                    # ...
+                },
+                "tool": "dsc",
+            },
+            "original_artifact": artifacts_in_revision,
+        },
+    }
+
+    storage = Mock()
+    handle_row(copy.deepcopy(revision_row), storage, None, dry_run=False)
+
+    expected_metadata = RawExtrinsicMetadata(
+        type=MetadataTargetType.REVISION,
+        id=parse_swhid("swh:1:rev:0000036c311ef33a281b05688f6eadcfc0943aee"),
+        discovery_date=revision_date,
+        authority=SWH_AUTHORITY,
+        fetcher=FETCHER,
+        format="original-artifacts-json",
+        metadata=json.dumps(artifacts_with_urls).encode(),
+    )
+    # A single write is expected, and no origin lookup.
+    assert storage.method_calls == [
+        call.raw_extrinsic_metadata_add([expected_metadata])
+    ]
+
+
+def test_debian_without_extrinsic():
+    """Tests a Debian revision that only has "package_info" (no "extrinsic" key).
+
+    The flat artifact entries of the revision are expected to be migrated to
+    the filename/checksums layout, without any URL, and the metadata object to
+    be dated with the revision date.
+    """
+    revision_date = datetime.datetime(
+        2011, 3, 31, 20, 17, 41, tzinfo=datetime.timezone.utc
+    )
+
+    # Flat layout, as stored in the revision.
+    artifacts_in_revision = [
+        {
+            "name": "pymongo_1.10-1.dsc",
+            "sha1": "81877c1ae4406c2519b9cc9c4557cf6b0775a241",
+            "length": 99,
+            "sha256": "40269a73f38ee4c2f9cc021f1d5d091cc59ca6e778c339684b7be030e29e282f",
+            "sha1_git": "0ac7bdb8e4d10926c5d3e51baa2be7bb29a3966b",
+        },
+        {
+            "name": "pymongo_1.10.orig.tar.gz",
+            "sha1": "4f4c97641b86ac8f21396281bd1a7369236693c3",
+            "length": 99,
+            "sha256": "0b6bffb310782ffaeb3916c75790742ec5830c63a758fc711cd1f557eb5a4b5f",
+            "sha1_git": "19ef0adda8868520d1ef9d4164b3ace4df1d62ad",
+        },
+        {
+            "name": "pymongo_1.10-1.debian.tar.gz",
+            "sha1": "fbf378296613c8d55e043aec98896b3e50a94971",
+            "length": 99,
+            "sha256": "3970cc70fe3ba6499a9c56ba4b4c6c3782f56433d0d17d72b7a0e2ceae31b513",
+            "sha1_git": "2eea9904806050a8fda95edd5d4fa60d29c1fdec",
+        },
+    ]
+
+    # Nested layout, as expected in the metadata storage.
+    migrated_artifacts = [
+        {
+            "length": 99,
+            "filename": "pymongo_1.10-1.dsc",
+            "checksums": {
+                "sha1": "81877c1ae4406c2519b9cc9c4557cf6b0775a241",
+                "sha256": "40269a73f38ee4c2f9cc021f1d5d091cc59ca6e778c339684b7be030e29e282f",
+                "sha1_git": "0ac7bdb8e4d10926c5d3e51baa2be7bb29a3966b",
+            },
+        },
+        {
+            "length": 99,
+            "filename": "pymongo_1.10.orig.tar.gz",
+            "checksums": {
+                "sha1": "4f4c97641b86ac8f21396281bd1a7369236693c3",
+                "sha256": "0b6bffb310782ffaeb3916c75790742ec5830c63a758fc711cd1f557eb5a4b5f",
+                "sha1_git": "19ef0adda8868520d1ef9d4164b3ace4df1d62ad",
+            },
+        },
+        {
+            "length": 99,
+            "filename": "pymongo_1.10-1.debian.tar.gz",
+            "checksums": {
+                "sha1": "fbf378296613c8d55e043aec98896b3e50a94971",
+                "sha256": "3970cc70fe3ba6499a9c56ba4b4c6c3782f56433d0d17d72b7a0e2ceae31b513",
+                "sha1_git": "2eea9904806050a8fda95edd5d4fa60d29c1fdec",
+            },
+        },
+    ]
+
+    revision_row = {
+        "id": b"\x00\x00\x01\xc2\x8c\x8f\xca\x01\xb9\x04\xde\x92\xa2d\n\x86l\xe0<\xb7",
+        "date": revision_date,
+        "date_offset": 0,
+        "type": "dsc",
+        "message": b"Synthetic revision for Debian source package pymongo version 1.10-1",
+        "metadata": {
+            "package_info": {
+                "name": "pymongo",
+                "version": "1.10-1",
+                "changelog": {
+                    # ...
+                },
+                "maintainers": [
+                    {"name": "Federico Ceratto", "email": "federico.ceratto@gmail.com"},
+                    {"name": "Janos Guljas", "email": "janos@resenje.org"},
+                ],
+                "pgp_signature": {
+                    "date": "2011-03-31T21:02:44+00:00",
+                    "keyid": "2BABC6254E66E7B8450AC3E1E6AA90171392B174",
+                    "person": {"name": "David Paleino", "email": "d.paleino@gmail.com"},
+                },
+                "lister_metadata": {"id": 244296, "lister": "snapshot.debian.org"},
+            },
+            "original_artifact": artifacts_in_revision,
+        },
+    }
+
+    storage = Mock()
+    handle_row(copy.deepcopy(revision_row), storage, None, dry_run=False)
+
+    expected_metadata = RawExtrinsicMetadata(
+        type=MetadataTargetType.REVISION,
+        id=parse_swhid("swh:1:rev:000001c28c8fca01b904de92a2640a866ce03cb7"),
+        discovery_date=revision_date,
+        authority=SWH_AUTHORITY,
+        fetcher=FETCHER,
+        format="original-artifacts-json",
+        metadata=json.dumps(migrated_artifacts).encode(),
+    )
+    # A single write is expected, and no origin lookup.
+    assert storage.method_calls == [
+        call.raw_extrinsic_metadata_add([expected_metadata])
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
@@ -0,0 +1,1167 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock, MagicMock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ Origin,
+ RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import (
+ DEPOSIT_COLS,
+ handle_row,
+ cran_package_from_url,
+)
+
+
+# Fetcher the tests below expect on every metadata object passed to
+# storage.raw_extrinsic_metadata_add.
+FETCHER = MetadataFetcher(
+ name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+# Authority the tests below expect on "original-artifacts-json" metadata objects.
+SWH_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+# Deposit-client authorities; each URL equals the "deposit_client.provider_url"
+# value of the deposit rows used by the corresponding tests.
+SWH_DEPOSIT_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url="https://www.softwareheritage.org",
+ metadata={},
+)
+HAL_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url="https://hal.archives-ouvertes.fr/",
+ metadata={},
+)
+INTEL_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.DEPOSIT_CLIENT,
+ url="https://software.intel.com",
+ metadata={},
+)
+
+
+def get_mock_deposit_cur(row_dicts):
+    """Return a mock deposit-database cursor that yields ``row_dicts``.
+
+    Each dict is turned into a tuple whose values follow the order of
+    DEPOSIT_COLS. The rows are served by iterating over the returned mock.
+    """
+    cursor = MagicMock()
+    row_tuples = []
+    for row_dict in row_dicts:
+        row_tuples.append(tuple(row_dict[column] for column in DEPOSIT_COLS))
+    # side_effect holds a single iterator, so only one iter() call is served.
+    cursor.__iter__.side_effect = [iter(row_tuples)]
+    return cursor
+
+
+def test_deposit_1():
+ """Has a provider and xmlns, and the metadata is in the revision twice
+ (at the root of the metadata dict, and in
+ metadata->extrinsic->raw->origin_metadata)"""
+ # Deposit metadata, shared by reference between the revision row and the
+ # deposit rows below.
+ extrinsic_metadata = {
+ "title": "Je suis GPL",
+ "@xmlns": "http://www.w3.org/2005/Atom",
+ "client": "swh",
+ "codemeta:url": "https://forge.softwareheritage.org/source/jesuisgpl/",
+ "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+ "codemeta:author": {
+ "codemeta:name": "Stefano Zacchiroli",
+ "codemeta:jobTitle": "Maintainer",
+ },
+ "codemeta:license": {
+ "codemeta:url": "https://spdx.org/licenses/GPL-3.0-or-later.html",
+ "codemeta:name": "GNU General Public License v3.0 or later",
+ },
+ # ...
+ }
+ original_artifacts = [
+ {
+ "length": 80880,
+ "filename": "archive.zip",
+ "checksums": {
+ "sha1": "bad32a47a359e0e16ebdca2ad2dc6a771dac8f71",
+ "sha256": "182b7ee3b7b5b550e83d3bcfed029bb2f625ee760ebfe9557d5fd072bd4e22e4",
+ },
+ }
+ ]
+
+ # Revision row given to handle_row.
+ row = {
+ "id": b"\x02#\x10\xdf\x16\xfd\x9eMO\x81\xfe6\xa1B\xe8-\xb9w\xc0\x1d",
+ "date": datetime.datetime(2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"swh: Deposit 467 in collection swh",
+ "metadata": {
+ "client": "swh",
+ "extrinsic": {
+ "raw": {
+ "origin": {
+ "url": "https://www.softwareheritage.org/check-deposit-2020-03-11T11:07:18.424476",
+ "type": "deposit",
+ },
+ "branch_name": "master",
+ "origin_metadata": {
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {"sword_version": 2},
+ },
+ "metadata": extrinsic_metadata,
+ },
+ },
+ "when": "2020-03-11T11:11:36.336283+00:00",
+ "provider": "https://deposit.softwareheritage.org/1/private/467/meta/",
+ },
+ "original_artifact": original_artifacts,
+ **extrinsic_metadata,
+ },
+ }
+
+ origin_url = (
+ "https://www.softwareheritage.org/check-deposit-2020-03-11T11:07:18.424476"
+ )
+
+ swhid = (
+ f"swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea"
+ f";origin={origin_url}"
+ f";visit=swh:1:snp:14433c19dbb03ad57c86b58b53a800d6a0e32dd3"
+ f";anchor=swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d"
+ f";path=/"
+ )
+
+ # Rows served by the mocked deposit cursor: one request carrying the
+ # metadata, and one request without any (metadata is None).
+ deposit_rows = [
+ {
+ "deposit.id": 467,
+ "deposit.external_id": "check-deposit-2020-03-11T11:07:18.424476",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2020, 3, 11, 11, 7, 18, 688410, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://www.softwareheritage.org",
+ "deposit_collection.name": "swh",
+ "auth_user.username": "swh",
+ },
+ {
+ "deposit.id": 467,
+ "deposit.external_id": "check-deposit-2020-03-11T11:07:18.424476",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2020, 3, 11, 11, 7, 18, 669428, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://www.softwareheritage.org",
+ "deposit_collection.name": "swh",
+ "auth_user.username": "swh",
+ },
+ ]
+
+ storage = Mock()
+
+ def origin_get(urls):
+ # Only the deposit's origin may be looked up.
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ # The deposit cursor must have been queried, and iterated over, exactly once.
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ # First write: the deposit metadata, with the deposit client as authority
+ # and the date of the deposit request that carried the metadata.
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 3, 11, 11, 7, 18, 688410, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_DEPOSIT_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ # Second write: the original artifacts, with SWH as authority and the
+ # extrinsic "when" timestamp of the row as date.
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:022310df16fd9e4d4f81fe36a142e82db977c01d"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 3, 11, 11, 11, 36, 336283, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
+def test_deposit_2_without_xmlns():
+ """Has a provider, no xmlns, and the metadata is only in
+ metadata->extrinsic->raw->origin_metadata"""
+ # Deposit metadata whose keys carry expanded XML namespaces; shared by
+ # reference between the revision row and the deposit rows below.
+ extrinsic_metadata = {
+ "{http://www.w3.org/2005/Atom}id": "hal-01243573",
+ "{http://www.w3.org/2005/Atom}author": {
+ "{http://www.w3.org/2005/Atom}name": "HAL",
+ "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr",
+ },
+ "{http://www.w3.org/2005/Atom}client": "hal",
+ "{http://www.w3.org/2005/Atom}external_identifier": "hal-01243573",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://hal-test.archives-ouvertes.fr/hal-01243573",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": {
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter"
+ },
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}version": 1,
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifier": "10.5281/zenodo.438684",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}dateCreated": "2017-11-16T14:54:23+01:00",
+ }
+ original_artifacts = [
+ {
+ "length": 208357,
+ "filename": "archive.zip",
+ "checksums": {
+ "sha1": "fa0aec08e8a44ea144dba7ce366c8b5d66c14453",
+ "sha256": "f53c05fe947e88ce83751a93bd522b1f88478ea2e7b984c07fc7a7c68128bf87",
+ },
+ }
+ ]
+
+ # Revision row given to handle_row.
+ row = {
+ "id": b"\x01\x16\xca\xb7\x19d\xd5\x9c\x85p\xb4\xc5r\x9b(\xbd\xd6<\x9bF",
+ "date": datetime.datetime(
+ 2018, 1, 17, 12, 54, 0, 723882, tzinfo=datetime.timezone.utc
+ ),
+ "committer_date": datetime.datetime(
+ 2018, 1, 17, 12, 54, 0, 723882, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"hal: Deposit 82 in collection hal",
+ "metadata": {
+ "extrinsic": {
+ "raw": {
+ "origin": {
+ "url": "https://hal.archives-ouvertes.fr/hal-01243573",
+ "type": "deposit",
+ },
+ "origin_metadata": {
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {"sword_version": 2},
+ },
+ "metadata": extrinsic_metadata,
+ "provider": {
+ "metadata": {},
+ "provider_url": "https://hal.archives-ouvertes.fr/",
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ },
+ },
+ },
+ "when": "2020-05-15T14:27:21.462270+00:00",
+ "provider": "https://deposit.softwareheritage.org/1/private/82/meta/",
+ },
+ "original_artifact": original_artifacts,
+ },
+ }
+
+ swhid = (
+ "swh:1:dir:e04b2a7b8a8838da0693e9fd992a10d6fd211b50"
+ ";origin=https://hal.archives-ouvertes.fr/hal-01243573"
+ ";visit=swh:1:snp:abc9ae594245a740235b6c039f044352a5f723ec"
+ ";anchor=swh:1:rev:0116cab71964d59c8570b4c5729b28bdd63c9b46"
+ ";path=/"
+ )
+
+ # Rows served by the mocked deposit cursor: here the request without
+ # metadata comes first, and the one carrying the metadata second.
+ deposit_rows = [
+ {
+ "deposit.id": 82,
+ "deposit.external_id": "hal-01243573",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2018, 1, 17, 12, 54, 1, 533972, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ {
+ "deposit.id": 82,
+ "deposit.external_id": "hal-01243573",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2018, 1, 17, 12, 54, 0, 413748, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ ]
+
+ origin_url = "https://hal.archives-ouvertes.fr/hal-01243573"
+
+ storage = Mock()
+
+ def origin_get(urls):
+ # Only the deposit's origin may be looked up.
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ # The deposit cursor must have been queried, and iterated over, exactly once.
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ # First write: the deposit metadata, with the deposit client as authority
+ # and the date of the deposit request that carried the metadata.
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:0116cab71964d59c8570b4c5729b28bdd63c9b46"
+ ),
+ discovery_date=datetime.datetime(
+ 2018, 1, 17, 12, 54, 0, 413748, tzinfo=datetime.timezone.utc
+ ),
+ authority=HAL_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ # Second write: the original artifacts, with SWH as authority and the
+ # extrinsic "when" timestamp of the row as date.
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:0116cab71964d59c8570b4c5729b28bdd63c9b46"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 5, 15, 14, 27, 21, 462270, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
+def test_deposit_2_with_xmlns():
+ """Has a provider, xmlns, and the metadata is only in
+ metadata->extrinsic->raw->origin_metadata"""
+ # Deposit metadata, shared by reference between the revision row and the
+ # deposit rows below.
+ extrinsic_metadata = {
+ "title": "Je suis GPL",
+ "@xmlns": "http://www.w3.org/2005/Atom",
+ "client": "swh",
+ "codemeta:url": "https://forge.softwareheritage.org/source/jesuisgpl/",
+ "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+ "codemeta:author": {
+ "codemeta:name": "Stefano Zacchiroli",
+ "codemeta:jobTitle": "Maintainer",
+ },
+ "codemeta:license": {
+ "codemeta:url": "https://spdx.org/licenses/GPL-3.0-or-later.html",
+ "codemeta:name": "GNU General Public License v3.0 or later",
+ },
+ "external_identifier": "je-suis-gpl",
+ "codemeta:dateCreated": "2018-01-05",
+ }
+ original_artifacts = [
+ {
+ "length": 80880,
+ "filename": "archive.zip",
+ "checksums": {
+ "sha1": "bad32a47a359e0e16ebdca2ad2dc6a771dac8f71",
+ "sha256": "182b7ee3b7b5b550e83d3bcfed029bb2f625ee760ebfe9557d5fd072bd4e22e4",
+ },
+ }
+ ]
+
+ # Revision row given to handle_row.
+ row = {
+ "id": b'\x01"\x96nP\x93\x17\xae\xcejA\xd0\xf0\x88\xdas<\xc0\x9d\x0f',
+ "date": datetime.datetime(2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2018, 1, 5, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"swh: Deposit 687 in collection swh",
+ "metadata": {
+ "extrinsic": {
+ "raw": {
+ "origin": {
+ "url": "https://www.softwareheritage.org/check-deposit-2020-06-26T13:50:07.564420",
+ "type": "deposit",
+ },
+ "origin_metadata": {
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {"sword_version": 2},
+ },
+ "metadata": extrinsic_metadata,
+ "provider": {
+ "metadata": {},
+ "provider_url": "https://www.softwareheritage.org",
+ "provider_name": "swh",
+ "provider_type": "deposit_client",
+ },
+ },
+ },
+ "when": "2020-06-26T13:50:22.640625+00:00",
+ "provider": "https://deposit.softwareheritage.org/1/private/687/meta/",
+ },
+ "original_artifact": original_artifacts,
+ },
+ }
+
+ swhid = (
+ "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea"
+ ";origin=https://www.softwareheritage.org/check-deposit-2020-06-26T13:50:07.564420"
+ ";visit=swh:1:snp:8fd469e280fb0724175c64906627f619143d5bdb"
+ ";anchor=swh:1:rev:0122966e509317aece6a41d0f088da733cc09d0f"
+ ";path=/"
+ )
+ # Rows served by the mocked deposit cursor: one request carrying the
+ # metadata, and one request without any (metadata is None).
+ deposit_rows = [
+ {
+ "deposit.id": 687,
+ "deposit.external_id": "check-deposit-2020-06-26T13:50:07.564420",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2020, 6, 26, 13, 50, 8, 216113, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://www.softwareheritage.org",
+ "deposit_collection.name": "swh",
+ "auth_user.username": "swh",
+ },
+ {
+ "deposit.id": 687,
+ "deposit.external_id": "check-deposit-2020-06-26T13:50:07.564420",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2020, 6, 26, 13, 50, 8, 150498, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://www.softwareheritage.org",
+ "deposit_collection.name": "swh",
+ "auth_user.username": "swh",
+ },
+ ]
+
+ origin_url = (
+ "https://www.softwareheritage.org/check-deposit-2020-06-26T13:50:07.564420"
+ )
+
+ storage = Mock()
+
+ def origin_get(urls):
+ # Only the deposit's origin may be looked up.
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ # The deposit cursor must have been queried, and iterated over, exactly once.
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ # First write: the deposit metadata, with the deposit client as authority
+ # and the date of the deposit request that carried the metadata.
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:0122966e509317aece6a41d0f088da733cc09d0f"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 6, 26, 13, 50, 8, 216113, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_DEPOSIT_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ # Second write: the original artifacts, with SWH as authority and the
+ # extrinsic "when" timestamp of the row as date.
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:0122966e509317aece6a41d0f088da733cc09d0f"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 6, 26, 13, 50, 22, 640625, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
+def test_deposit_3_and_wrong_external_id_in_metadata():
+ # Deposit revision whose metadata carries external_identifier
+ # "vtune-perf-tool", while the deposit database rows below report a
+ # different external_id (a UUID). The origin URL expected by this test is
+ # built with the UUID, not with the identifier found in the metadata.
+ extrinsic_metadata = {
+ "title": "VTune Perf tool",
+ "@xmlns": "http://www.w3.org/2005/Atom",
+ "client": "swh",
+ "codemeta:url": "https://software.intel.com/en-us/vtune",
+ "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+ "codemeta:author": {
+ "codemeta:name": "VTune developer",
+ "codemeta:jobTitle": "Software Engineer",
+ },
+ "external_identifier": "vtune-perf-tool",
+ "codemeta:dateCreated": "2019-05-14",
+ "codemeta:description": "Modified version of Linux Perf tool which is used by Intel VTune Amplifier",
+ }
+ # Artifact description as stored in the revision row: flat "name" and
+ # hash keys.
+ source_original_artifacts = [
+ {
+ "name": "archive.zip",
+ "sha1": "07251dbb1d904d143fd7da9935701f17670d4d9b",
+ "length": 4350528,
+ "sha256": "1f7d111ac79e468002f3edf4b7b2487538d41f6bea362d49b2eb08a537efafb6",
+ "sha1_git": "e2d894efcaad4ff36f09eda3b3c0096416b03429",
+ "blake2s256": "e2c08b82efbc361fbb2d28aa8352668cd71217f165f63de16b61ed61ace7509d",
+ "archive_type": "zip",
+ }
+ ]
+ # Same artifact in the shape expected in storage: "filename" plus a nested
+ # "checksums" mapping.
+ dest_original_artifacts = [
+ {
+ "length": 4350528,
+ "archive_type": "zip",
+ "filename": "archive.zip",
+ "checksums": {
+ "sha1": "07251dbb1d904d143fd7da9935701f17670d4d9b",
+ "sha256": "1f7d111ac79e468002f3edf4b7b2487538d41f6bea362d49b2eb08a537efafb6",
+ "sha1_git": "e2d894efcaad4ff36f09eda3b3c0096416b03429",
+ "blake2s256": "e2c08b82efbc361fbb2d28aa8352668cd71217f165f63de16b61ed61ace7509d",
+ },
+ }
+ ]
+
+ row = {
+ "id": b"\t5`S\xc4\x9a\xd0\xf9\xe6.Q\xc2\x9d>a|y\x11@\xdf",
+ "date": datetime.datetime(2019, 5, 14, 0, 0, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2019, 5, 14, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"intel: Deposit 268 in collection intel",
+ "metadata": {
+ **extrinsic_metadata,
+ "original_artifact": source_original_artifacts,
+ },
+ }
+
+ swhid = (
+ "swh:1:dir:527c8e4a67d391f2bf1bbc86dd94af5d5cfc8ef7"
+ ";origin=https://software.intel.com/f80482de-90a8-4c32-bce4-6f6918d492ff"
+ ";visit=swh:1:snp:49d60943d9c061da1aba6266a811412f9db8de2e"
+ ";anchor=swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df"
+ ";path=/"
+ )
+ # Rows served by the mocked deposit cursor: four requests for deposit 268,
+ # two of which carry no metadata (None).
+ deposit_rows = [
+ {
+ "deposit.id": 268,
+ "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://software.intel.com",
+ "deposit_collection.name": "intel",
+ "auth_user.username": "intel",
+ },
+ {
+ "deposit.id": 268,
+ "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2019, 5, 14, 7, 49, 36, 477061, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://software.intel.com",
+ "deposit_collection.name": "intel",
+ "auth_user.username": "intel",
+ },
+ {
+ "deposit.id": 268,
+ "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2019, 5, 14, 7, 28, 33, 210100, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://software.intel.com",
+ "deposit_collection.name": "intel",
+ "auth_user.username": "intel",
+ },
+ {
+ "deposit.id": 268,
+ "deposit.external_id": "f80482de-90a8-4c32-bce4-6f6918d492ff",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2019, 5, 14, 7, 28, 33, 41454, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://software.intel.com",
+ "deposit_collection.name": "intel",
+ "auth_user.username": "intel",
+ },
+ ]
+
+ origin_url = "https://software.intel.com/f80482de-90a8-4c32-bce4-6f6918d492ff"
+
+ storage = Mock()
+
+ # The storage mock only accepts a lookup of the expected origin URL.
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ # A deep copy is passed so the literals above stay pristine for the
+ # comparison below.
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ # Expected writes, in order: one per deposit row that has metadata (dated
+ # with that row's deposit_request.date), then the original artifacts, dated
+ # with the most recent deposit_request.date of the rows above.
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc
+ ),
+ authority=INTEL_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 5, 14, 7, 28, 33, 210100, tzinfo=datetime.timezone.utc
+ ),
+ authority=INTEL_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:09356053c49ad0f9e62e51c29d3e617c791140df"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 5, 14, 7, 49, 36, 775072, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(dest_original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
+def test_deposit_3_and_no_swhid():
+ # Deposit-style revision handled without any deposit cursor (deposit_cur is
+ # None below); the test expects handle_row to make no storage call at all.
+ # NOTE(review): going by the test name, this presumably models a deposit
+ # with no SWHID recorded -- confirm against handle_row.
+ # The "# ..." markers are elisions already present in the fixture data.
+ extrinsic_metadata = {
+ "id": "hal-02337300",
+ "@xmlns": "http://www.w3.org/2005/Atom",
+ "author": {"name": "HAL", "email": "hal@ccsd.cnrs.fr"},
+ "client": "hal",
+ "codemeta:url": "https://hal.archives-ouvertes.fr/hal-02337300",
+ "codemeta:name": "R package SMM, Simulation and Estimation of Multi-State Discrete-Time Semi-Markov and Markov Models",
+ "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+ "codemeta:author": [
+ # ...
+ ],
+ # ...
+ }
+ original_artifacts = [
+ # ...
+ ]
+
+ row = {
+ "id": b"\x91\xe5\xca\x8b'K\xf1\xa8cFd2\xd7Q\xf7A\xbc\x94\xba&",
+ "date": datetime.datetime(2017, 1, 1, 0, 0, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2019, 11, 6, 14, 47, 30, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"hal: Deposit 342 in collection hal",
+ "metadata": {**extrinsic_metadata, "original_artifact": original_artifacts,},
+ }
+ storage = Mock()
+
+ # No cursor is supplied: any method call on it would raise AttributeError.
+ deposit_cur = None
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ # Nothing may be read from or written to storage for this row.
+ assert storage.method_calls == []
+
+
+def test_deposit_3_and_unknown_deposit():
+    # Deposit-style revision handled without any deposit cursor (deposit_cur
+    # is None below); the test expects handle_row to make no storage call.
+    # The "# ..." marker is an elision already present in the fixture data.
+    extrinsic_metadata = {
+        "title": "Je suis GPL",
+        "@xmlns": "http://www.w3.org/2005/Atom",
+        "client": "swh",
+        "codemeta:url": "https://forge.softwareheritage.org/source/jesuisgpl/",
+        "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+        "codemeta:author": {
+            "codemeta:name": "Stefano Zacchiroli",
+            "codemeta:jobTitle": "Maintainer",
+        },
+        # ...
+    }
+
+    row = {
+        "id": b"\x8e\x9c\xee\x14\xa6\xad9\xbc\xa44pw\xb8\x7f\xb5\xbb\xd8\x95;\xb1",
+        "date": datetime.datetime(
+            2018, 7, 23, 12, 25, 45, 907132, tzinfo=datetime.timezone.utc
+        ),
+        "committer_date": datetime.datetime(
+            2018, 7, 23, 12, 25, 45, 907132, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"swh: Deposit 159 in collection swh",
+        "metadata": extrinsic_metadata,
+    }
+
+    # No origin URL is defined here: the test asserts that storage is never
+    # touched, so there is no origin lookup to check.
+    storage = Mock()
+
+    # No cursor is supplied: any method call on it would raise AttributeError.
+    deposit_cur = None
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+    assert storage.method_calls == []
+
+
+def test_deposit_4_without_xmlns():
+ # Deposit metadata whose keys are written with expanded "{namespace}name"
+ # prefixes instead of "@xmlns" declarations. The expected storage format is
+ # "sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces".
+ # The "# ..." marker is an elision already present in the fixture data.
+ extrinsic_metadata = {
+ "{http://www.w3.org/2005/Atom}id": "hal-01243573",
+ "{http://www.w3.org/2005/Atom}author": {
+ "{http://www.w3.org/2005/Atom}name": "HAL",
+ "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr",
+ },
+ "{http://www.w3.org/2005/Atom}client": "hal",
+ "{http://www.w3.org/2005/Atom}external_identifier": "hal-01243573",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://hal-test.archives-ouvertes.fr/hal-01243573",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": {
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter"
+ },
+ # ...
+ }
+
+ # The row has no "original_artifact" entry; the commit message has an empty
+ # client name before the colon.
+ row = {
+ "id": b"\x03\x98\x7f\x05n\xafE\x96\xcd \xd7\xb2\xee\x01\xc9\xb8L\xed\xdf\xa8",
+ "date": datetime.datetime(
+ 2018, 1, 17, 12, 49, 30, 902891, tzinfo=datetime.timezone.utc
+ ),
+ "committer_date": datetime.datetime(
+ 2018, 1, 17, 12, 49, 30, 902891, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b": Deposit 79 in collection hal",
+ "metadata": extrinsic_metadata,
+ }
+
+ # Note that the anchor revision in this SWHID differs from the row id
+ # above; the expected metadata target below is the row id.
+ swhid = (
+ "swh:1:dir:e04b2a7b8a8838da0693e9fd992a10d6fd211b50"
+ ";origin=https://hal.archives-ouvertes.fr/hal-01243573"
+ ";visit=swh:1:snp:c31851534c86676a040fb10f438728c90f1c9d55"
+ ";anchor=swh:1:rev:43549ebbe70c9cdf0be1647e6319392eaa06f3a3"
+ ";path=/"
+ )
+ # Rows served by the mocked deposit cursor; only the second has metadata.
+ deposit_rows = [
+ {
+ "deposit.id": 79,
+ "deposit.external_id": "hal-01243573",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2018, 1, 17, 12, 49, 31, 208347, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ {
+ "deposit.id": 79,
+ "deposit.external_id": "hal-01243573",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2018, 1, 17, 12, 49, 30, 645576, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ ]
+
+ origin_url = "https://hal.archives-ouvertes.fr/hal-01243573"
+
+ storage = Mock()
+
+ # The storage mock only accepts a lookup of the expected origin URL.
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ # A single write is expected, dated with the deposit_request.date of the
+ # one row that carries metadata.
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:03987f056eaf4596cd20d7b2ee01c9b84ceddfa8"
+ ),
+ discovery_date=datetime.datetime(
+ 2018, 1, 17, 12, 49, 30, 645576, tzinfo=datetime.timezone.utc
+ ),
+ authority=HAL_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ # note: no original artifacts
+ ]
+
+
+def test_deposit_4_wrong_origin():
+ # Deposit whose origin is on a different host than the deposit client's
+ # provider_url: the SWHID's origin qualifier and the expected origin_url
+ # both use inria.halpreprod.archives-ouvertes.fr, while provider_url in the
+ # deposit rows is https://hal.archives-ouvertes.fr/.
+ # The "# ..." marker is an elision already present in the fixture data.
+ extrinsic_metadata = {
+ "{http://www.w3.org/2005/Atom}id": "hal-01588781",
+ "{http://www.w3.org/2005/Atom}author": {
+ "{http://www.w3.org/2005/Atom}name": "HAL",
+ "{http://www.w3.org/2005/Atom}email": "hal@ccsd.cnrs.fr",
+ },
+ "{http://www.w3.org/2005/Atom}client": "hal",
+ "{http://www.w3.org/2005/Atom}external_identifier": "hal-01588781",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url": "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "The assignment problem ",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author": {
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name": "Morane Gruenpeter",
+ "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation": "Initiative pour la Recherche et l'Innovation sur le Logiciel Libre",
+ },
+ # ...
+ }
+
+ # The row has no "original_artifact" entry.
+ row = {
+ "id": b"-{\xcec\x1f\xc7\x91\x08\x03\x11\xeb\x83\\GB\x8eXjn\xa4",
+ "date": datetime.datetime(
+ 2018, 1, 10, 13, 14, 51, 77033, tzinfo=datetime.timezone.utc
+ ),
+ "committer_date": datetime.datetime(
+ 2018, 1, 10, 13, 14, 51, 77033, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b": Deposit 75 in collection hal",
+ "metadata": extrinsic_metadata,
+ }
+
+ swhid = (
+ "swh:1:dir:d8971c651fe256942aa4499a3ccdbaa305d3bade"
+ ";origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01588781"
+ ";visit=swh:1:snp:7c70cc8ea5b79e376605fd6e9b3b04d98861ffc0"
+ ";anchor=swh:1:rev:2d7bce631fc791080311eb835c47428e586a6ea4"
+ ";path=/"
+ )
+ # Rows served by the mocked deposit cursor; only the second has metadata.
+ deposit_rows = [
+ {
+ "deposit.id": 75,
+ "deposit.external_id": "hal-01588781",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2018, 1, 10, 13, 14, 51, 523963, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ {
+ "deposit.id": 75,
+ "deposit.external_id": "hal-01588781",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2018, 1, 10, 13, 14, 50, 555143, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ ]
+
+ origin_url = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781"
+
+ storage = Mock()
+
+ # The storage mock only accepts a lookup of the expected origin URL.
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ # A single write is expected, dated with the deposit_request.date of the
+ # one row that carries metadata.
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:2d7bce631fc791080311eb835c47428e586a6ea4"
+ ),
+ discovery_date=datetime.datetime(
+ 2018, 1, 10, 13, 14, 50, 555143, tzinfo=datetime.timezone.utc
+ ),
+ authority=HAL_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ # note: no original artifacts
+ ]
+
+
+def test_deposit_missing_metadata_in_revision():
+ # Deposit whose revision row holds only "original_artifact": the deposit
+ # metadata itself is available solely through the deposit database rows
+ # (deposit_request.metadata), and is still expected to be written.
+ extrinsic_metadata = {
+ "id": "hal-01243573",
+ "@xmlns": "http://www.w3.org/2005/Atom",
+ "author": {"name": "HAL", "email": "hal@ccsd.cnrs.fr"},
+ "client": "hal",
+ "committer": "Administrateur Du Ccsd",
+ "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243573",
+ "codemeta:name": "The assignment problem",
+ "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+ "codemeta:author": {"codemeta:name": "Morane Gruenpeter"},
+ "codemeta:version": "1",
+ "codemeta:identifier": {"#text": "10.5281/zenodo.438684", "@name": "doi",},
+ "external_identifier": "hal-01243573",
+ "codemeta:dateCreated": "2017-11-16T14:54:23+01:00",
+ }
+ # Artifact description as stored in the revision row: flat "name" and
+ # hash keys.
+ source_original_artifacts = [
+ {
+ "name": "archive.zip",
+ "sha1": "e8e46324970cd5af7f98c5a86f33f47fa4a41b4a",
+ "length": 118650,
+ "sha256": "fec81b63d666c43524f966bbd3263da5bee55051d2b48c1659cca5f56fd953e5",
+ "sha1_git": "9da2bbd08bec590b36ede2ed43d74cd510b10a79",
+ "blake2s256": "5d0973ba3644cc2bcfdb41ff1891744337d6aa9547a7e59fe466f684b027f295",
+ "archive_type": "zip",
+ }
+ ]
+ # Same artifact in the shape expected in storage: "filename" plus a nested
+ # "checksums" mapping.
+ dest_original_artifacts = [
+ {
+ "length": 118650,
+ "archive_type": "zip",
+ "filename": "archive.zip",
+ "checksums": {
+ "sha1": "e8e46324970cd5af7f98c5a86f33f47fa4a41b4a",
+ "sha256": "fec81b63d666c43524f966bbd3263da5bee55051d2b48c1659cca5f56fd953e5",
+ "sha1_git": "9da2bbd08bec590b36ede2ed43d74cd510b10a79",
+ "blake2s256": "5d0973ba3644cc2bcfdb41ff1891744337d6aa9547a7e59fe466f684b027f295",
+ },
+ }
+ ]
+
+ row = {
+ "id": b"\x03@v\xf3\xf4\x1e\xe1 N\xb9\xf6@\x82\xcb\xe6\xe9P\xd7\xbb\x8a",
+ "date": datetime.datetime(
+ 2019, 2, 25, 15, 49, 16, 594536, tzinfo=datetime.timezone.utc
+ ),
+ "committer_date": datetime.datetime(
+ 2019, 2, 25, 15, 49, 16, 594536, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"hal: Deposit 229 in collection hal",
+ "metadata": {"original_artifact": source_original_artifacts},
+ }
+
+ swhid = (
+ "swh:1:dir:3d65b6f065118cb856272829b459f0dfa55549aa"
+ ";origin=https://hal-test.archives-ouvertes.fr/hal-01243573"
+ ";visit=swh:1:snp:322c54ff4023d3216a994bc9ff9ee524ed80ee1f"
+ ";anchor=swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a"
+ ";path=/"
+ )
+ # Rows served by the mocked deposit cursor; only the second has metadata.
+ deposit_rows = [
+ {
+ "deposit.id": 229,
+ "deposit.external_id": "hal-01243573",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2019, 2, 25, 15, 54, 30, 102072, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ {
+ "deposit.id": 229,
+ "deposit.external_id": "hal-01243573",
+ "deposit.swh_id_context": swhid,
+ "deposit.status": "success",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2019, 2, 25, 15, 49, 12, 302745, tzinfo=datetime.timezone.utc
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ ]
+
+ origin_url = "https://hal.archives-ouvertes.fr/hal-01243573"
+ # /!\ not https://hal-test.archives-ouvertes.fr/hal-01243573
+ # do not trust the metadata!
+
+ storage = Mock()
+
+ # The storage mock only accepts a lookup of the expected origin URL.
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ # Expected writes, in order: the deposit metadata, dated with the
+ # deposit_request.date of the row that carries it; then the original
+ # artifacts, dated with the most recent deposit_request.date of the rows.
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 2, 25, 15, 49, 12, 302745, tzinfo=datetime.timezone.utc
+ ),
+ authority=HAL_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:034076f3f41ee1204eb9f64082cbe6e950d7bb8a"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 2, 25, 15, 54, 30, 102072, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(dest_original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py b/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py
@@ -0,0 +1,108 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ Origin,
+ RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import handle_row, cran_package_from_url
+
+
+# Fetcher expected on every metadata object asserted in this module.
+FETCHER = MetadataFetcher(
+ name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+# Authority expected on the "original-artifacts-json" metadata object.
+SWH_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+
+
+def test_gnu():
+    # Revision in the "extrinsic"/"intrinsic" layout whose provider is a GNU
+    # FTP directory. Only one write is expected: the original artifacts,
+    # dated with the "when" value of the extrinsic metadata. The "raw"
+    # extrinsic mapping (url/time/length/...) is not expected in storage.
+    original_artifacts = [
+        {
+            "length": 842501,
+            "filename": "gperf-3.0.1.tar.gz",
+            "checksums": {
+                "sha1": "c4453ee492032b369006ee464f4dd4e2c0c0e650",
+                "sha256": "5be283ef62e1bd26abdaaf88b416dbea4b14c360b09befcda2f055656dc43f87",
+                "sha1_git": "bf1d5bb57d571101dd7b6acab2b78ae11bb861de",
+                "blake2s256": "661f84afeb1e0b914defe2b249d424af1dfe380a96016b3282ae758c70e19a70",
+            },
+        }
+    ]
+
+    row = {
+        "id": b"\x00\x1cqE\x8e@[%\xba\xcc\xc8\x0b\x99\xf6cM\xff\x9d+\x18",
+        "date": datetime.datetime(2003, 6, 13, 0, 11, tzinfo=datetime.timezone.utc),
+        "committer_date": datetime.datetime(
+            2003, 6, 13, 0, 11, tzinfo=datetime.timezone.utc
+        ),
+        "type": "tar",
+        "message": b"swh-loader-package: synthetic revision message",
+        "metadata": {
+            "extrinsic": {
+                "raw": {
+                    "url": "https://ftp.gnu.org/gnu/gperf/gperf-3.0.1.tar.gz",
+                    "time": "2003-06-13T00:11:00+00:00",
+                    "length": 842501,
+                    "version": "3.0.1",
+                    "filename": "gperf-3.0.1.tar.gz",
+                },
+                "when": "2019-11-27T11:17:38.318997+00:00",
+                "provider": "https://ftp.gnu.org/gnu/gperf/",
+            },
+            "intrinsic": {},
+            "original_artifact": original_artifacts,
+        },
+    }
+
+    origin_url = "https://ftp.gnu.org/gnu/gperf/"
+
+    storage = Mock()
+
+    # The storage mock only accepts a lookup of the expected origin URL.
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    # handle_row removes the fields it migrates from the row it is given, so
+    # it receives a deep copy: `original_artifacts` is also used to build the
+    # expected value below and must not be altered by the call under test.
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:001c71458e405b25baccc80b99f6634dff9d2b18"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2019, 11, 27, 11, 17, 38, 318997, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py b/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py
@@ -0,0 +1,124 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ Origin,
+ RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import handle_row, cran_package_from_url
+
+
+# Fetcher expected on every metadata object asserted in this module.
+FETCHER = MetadataFetcher(
+ name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+# Authority expected on the "original-artifacts-json" metadata object.
+SWH_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+# Authority expected on the "nixguix-sources-json" metadata object; its URL is
+# the same string as the provider and origin URL used in test_nixguix.
+NIX_UNSTABLE_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.FORGE,
+ url="https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
+ metadata={},
+)
+
+
+def test_nixguix():
+    # Revision in the "extrinsic" layout whose provider is the nixpkgs
+    # sources-unstable.json listing. Two writes are expected, both dated with
+    # the "when" value of the extrinsic metadata: the raw source entry
+    # (NIX_UNSTABLE_AUTHORITY), then the original artifacts (SWH_AUTHORITY).
+    extrinsic_metadata = {
+        "url": "https://files.pythonhosted.org/packages/source/a/alerta/alerta-7.4.5.tar.gz",
+        "integrity": "sha256-km8RAaG1ep+tYR8eHVr3UWk+/MNEqdsBr1Di/g02LYQ=",
+    }
+    original_artifacts = [
+        {
+            "length": 34903,
+            "filename": "alerta-7.4.5.tar.gz",
+            "checksums": {
+                "sha1": "66db4398b664de272fd5aa6610caa776b5e64651",
+                "sha256": "926f1101a1b57a9fad611f1e1d5af751693efcc344a9db01af50e2fe0d362d84",
+            },
+        }
+    ]
+
+    row = {
+        "id": b"\x00\x01\xbaM\xd0S\x94\x85\x02\x11\xd7\xb3\x85M\x99\x13\xd2:\xe3y",
+        "date": None,
+        "committer_date": None,
+        "type": "tar",
+        "message": b"",
+        "metadata": {
+            "extrinsic": {
+                "raw": extrinsic_metadata,
+                "when": "2020-06-03T11:25:05.259341+00:00",
+                "provider": "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
+            },
+            "original_artifact": original_artifacts,
+        },
+    }
+
+    origin_url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
+
+    storage = Mock()
+
+    # The storage mock only accepts a lookup of the expected origin URL.
+    def origin_get(urls):
+        assert urls == [origin_url]
+        return [Origin(url=origin_url)]
+
+    storage.origin_get.side_effect = origin_get
+    deposit_cur = None
+    # handle_row removes the fields it migrates from the row it is given, so
+    # it receives a deep copy: `extrinsic_metadata` and `original_artifacts`
+    # are also used to build the expected values below and must not be
+    # altered by the call under test.
+    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+    assert storage.method_calls == [
+        call.origin_get([origin_url]),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0001ba4dd05394850211d7b3854d9913d23ae379"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 6, 3, 11, 25, 5, 259341, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=NIX_UNSTABLE_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="nixguix-sources-json",
+                    metadata=json.dumps(extrinsic_metadata).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+        call.raw_extrinsic_metadata_add(
+            [
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=parse_swhid(
+                        "swh:1:rev:0001ba4dd05394850211d7b3854d9913d23ae379"
+                    ),
+                    discovery_date=datetime.datetime(
+                        2020, 6, 3, 11, 25, 5, 259341, tzinfo=datetime.timezone.utc
+                    ),
+                    authority=SWH_AUTHORITY,
+                    fetcher=FETCHER,
+                    format="original-artifacts-json",
+                    metadata=json.dumps(original_artifacts).encode(),
+                    origin=origin_url,
+                ),
+            ]
+        ),
+    ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py b/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_npm.py
@@ -0,0 +1,376 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ Origin,
+ RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import (
+ handle_row,
+ npm_package_from_source_url,
+)
+
+
+# Fetcher expected on every metadata object asserted in this module.
+FETCHER = MetadataFetcher(
+ name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+# Authority expected on the "replicate-npm-package-json" metadata objects.
+NPM_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={},
+)
+# Authority expected on the "original-artifacts-json" metadata objects.
+SWH_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+
+
+def test_npm_package_from_source_url():
+    """Checks the package name derived from a registry tarball URL.
+
+    Covers an unscoped name, scoped (@org/name) names, and a scope containing
+    a percent-encoded quote (%27)."""
+    expected_name_by_url = {
+        "https://registry.npmjs.org/@l3ilkojr/jdinsults/-/jdinsults-3.0.0.tgz": "@l3ilkojr/jdinsults",
+        "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz": "simplemaps",
+        "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz": "@piximi/components",
+        "https://registry.npmjs.org/@chappa%27ai/get-next-rc/-/get-next-rc-1.0.0.tgz": "@chappa'ai/get-next-rc",
+    }
+
+    # The URLs are unique and dicts keep insertion order, so the cases run in
+    # the order they are listed.
+    for tarball_url, expected_name in expected_name_by_url.items():
+        assert npm_package_from_source_url(tarball_url) == expected_name
+
+
+def test_npm_1():
+ """Tests loading a revision generated by a new NPM loader that
+ has a provider.
+
+ The revision metadata uses the "extrinsic"/"intrinsic" layout. Two storage
+ writes are expected, both dated with the "when" value of the extrinsic
+ metadata: the raw registry document (NPM authority), then the original
+ artifacts (SWH authority)."""
+
+ extrinsic_metadata = {
+ "_id": "@l3ilkojr/jdinsults@3.0.0",
+ "dist": {
+ "shasum": "b7f0d66090e0285f4e95d082d39bcb0c1b8f4ec8",
+ "tarball": "https://registry.npmjs.org/@l3ilkojr/jdinsults/-/jdinsults-3.0.0.tgz",
+ "fileCount": 4,
+ "integrity": "sha512-qpv8Zg51g0l51VjODEooMUGSGanGUuQpzX5msfR7ZzbgTsgPbpDNyTIsQ0wQzI9RzCCUjS84Ii2VhMISEQcEUA==",
+ "unpackedSize": 1583,
+ "npm-signature": "-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v3.0.4\r\nComment: https://openpgpjs.org\r\n\r\nwsFcBAEBCAAQBQJeUMS5CRA9TVsSAnZWagAAXpgP/0YgNOWN0U/Fz2RGeQhR\nVIKPvfGqZ2UfFxxUXWIc4QHvwyLCNUedCctpVdqnqmGJ9m/hj3K2zbRPD7Tm\n3nPl0HfzE7v3T8TDZfGhzW3c9mWxig+syr+sjo0EKyAgZVJ0mxbjOl4KHt+U\nQEwl/4falBsyYtK/pkCXWmmuC606QmPn/c6ZRD1Fw4vJjT9i5qi1KaBkIf6M\nnFmpOFxTcwxGGltOk3s3TKDtr8CIeWmdm3VkgsP2ErkPKAOcu12AT4/5tkg0\nDU+m1XmJb67rskb4Ncjvic/VutnPkEfNrk1IRXrmjDZBQbHtCJ7hd5ETmb9S\nE5WmMV8cpaGiW7AZvGTmkn5WETwQQU7po914zYiMg9+ozdwc7yC8cpGj/UoF\niKxsc1uxdfwWk/p3dShegEYM7sveloIXYsPaxbd84WRIfnwkWFZV82op96E3\neX+FRkhMfsHlK8OjZsBPXkppaB48jnZdm3GOOzT9YgyphV33j3J9GnNcDMDe\nriyCLV1BNSKDHElCDrvl1cBGg+C5qn/cTYjQdfEPPY2Hl2MgW9s4UV2s+YSx\n0BBd2A3j80wncP+Y7HFeC4Pv0SM0Pdq6xJaf3ELhj6j0rVZeTW1O3E/PFLXK\nnn/DZcsFXgIzjY+eBIMQgAhqyeJve8LeQNnGt3iNW10E2nZMpfc+dn0ESiwV\n2Gw4\r\n=8uqZ\r\n-----END PGP SIGNATURE-----\r\n",
+ },
+ "name": "@l3ilkojr/jdinsults",
+ "version": "3.0.0",
+ "_npmUser": {"name": "l3ilkojr", "email": "l3ilkojr@example.com"},
+ "_npmVersion": "6.13.6",
+ "description": "Generates insults",
+ "directories": {},
+ "maintainers": [{"name": "l3ilkojr", "email": "l3ilkojr@example.com"}],
+ "_nodeVersion": "10.14.0",
+ "_hasShrinkwrap": False,
+ "_npmOperationalInternal": {
+ "tmp": "tmp/jdinsults_3.0.0_1582351545285_0.2614827716102821",
+ "host": "s3://npm-registry-packages",
+ },
+ }
+
+ original_artifacts = [
+ {
+ "length": 1033,
+ "filename": "jdinsults-3.0.0.tgz",
+ "checksums": {
+ "sha1": "b7f0d66090e0285f4e95d082d39bcb0c1b8f4ec8",
+ "sha256": "42f22795ac883b02fded0b2bf3d8a77f6507d40bc67f28eea6b1b73eb59c515f",
+ },
+ }
+ ]
+
+ row = {
+ "id": b"\x00\x00\x02\xa4\x9b\xba\x17\xca\x8c\xf3\x7f_=\x16\xaa\xac\xf9S`\xfc",
+ "date": datetime.datetime(2020, 2, 22, 6, 5, 45, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2020, 2, 22, 6, 5, 45, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"3.0.0",
+ "metadata": {
+ "extrinsic": {
+ "raw": extrinsic_metadata,
+ "when": "2020-02-27T01:35:47.965375+00:00",
+ "provider": "https://replicate.npmjs.com/%40l3ilkojr%2Fjdinsults/",
+ },
+ "intrinsic": {
+ "raw": {"name": "@l3ilkojr/jdinsults", "version": "3.0.0"},
+ "tool": "package.json",
+ },
+ "original_artifact": original_artifacts,
+ },
+ }
+
+ # The expected origin is the package page on www.npmjs.com, not the
+ # replicate.npmjs.com provider URL stored in the row.
+ origin_url = "https://www.npmjs.com/package/@l3ilkojr/jdinsults"
+
+ storage = Mock()
+
+ # The storage mock only accepts a lookup of the expected origin URL.
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = None
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:000002a49bba17ca8cf37f5f3d16aaacf95360fc"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 2, 27, 1, 35, 47, 965375, tzinfo=datetime.timezone.utc,
+ ),
+ authority=NPM_AUTHORITY,
+ fetcher=FETCHER,
+ format="replicate-npm-package-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:000002a49bba17ca8cf37f5f3d16aaacf95360fc"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 2, 27, 1, 35, 47, 965375, tzinfo=datetime.timezone.utc,
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
+def test_npm_2_unscoped():
+ """Tests loading a revision generated by an old NPM loader that doesn't
+ have a provider; and the package name is unscoped (i.e. doesn't contain a
+ slash).
+
+ The revision metadata uses the "package"/"package_source" layout. Two
+ storage writes are expected, both dated 2016-12-23T07:21:29 UTC (the same
+ value as the row's date): the "package" document (NPM authority), then
+ original artifacts holding the filename, checksums and url found in
+ "package_source" (SWH authority)."""
+
+ # The "# ..." marker is an elision already present in the fixture data.
+ extrinsic_metadata = {
+ "bugs": {"url": "https://github.com/niwasawa/simplemaps/issues"},
+ "name": "simplemaps",
+ "author": "Naoki Iwasawa",
+ "license": "MIT",
+ # ...
+ }
+
+ package_source = {
+ "url": "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz",
+ "date": "2016-12-23T07:21:29.733Z",
+ "name": "simplemaps",
+ "sha1": "e2b8222930196def764527f5c61048c5b28fe3c4",
+ "sha256": "3ce94927bab5feafea5695d1fa4c2b8131413e53e249b32f9ac2ccff4d865a0b",
+ "version": "0.0.6",
+ "filename": "simplemaps-0.0.6.tgz",
+ "blake2s256": "6769b4009f8162be2e745604b153443d4907a85781d31a724217a3e2d42a7462",
+ }
+
+ # Expected artifact record; it has no "length" key, and "package_source"
+ # provides none.
+ original_artifacts = [
+ {
+ "filename": "simplemaps-0.0.6.tgz",
+ "checksums": {
+ "sha1": "e2b8222930196def764527f5c61048c5b28fe3c4",
+ "sha256": "3ce94927bab5feafea5695d1fa4c2b8131413e53e249b32f9ac2ccff4d865a0b",
+ "blake2s256": "6769b4009f8162be2e745604b153443d4907a85781d31a724217a3e2d42a7462",
+ },
+ "url": "https://registry.npmjs.org/simplemaps/-/simplemaps-0.0.6.tgz",
+ }
+ ]
+
+ row = {
+ "id": b"\x00\x00\x04\xae\xed\t\xee\x08\x9cx\x12d\xc0M%d\xfdX\xfe\xb5",
+ "date": datetime.datetime(
+ 2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc
+ ),
+ "committer_date": datetime.datetime(
+ 2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"0.0.6",
+ "metadata": {"package": extrinsic_metadata, "package_source": package_source,},
+ }
+
+ origin_url = "https://www.npmjs.com/package/simplemaps"
+
+ storage = Mock()
+
+ # The storage mock only accepts a lookup of the expected origin URL.
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = None
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:000004aeed09ee089c781264c04d2564fd58feb5"
+ ),
+ discovery_date=datetime.datetime(
+ 2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc,
+ ),
+ authority=NPM_AUTHORITY,
+ fetcher=FETCHER,
+ format="replicate-npm-package-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:000004aeed09ee089c781264c04d2564fd58feb5"
+ ),
+ discovery_date=datetime.datetime(
+ 2016, 12, 23, 7, 21, 29, tzinfo=datetime.timezone.utc,
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
+def test_npm_2_scoped():
+ """Tests loading a revision generated by an old NPM loader that doesn't
+ have a provider; and the package name is scoped (i.e. in the format
+ @org/name)."""
+
+ extrinsic_metadata = {
+ "bugs": {"url": "https://github.com/piximi/components/issues"},
+ "name": "@piximi/components",
+ # ...
+ }
+
+ package_source = {
+ "url": "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz",
+ "date": "2019-06-07T19:56:04.753Z",
+ "name": "@piximi/components",
+ "sha1": "4ab74e563cb61bb5b2022601a5133a2dd19d19ec",
+ "sha256": "69bb980bd6de3277b6bca86fd79c91f1c28db6910c8d03ecd05b32b78a35188f",
+ "version": "0.1.11",
+ "filename": "components-0.1.11.tgz",
+ "blake2s256": "ce33181d5eff25b70ffdd6f1a18acd472a1707ede23cd2adc6af272dfc40dbfd",
+ }
+
+ original_artifacts = [
+ {
+ "filename": "components-0.1.11.tgz",
+ "checksums": {
+ "sha1": "4ab74e563cb61bb5b2022601a5133a2dd19d19ec",
+ "sha256": "69bb980bd6de3277b6bca86fd79c91f1c28db6910c8d03ecd05b32b78a35188f",
+ "blake2s256": "ce33181d5eff25b70ffdd6f1a18acd472a1707ede23cd2adc6af272dfc40dbfd",
+ },
+ "url": "https://registry.npmjs.org/@piximi/components/-/components-0.1.11.tgz",
+ }
+ ]
+
+ row = {
+ "id": b"\x00\x00 \x19\xc5wXt\xbc\xed\x00zR\x9b\xd3\xb7\x8b\xf6\x04W",
+ "date": datetime.datetime(2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"0.1.11",
+ "metadata": {"package": extrinsic_metadata, "package_source": package_source,},
+ }
+
+ origin_url = "https://www.npmjs.com/package/@piximi/components"
+
+ storage = Mock()
+
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = None
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:00002019c5775874bced007a529bd3b78bf60457"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc,
+ ),
+ authority=NPM_AUTHORITY,
+ fetcher=FETCHER,
+ format="replicate-npm-package-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:00002019c5775874bced007a529bd3b78bf60457"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 6, 7, 19, 56, 4, tzinfo=datetime.timezone.utc,
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py
@@ -0,0 +1,356 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+# flake8: noqa
+# because of long lines
+
+import copy
+import datetime
+import json
+from unittest.mock import call, Mock
+
+from swh.model.identifiers import parse_swhid
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ Origin,
+ RawExtrinsicMetadata,
+)
+
+from swh.storage.migrate_extrinsic_metadata import (
+ handle_row,
+ pypi_project_from_filename,
+)
+
+
+FETCHER = MetadataFetcher(
+ name="migrate-extrinsic-metadata-from-revisions", version="0.0.1",
+)
+PYPI_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={},
+)
+SWH_AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.REGISTRY,
+ url="https://softwareheritage.org/",
+ metadata={},
+)
+
+
+def test_pypi_project_from_filename():
+ files = [
+ ("django-agent-trust-0.1.8.tar.gz", "django-agent-trust"),
+ ("python_test-1.0.1.zip", "python_test"),
+ ("py-evm-0.2.0a9.tar.gz", "py-evm"),
+ ("collective.texttospeech-1.0rc1.tar.gz", "collective.texttospeech"),
+ ("flatland-fork-0.4.post1.dev40550160.zip", "flatland-fork"),
+ ]
+
+ for (filename, project) in files:
+ assert pypi_project_from_filename(filename) == project
+
+
+def test_pypi_1():
+ """Tests loading a revision generated by a new PyPI loader that
+ has a provider."""
+
+ extrinsic_metadata = {
+ "url": "https://files.pythonhosted.org/packages/70/89/a498245baf1bf3dde73d3da00b4b067a8aa7c7378ad83472078803ea3e43/m3-ui-2.2.73.tar.gz",
+ "size": 3933168,
+ "digests": {
+ "md5": "a374ac3f655e97df5db5335e2142d344",
+ "sha256": "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d",
+ },
+ "has_sig": False,
+ "filename": "m3-ui-2.2.73.tar.gz",
+ "downloads": -1,
+ "md5_digest": "a374ac3f655e97df5db5335e2142d344",
+ "packagetype": "sdist",
+ "upload_time": "2019-11-11T06:21:20",
+ "comment_text": "",
+ "python_version": "source",
+ "requires_python": None,
+ "upload_time_iso_8601": "2019-11-11T06:21:20.073082Z",
+ }
+
+ original_artifacts = [
+ {
+ "length": 3933168,
+ "filename": "m3-ui-2.2.73.tar.gz",
+ "checksums": {
+ "sha1": "9f4ec7ce64b7fea4b122e85d47ea31146c367b03",
+ "sha256": "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d",
+ },
+ }
+ ]
+
+ row = {
+ "id": b"\x00\x00\x07a{S\xe7\xb1E\x8fi]\xd0}\xe4\xceU\xaf\x15\x17",
+ "date": datetime.datetime(
+ 2019, 11, 11, 6, 21, 20, tzinfo=datetime.timezone.utc,
+ ),
+ "committer_date": datetime.datetime(
+ 2019, 11, 11, 6, 21, 20, tzinfo=datetime.timezone.utc,
+ ),
+ "type": "tar",
+ "message": b"2.2.73",
+ "metadata": {
+ "extrinsic": {
+ "raw": extrinsic_metadata,
+ "when": "2020-01-23T18:43:09.109407+00:00",
+ "provider": "https://pypi.org/pypi/m3-ui/json",
+ },
+ "intrinsic": {
+ "raw": {
+ "name": "m3-ui",
+ "summary": "======",
+ "version": "2.2.73",
+ # ...
+ "metadata_version": "1.1",
+ },
+ "tool": "PKG-INFO",
+ },
+ "original_artifact": original_artifacts,
+ },
+ }
+
+ origin_url = "https://pypi.org/project/m3-ui/"
+
+ storage = Mock()
+
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = None
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 1, 23, 18, 43, 9, 109407, tzinfo=datetime.timezone.utc,
+ ),
+ authority=PYPI_AUTHORITY,
+ fetcher=FETCHER,
+ format="pypi-project-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 1, 23, 18, 43, 9, 109407, tzinfo=datetime.timezone.utc,
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
+def test_pypi_2():
+ """Tests loading a revision generated by an old PyPI loader that
+ does not have a provider, but has 'project' metadata."""
+
+ extrinsic_metadata = {
+ "name": "jupyterhub-simx",
+ "author": "Jupyter Development Team",
+ "license": "BSD",
+ "summary": "JupyterHub: A multi-user server for Jupyter notebooks",
+ "version": "1.0.5",
+ # ...
+ }
+
+ source_original_artifacts = [
+ {
+ "url": "https://files.pythonhosted.org/packages/72/28/a8098763d78e2c4607cb67602c0d726a97ac38d4c1f531aac28f49de2e1a/jupyterhub-simx-1.0.5.tar.gz",
+ "date": "2019-01-23T22:10:55",
+ "sha1": "ede3eadd5a06e70912e3ba7cfccef789c4ad3168",
+ "size": 2346538,
+ "sha256": "0399d7f5f0d90c525d369f0507ad0e8ef8729c1c7fa63aadfc46a27514d14a46",
+ "filename": "jupyterhub-simx-1.0.5.tar.gz",
+ "sha1_git": "734301124712182eb30fc90e97cc18cef5432f02",
+ "blake2s256": "bb4aa82ffb5891a05dcf6d4dce3ad56fd2c18e9abdba9d20972910649d869322",
+ "archive_type": "tar",
+ }
+ ]
+
+ dest_original_artifacts = [
+ {
+ "url": "https://files.pythonhosted.org/packages/72/28/a8098763d78e2c4607cb67602c0d726a97ac38d4c1f531aac28f49de2e1a/jupyterhub-simx-1.0.5.tar.gz",
+ "filename": "jupyterhub-simx-1.0.5.tar.gz",
+ "archive_type": "tar",
+ "length": 2346538,
+ "checksums": {
+ "sha1": "ede3eadd5a06e70912e3ba7cfccef789c4ad3168",
+ "sha256": "0399d7f5f0d90c525d369f0507ad0e8ef8729c1c7fa63aadfc46a27514d14a46",
+ "sha1_git": "734301124712182eb30fc90e97cc18cef5432f02",
+ "blake2s256": "bb4aa82ffb5891a05dcf6d4dce3ad56fd2c18e9abdba9d20972910649d869322",
+ },
+ }
+ ]
+
+ row = {
+ "id": b"\x00\x00\x04\xd68,J\xd4\xc0Q\x92fbl6U\x1f\x0eQ\xca",
+ "date": datetime.datetime(
+ 2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc
+ ),
+ "committer_date": datetime.datetime(
+ 2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"1.0.5",
+ "metadata": {
+ "project": extrinsic_metadata,
+ "original_artifact": source_original_artifacts,
+ },
+ }
+
+ origin_url = "https://pypi.org/project/jupyterhub-simx/"
+
+ storage = Mock()
+
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = None
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ assert storage.method_calls == [
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc,
+ ),
+ authority=PYPI_AUTHORITY,
+ fetcher=FETCHER,
+ format="pypi-project-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=None,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca"
+ ),
+ discovery_date=datetime.datetime(
+ 2019, 1, 23, 22, 10, 55, tzinfo=datetime.timezone.utc,
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(dest_original_artifacts).encode(),
+ origin=None,
+ ),
+ ]
+ ),
+ ]
+
+
+def test_pypi_3():
+ """Tests loading a revision generated by a very old PyPI loader that
+ has neither a provider nor 'project' metadata."""
+
+ source_original_artifact = {
+ "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
+ "date": "2014-05-07T22:03:00",
+ "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
+ "size": 46644,
+ "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
+ "filename": "PyPDFLite-0.1.32.tar.gz",
+ "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
+ "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
+ "archive_type": "tar",
+ }
+
+ dest_original_artifacts = [
+ {
+ "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
+ "filename": "PyPDFLite-0.1.32.tar.gz",
+ "archive_type": "tar",
+ "length": 46644,
+ "checksums": {
+ "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
+ "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
+ "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
+ "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
+ },
+ }
+ ]
+
+ row = {
+ "id": b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2",
+ "date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"0.1.32",
+ "metadata": {"original_artifact": source_original_artifact},
+ }
+
+ origin_url = "https://pypi.org/project/PyPDFLite/"
+
+ storage = Mock()
+
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = None
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ assert storage.method_calls == [
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2"
+ ),
+ discovery_date=datetime.datetime(
+ 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc,
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(dest_original_artifacts).encode(),
+ origin=None,
+ ),
+ ]
+ ),
+ ]
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 8:30 AM (2 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226208
Attached To
D3820: Add a Python script to migrate extrinsic metadata from revision metadata.
Event Timeline
Log In to Comment