diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -24,6 +24,9 @@ [mypy-django.*] ignore_missing_imports = True +[mypy-iso8601.*] +ignore_missing_imports = True + [mypy-msgpack.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ deprecated typing-extensions mypy_extensions +iso8601 diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py new file mode 100644 --- /dev/null +++ b/swh/storage/migrate_extrinsic_metadata.py @@ -0,0 +1,545 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import datetime +import json +import re +import sys +from typing import Any, Dict, Optional +from urllib.parse import unquote, urlparse + +import iso8601 + +from swh.core.db import BaseDb +from swh.model.hashutil import hash_to_hex +from swh.model.identifiers import SWHID +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + RawExtrinsicMetadata, +) +from swh.storage import get_storage +from swh.storage.algos.snapshot import visits_and_snapshots_get_from_revision + +CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" + +ATOM_NS = "http://www.w3.org/2005/Atom" +ATOM_KEYS = ["id", "author", "external_identifier", "title"] + +REVISION_COLS = ["id", "date", "committer_date", "type", "message", "metadata"] + +DEPOSIT_COLS = [ + "deposit.id", + "deposit_request.metadata", + "deposit_request.date", + "deposit_client.provider_url", + "deposit_collection.name", + "auth_user.username", +] + +OLD_DEPOSIT_FORMAT = ( + "sword-v2-atom-codemeta-v2-in-json-with-expanded-namespaces" # before february 2018 +) +NEW_DEPOSIT_FORMAT = 
"sword-v2-atom-codemeta-v2-in-json" # after february 2018 +GNU_FORMAT = "gnu-tree-json" +NIXGUIX_FORMAT = "nixguix-sources-json" +NPM_FORMAT = "replicate-npm-package-json" +ORIGINAL_ARTIFACT_FORMAT = "original-artifact-json" +PYPI_FORMAT = "pypi-project-json" + +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", +) +AUTHORITIES = { + "npmjs": MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={} + ), + "pypi": MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={} + ), + "gnu": MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://ftp.gnu.org/", metadata={} + ), + "swh": MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, + ), +} + +deposit_revision_message_re = re.compile( + b"(?P[a-z]*): " + b"Deposit (?P[0-9]+) in collection (?P[a-z]+).*" +) + + +def remove_atom_codemeta_metadata_with_xmlns(metadata): + keys_to_remove = ATOM_KEYS + ["@xmlns", "@xmlns:codemeta"] + for key in list(metadata): + if key.startswith("codemeta:") or key in keys_to_remove: + del metadata[key] + + +def remove_atom_codemeta_metadata_without_xmlns(metadata): + for key in list(metadata): + if key.startswith(("{%s}" % ATOM_NS, "{%s}" % CODEMETA_NS)): + del metadata[key] + + +def load_metadata( + storage, + revision_id, + discovery_date: datetime.datetime, + metadata: Dict[str, Any], + format: str, + authority: MetadataAuthority, + origin: Optional[str], + dry_run: bool, +): + revision_swhid = SWHID(object_type="revision", object_id=hash_to_hex(revision_id)) + obj = RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=revision_swhid, + discovery_date=discovery_date, + authority=authority, + fetcher=FETCHER, + format=format, + metadata=json.dumps(metadata).encode(), + origin=origin, + ) + if not dry_run: + storage.raw_extrinsic_metadata_add([obj]) + + +def handle_deposit_row(row, storage, 
deposit_cur, dry_run: bool): + parsed_message = deposit_revision_message_re.match(row["message"]) + assert parsed_message is not None, row["message"] + + deposit_id = int(parsed_message.group("deposit_id")) + collection = parsed_message.group("collection").decode() + client_name = parsed_message.group("client").decode() + + deposit_cur.execute( + f"SELECT {', '.join(DEPOSIT_COLS)} FROM deposit " + f"INNER JOIN deposit_collection " + f" ON (deposit.collection_id=deposit_collection.id) " + f"INNER JOIN deposit_client ON (deposit.client_id=deposit_client.user_ptr_id) " + f"INNER JOIN auth_user ON (deposit.client_id=auth_user.id) " + f"INNER JOIN deposit_request ON (deposit.id=deposit_request.deposit_id) " + f"WHERE deposit.id = %s", + (deposit_id,), + ) + + provider_urls = set() + metadata_entries = [] + for deposit_request_row in deposit_cur: + deposit_request = dict(zip(DEPOSIT_COLS, deposit_request_row)) + + # Sanity checks to make sure we selected the right deposit + assert deposit_request["deposit.id"] == deposit_id + assert deposit_request["deposit_collection.name"] == collection, deposit_request + if client_name != "": + # Sometimes it's missing from the commit message + assert deposit_request["auth_user.username"] == client_name + + provider_urls.add(deposit_request["deposit_client.provider_url"]) + date = deposit_request_row["deposit_request.date"] + metadata = deposit_request["deposit_request.metadata"] + if metadata is not None: + json.dumps(metadata).encode() # check it's valid + if "@xmlns" in metadata: + assert metadata["@xmlns"] == ATOM_NS + assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) + format = NEW_DEPOSIT_FORMAT + else: + assert "{http://www.w3.org/2005/Atom}id" in metadata + assert "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata + format = OLD_DEPOSIT_FORMAT + metadata_entries.append((date, format, metadata)) + + assert len(metadata_entries) >= 1, deposit_id + assert len(provider_urls) == 1, f"expected 1 
provider url, got {provider_urls}" + (provider_url,) = provider_urls + + authority = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url=provider_url, metadata={}, + ) + + for (date, format, metadata) in metadata_entries: + load_metadata( + storage, + row["id"], + date, + metadata, + format, + authority=authority, + origin=None, # TODO + dry_run=dry_run, + ) + + +def handle_row(row: Dict[str, Any], storage, deposit_cur, dry_run: bool): + type_ = row["type"] + date = row["date"] or row["committer_date"] + + metadata = row["metadata"] + + if metadata is None: + return + + if type_ == "dsc": + origin = None # TODO: I can't find how to get it reliably + + # TODO: the debian loader writes the changelog date as the revision's + # author date and committer date. Instead, we should use the visit's date, + # but I cannot find a way to reliably get it without the origin + + if "extrinsic" in metadata: + extrinsic_files = metadata["extrinsic"]["raw"]["files"] + for artifact_entry in metadata["original_artifact"]: + extrinsic_file = extrinsic_files[artifact_entry["filename"]] + for key in ("sha256",): + assert artifact_entry["checksums"][key] == extrinsic_file[key] + artifact_entry["url"] = extrinsic_file["uri"] + del metadata["extrinsic"] + + elif type_ == "tar": + provider = metadata.get("extrinsic", {}).get("provider") + if provider is not None: + # New versions of the loaders write the provider; use it. 
+ if provider.startswith("https://replicate.npmjs.com/"): + # npm loader format 1 + + parsed_url = urlparse(provider) + assert re.match("^/[^/]+/?$", parsed_url.path), parsed_url + package_name = unquote(parsed_url.path.strip("/")) + origin = "https://www.npmjs.com/package/" + package_name + assert storage.origin_get([origin])[0] is not None + + load_metadata( + storage, + row["id"], + row["date"], + metadata["extrinsic"]["raw"], + NPM_FORMAT, + authority=AUTHORITIES["npmjs"], + origin=origin, + dry_run=dry_run, + ) + del metadata["extrinsic"] + + elif provider.startswith("https://pypi.org/"): + # pypi loader format 1 + + match = re.match( + "https://pypi.org/pypi/(?P<project_name>.*)/json", provider + ) + assert match, f"unexpected provider URL format: {provider}" + project_name = match.group("project_name") + origin = f"https://pypi.org/project/{project_name}/" + assert storage.origin_get([origin])[0] is not None + + load_metadata( + storage, + row["id"], + row["date"], + metadata["extrinsic"]["raw"], + PYPI_FORMAT, + authority=AUTHORITIES["pypi"], + origin=origin, + dry_run=dry_run, + ) + del metadata["extrinsic"] + + elif provider.startswith("https://cran.r-project.org/"): + # cran loader + + assert date is None + date = iso8601.parse_date(metadata["extrinsic"]["when"]) + + origin = metadata["extrinsic"]["provider"] + assert storage.origin_get([origin])[0] is not None + + # the metadata is intrinsic, so there is nothing to do. 
+ del metadata["extrinsic"] + + elif provider.startswith("https://nix-community.github.io/nixpkgs-swh/"): + # nixguix loader + origin = provider + + authority = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url=provider, metadata={}, + ) + assert row["date"] is None # the nixguix loader does not write dates + + # Let's figure out which visits produced this revision + dates = set() + for (visit, status, snapshot) in visits_and_snapshots_get_from_revision( + storage, provider, row["id"] + ): + dates.add(visit.date) + + for date in dates: + load_metadata( + storage, + row["id"], + date, + metadata["extrinsic"]["raw"], + NIXGUIX_FORMAT, + authority=authority, + origin=origin, + dry_run=dry_run, + ) + del metadata["extrinsic"] + + elif provider.startswith("https://ftp.gnu.org/"): + # archive loader + + origin = None # TODO + load_metadata( + storage, + row["id"], + row["date"], + metadata["extrinsic"]["raw"], + GNU_FORMAT, + authority=AUTHORITIES["gnu"], + origin=None, # TODO + dry_run=dry_run, + ) + del metadata["extrinsic"] + + elif provider.startswith("https://deposit.softwareheritage.org/"): + origin = None # TODO + + if "@xmlns" in metadata: + assert metadata["@xmlns"] == ATOM_NS + assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) + assert "intrinsic" not in metadata + assert "extra_headers" not in metadata + + # deposit loader format 1 + # (pretty rare? 
In id order, the first revision with this format + # is 022310df16fd9e4d4f81fe36a142e82db977c01d) + # in the case, the metadata seems to be both directly in metadata + # and in metadata["extrinsic"]["raw"]["metadata"] + + handle_deposit_row(row, storage, deposit_cur, dry_run) + + remove_atom_codemeta_metadata_with_xmlns(metadata) + if "client" in metadata: + del metadata["client"] + del metadata["extrinsic"] + else: + # deposit loader format 2 + actual_metadata = metadata["extrinsic"]["raw"]["origin_metadata"][ + "metadata" + ] + assert actual_metadata["@xmlns"] == ATOM_NS + assert actual_metadata["@xmlns:codemeta"] in ( + CODEMETA_NS, + [CODEMETA_NS], + ) + + handle_deposit_row(row, storage, deposit_cur, dry_run) + + del metadata["extrinsic"] + + else: + assert False, f"unknown provider {provider}" + + # Older versions don't write the provider; use heuristics instead. + elif ( + metadata.get("package_source", {}) + .get("url", "") + .startswith("https://registry.npmjs.org/") + ): + # npm loader format 2 + + origin = None # TODO + + load_metadata( + storage, + row["id"], + row["date"], + metadata["package"], + NPM_FORMAT, + authority=AUTHORITIES["npmjs"], + origin=None, # TODO + dry_run=dry_run, + ) + del metadata["package"] + + assert "original_artifact" not in metadata + + # rebuild an "original_artifact"-like metadata dict from what we + # can salvage of "package_source" + package_source_metadata = metadata["package_source"] + keep_keys = {"blake2s256", "filename", "sha1", "sha256", "url"} + discard_keys = { + "date", # is equal to the revision date + "name", # was loaded above + "version", # same + } + assert ( + set(package_source_metadata) == keep_keys | discard_keys + ), package_source_metadata + + # will be loaded below + metadata["original_artifact"] = { + k: metadata["package_source"][k] for k in keep_keys + } + del metadata["package_source"] + + elif "project" in metadata: + assert metadata["original_artifact"]["url"].startswith( + 
"https://files.pythonhosted.org/" + ) + # pypi loader format 2 + + origin = None # TODO + + load_metadata( + storage, + row["id"], + row["date"], + metadata["project"], + PYPI_FORMAT, + authority=AUTHORITIES["pypi"], + origin=None, # TODO + dry_run=dry_run, + ) + del metadata["project"] + + elif "@xmlns" in metadata: + assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS]) + assert "intrinsic" not in metadata + assert "extra_headers" not in metadata + + # deposit loader format 3 and 4 + + origin = None # TODO + + handle_deposit_row(row, storage, deposit_cur, dry_run) + remove_atom_codemeta_metadata_with_xmlns(metadata) + if "client" in metadata: + del metadata["client"] # found in the deposit db + if "committer" in metadata: + del metadata["committer"] # found on the revision object + + elif "{http://www.w3.org/2005/Atom}id" in metadata: + assert "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata + assert "intrinsic" not in metadata + assert "extra_headers" not in metadata + + # deposit loader format 5 + + origin = None # TODO + + handle_deposit_row(row, storage, deposit_cur, dry_run) + remove_atom_codemeta_metadata_without_xmlns(metadata) + + # Ignore common intrinsic metadata keys + for key in ("intrinsic", "extra_headers"): + if key in metadata: + del metadata[key] + + # Ignore loader-specific intrinsic metadata keys + if type_ == "hg": + del metadata["node"] + elif type_ == "dsc": + if "package_info" in metadata: + del metadata["package_info"] + + if "original_artifact" in metadata: + load_metadata( + storage, + row["id"], + date, + metadata["original_artifact"], + ORIGINAL_ARTIFACT_FORMAT, + authority=AUTHORITIES["swh"], + origin=origin, + dry_run=dry_run, + ) + del metadata["original_artifact"] + + assert metadata == {}, ( + f"remaining metadata keys for {row['id'].hex()} (type: {row['type']}): " + f"{metadata}" + ) + + +def create_fetchers(db): + with db.cursor() as cur: + cur.execute( + """ + INSERT INTO metadata_fetcher (name, 
version, metadata) + VALUES (%s, %s, %s) + ON CONFLICT DO NOTHING + """, + (FETCHER.name, FETCHER.version, FETCHER.metadata), + ) + + +def main(storage_dbconn, storage_url, deposit_dbconn, first_id, dry_run): + storage_db = BaseDb.connect(storage_dbconn) + deposit_db = BaseDb.connect(deposit_dbconn) + storage = get_storage("remote", url=storage_url) + + if not dry_run: + create_fetchers(storage_db) + # Not creating authorities, as the loaders are presumably already running + # and created them already. + # This also helps make sure this script doesn't accidentally create + # authorities that differ from what the loaders use. + + total_rows = 0 + with storage_db.cursor() as read_cur: + with deposit_db.cursor() as deposit_cur: + after_id = first_id + while True: + read_cur.execute( + f"SELECT {', '.join(REVISION_COLS)} FROM revision " + f"WHERE id > %s AND metadata IS NOT NULL ORDER BY id LIMIT 1000", + (after_id,), + ) + new_rows = 0 + for row in read_cur: + row_d = dict(zip(REVISION_COLS, row)) + handle_row(row_d, storage, deposit_cur, dry_run) + new_rows += 1 + + if new_rows == 0: + break + + after_id = row_d["id"] + + total_rows += new_rows + percents = ( + int.from_bytes(after_id[0:4], byteorder="big") * 100 / (1 << 32) + ) + print( + f"Migrated {total_rows/1000000.:.2f}M rows " + f"(~{percents:.1f}%, last revision: {after_id.hex()})" + ) + + +if __name__ == "__main__": + if len(sys.argv) == 4: + (_, storage_dbconn, storage_url, deposit_dbconn) = sys.argv + first_id = "00" * 20 + elif len(sys.argv) == 5: + (_, storage_dbconn, storage_url, deposit_dbconn, first_id) = sys.argv + else: + print( + f"Syntax: {sys.argv[0]} " + f" []" + ) + exit(1) + main(storage_dbconn, storage_url, deposit_dbconn, bytes.fromhex(first_id), True) diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py b/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py new file mode 100644 --- /dev/null +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_cran.py @@ -0,0 
+1,108 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa +# because of long lines + +import datetime +import json +from unittest.mock import call, Mock + +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + +from swh.storage.migrate_extrinsic_metadata import handle_row + + +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", +) +SWH_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, +) + + +def test_cran(): + original_artifact = [ + { + "length": 8018, + "filename": "gofgamma_1.0.tar.gz", + "checksums": { + "sha1": "58f2993140f9e9e1a136554f0af0174a252f2c7b", + "sha256": "55408f004642b5043bb01de831a7e7a0b9f24a30cb0151e70c2d37abdc508d03", + }, + } + ] + + row = { + "id": b'\x00\x00\xd4\xef^\x16a"\xae\xe6\x86*\xd3\x8a\x18\xceS\x86\xcc>', + "date": None, + "committer_date": None, + "type": "tar", + "message": b"1.0", + "metadata": { + "extrinsic": { + "raw": { + "url": "https://cran.r-project.org/src/contrib/gofgamma_1.0.tar.gz", + "version": "1.0", + }, + "when": "2020-04-30T11:01:57.832481+00:00", + "provider": "https://cran.r-project.org/package=gofgamma", + }, + "intrinsic": { + "raw": { + "Type": "Package", + "Title": "Goodness-of-Fit Tests for the Gamma Distribution", + "Author": "Lucas Butsch [aut],\n Bruno Ebner [aut, cre],\n Steffen Betsch [aut]", + # ... 
+ }, + "tool": "DESCRIPTION", + }, + "original_artifact": original_artifact, + }, + } + + origin_url = "https://cran.r-project.org/package=gofgamma" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(row, storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:0000d4ef5e166122aee6862ad38a18ce5386cc3e" + ), + discovery_date=datetime.datetime( + 2020, 4, 30, 11, 1, 57, 832481, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifact-json", + metadata=json.dumps(original_artifact).encode(), + origin=origin_url, + ), + ] + ), + ] diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py new file mode 100644 --- /dev/null +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_debian.py @@ -0,0 +1,234 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa +# because of long lines + +import datetime +import json +from unittest.mock import call, Mock + +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + RawExtrinsicMetadata, +) + +from swh.storage.migrate_extrinsic_metadata import handle_row + + +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", version="0.0.1", +) +SWH_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", 
+ metadata={}, +) + + +def test_debian_with_extrinsic(): + original_artifact = [ + { + "length": 2936, + "filename": "kalgebra_19.12.1-1.dsc", + "checksums": { + "sha1": "f869e9f1155b1ee6d28ae3b40060570152a358cd", + "sha256": "75f77150aefdaa4bcf8bc5b1e9b8b90b5cb1651b76a068c5e58e5b83658d5d11", + }, + }, + { + "length": 1156408, + "filename": "kalgebra_19.12.1.orig.tar.xz", + "checksums": { + "sha1": "e496032962212983a5359aebadfe13c4026fd45c", + "sha256": "49d623186800eb8f6fbb91eb43fb14dff78e112624c9cda6b331d494d610b16a", + }, + }, + { + "length": 10044, + "filename": "kalgebra_19.12.1-1.debian.tar.xz", + "checksums": { + "sha1": "b518bfc2ac708b40577c595bd539faa8b84572db", + "sha256": "1a30acd2699c3769da302f7a0c63a7d7b060f80925b38c8c43ce3bec92744d67", + }, + }, + { + "length": 488, + "filename": "kalgebra_19.12.1.orig.tar.xz.asc", + "checksums": { + "sha1": "ff53a5c21c1aef2b9caa38a02fa3488f43df4c20", + "sha256": "a37e0b95bb1f16b19b0587bc5d3b99ba63a195d7f6335c4a359122ad96d682dd", + }, + }, + ] + + row = { + "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee", + "date": datetime.datetime( + 2020, 1, 26, 22, 3, 24, tzinfo=datetime.timezone.utc, + ), + "date_offset": 60, + "type": "dsc", + "message": b"Synthetic revision for Debian source package kalgebra version 4:19.12.1-1", + "metadata": { + "extrinsic": { + "raw": { + "id": 2718802, + "name": "kalgebra", + "files": { + "kalgebra_19.12.1-1.dsc": { + "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc", + "name": "kalgebra_19.12.1-1.dsc", + "size": 2936, + "md5sum": "fd28f604d4cc31a0a305543230f1622a", + "sha256": "75f77150aefdaa4bcf8bc5b1e9b8b90b5cb1651b76a068c5e58e5b83658d5d11", + }, + "kalgebra_19.12.1.orig.tar.xz": { + "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz", + "name": "kalgebra_19.12.1.orig.tar.xz", + "size": 1156408, + "md5sum": "34e09ed152da762d53101ea33634712b", + "sha256": 
"49d623186800eb8f6fbb91eb43fb14dff78e112624c9cda6b331d494d610b16a", + }, + "kalgebra_19.12.1-1.debian.tar.xz": { + "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.debian.tar.xz", + "name": "kalgebra_19.12.1-1.debian.tar.xz", + "size": 10044, + "md5sum": "4f639f36143898d97d044f273f038e58", + "sha256": "1a30acd2699c3769da302f7a0c63a7d7b060f80925b38c8c43ce3bec92744d67", + }, + "kalgebra_19.12.1.orig.tar.xz.asc": { + "uri": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1.orig.tar.xz.asc", + "name": "kalgebra_19.12.1.orig.tar.xz.asc", + "size": 488, + "md5sum": "3c29291e4e6f0c294de80feb8e9fce4c", + "sha256": "a37e0b95bb1f16b19b0587bc5d3b99ba63a195d7f6335c4a359122ad96d682dd", + }, + }, + "version": "4:19.12.1-1", + "revision_id": None, + }, + "when": "2020-01-27T19:32:03.925498+00:00", + "provider": "http://deb.debian.org/debian//pool/main/k/kalgebra/kalgebra_19.12.1-1.dsc", + }, + "intrinsic": { + "raw": { + "name": "kalgebra", + "version": "4:19.12.1-1", + # ... 
+ }, + "tool": "dsc", + }, + "original_artifact": original_artifact, + }, + } + + storage = Mock() + deposit_cur = None + handle_row(row, storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:0000036c311ef33a281b05688f6eadcfc0943aee" + ), + discovery_date=datetime.datetime( + 2020, 1, 26, 22, 3, 24, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifact-json", + metadata=json.dumps(original_artifact).encode(), + ), + ] + ) + ] + + +def test_debian_without_extrinsic(): + original_artifact = [ + { + "name": "pymongo_1.10-1.dsc", + "sha1": "81877c1ae4406c2519b9cc9c4557cf6b0775a241", + "length": 99, + "sha256": "40269a73f38ee4c2f9cc021f1d5d091cc59ca6e778c339684b7be030e29e282f", + "sha1_git": "0ac7bdb8e4d10926c5d3e51baa2be7bb29a3966b", + }, + { + "name": "pymongo_1.10.orig.tar.gz", + "sha1": "4f4c97641b86ac8f21396281bd1a7369236693c3", + "length": 99, + "sha256": "0b6bffb310782ffaeb3916c75790742ec5830c63a758fc711cd1f557eb5a4b5f", + "sha1_git": "19ef0adda8868520d1ef9d4164b3ace4df1d62ad", + }, + { + "name": "pymongo_1.10-1.debian.tar.gz", + "sha1": "fbf378296613c8d55e043aec98896b3e50a94971", + "length": 99, + "sha256": "3970cc70fe3ba6499a9c56ba4b4c6c3782f56433d0d17d72b7a0e2ceae31b513", + "sha1_git": "2eea9904806050a8fda95edd5d4fa60d29c1fdec", + }, + ] + + row = { + "id": b"\x00\x00\x01\xc2\x8c\x8f\xca\x01\xb9\x04\xde\x92\xa2d\n\x86l\xe0<\xb7", + "date": datetime.datetime( + 2011, 3, 31, 20, 17, 41, tzinfo=datetime.timezone.utc + ), + "date_offset": 0, + "type": "dsc", + "message": b"Synthetic revision for Debian source package pymongo version 1.10-1", + "metadata": { + "package_info": { + "name": "pymongo", + "version": "1.10-1", + "changelog": { + # ... 
+ }, + "maintainers": [ + {"name": "Federico Ceratto", "email": "federico.ceratto@gmail.com"}, + {"name": "Janos Guljas", "email": "janos@resenje.org"}, + ], + "pgp_signature": { + "date": "2011-03-31T21:02:44+00:00", + "keyid": "2BABC6254E66E7B8450AC3E1E6AA90171392B174", + "person": {"name": "David Paleino", "email": "d.paleino@gmail.com"}, + }, + "lister_metadata": {"id": 244296, "lister": "snapshot.debian.org"}, + }, + "original_artifact": original_artifact, + }, + } + + storage = Mock() + deposit_cur = None + handle_row(row, storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000001c28c8fca01b904de92a2640a866ce03cb7" + ), + discovery_date=datetime.datetime( + 2011, 3, 31, 20, 17, 41, tzinfo=datetime.timezone.utc + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifact-json", + metadata=json.dumps(original_artifact).encode(), + ), + ] + ) + ] diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py new file mode 100644 --- /dev/null +++ b/swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py @@ -0,0 +1,150 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# flake8: noqa +# because of long lines + +import datetime +import json +from unittest.mock import call, Mock + +from swh.model.identifiers import parse_swhid +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + MetadataTargetType, + Origin, + RawExtrinsicMetadata, +) + +from swh.storage.migrate_extrinsic_metadata import handle_row + + +FETCHER = MetadataFetcher( + name="migrate-extrinsic-metadata-from-revisions", 
version="0.0.1", +) +PYPI_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={}, +) +SWH_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.REGISTRY, + url="https://softwareheritage.org/", + metadata={}, +) + + +def test_pypi_1(): + extrinsic_metadata = { + "url": "https://files.pythonhosted.org/packages/70/89/a498245baf1bf3dde73d3da00b4b067a8aa7c7378ad83472078803ea3e43/m3-ui-2.2.73.tar.gz", + "size": 3933168, + "digests": { + "md5": "a374ac3f655e97df5db5335e2142d344", + "sha256": "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d", + }, + "has_sig": False, + "filename": "m3-ui-2.2.73.tar.gz", + "downloads": -1, + "md5_digest": "a374ac3f655e97df5db5335e2142d344", + "packagetype": "sdist", + "upload_time": "2019-11-11T06:21:20", + "comment_text": "", + "python_version": "source", + "requires_python": None, + "upload_time_iso_8601": "2019-11-11T06:21:20.073082Z", + } + + original_artifact = [ + { + "length": 3933168, + "filename": "m3-ui-2.2.73.tar.gz", + "checksums": { + "sha1": "9f4ec7ce64b7fea4b122e85d47ea31146c367b03", + "sha256": "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d", + }, + } + ] + + row = { + "id": b"\x00\x00\x07a{S\xe7\xb1E\x8fi]\xd0}\xe4\xceU\xaf\x15\x17", + "date": datetime.datetime( + 2019, 11, 11, 6, 21, 20, tzinfo=datetime.timezone.utc, + ), + "committer_date": datetime.datetime( + 2019, 11, 11, 6, 21, 20, tzinfo=datetime.timezone.utc, + ), + "type": "tar", + "message": b"2.2.73", + "metadata": { + "extrinsic": { + "raw": extrinsic_metadata, + "when": "2020-01-23T18:43:09.109407+00:00", + "provider": "https://pypi.org/pypi/m3-ui/json", + }, + "intrinsic": { + "raw": { + "name": "m3-ui", + "summary": "======", + "version": "2.2.73", + # ... 
+ "metadata_version": "1.1", + }, + "tool": "PKG-INFO", + }, + "original_artifact": original_artifact, + }, + } + + origin_url = "https://pypi.org/project/m3-ui/" + + storage = Mock() + + def origin_get(urls): + assert urls == [origin_url] + return [Origin(url=origin_url)] + + storage.origin_get.side_effect = origin_get + deposit_cur = None + handle_row(row, storage, deposit_cur, dry_run=False) + + assert storage.method_calls == [ + call.origin_get([origin_url]), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517" + ), + discovery_date=datetime.datetime( + 2019, 11, 11, 6, 21, 20, tzinfo=datetime.timezone.utc, + ), + authority=PYPI_AUTHORITY, + fetcher=FETCHER, + format="pypi-project-json", + metadata=json.dumps(extrinsic_metadata).encode(), + origin=origin_url, + ), + ] + ), + call.raw_extrinsic_metadata_add( + [ + RawExtrinsicMetadata( + type=MetadataTargetType.REVISION, + id=parse_swhid( + "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517" + ), + discovery_date=datetime.datetime( + 2019, 11, 11, 6, 21, 20, tzinfo=datetime.timezone.utc, + ), + authority=SWH_AUTHORITY, + fetcher=FETCHER, + format="original-artifact-json", + metadata=json.dumps(original_artifact).encode(), + origin=origin_url, + ), + ] + ), + ]