diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py index f5888fa..bf8d118 100644 --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -1,325 +1,351 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime +from datetime import timezone import json import logging from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union import attr import requests from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import cached_method, download from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Person, Revision, RevisionType, Sha1Git, TimestampWithTimezone, ) from swh.storage.algos.snapshot import snapshot_get_all_branches logger = logging.getLogger(__name__) +def now() -> datetime.datetime: + return datetime.datetime.now(tz=timezone.utc) + + @attr.s class DepositPackageInfo(BasePackageInfo): filename = attr.ib(type=str) # instead of Optional[str] raw_info = attr.ib(type=Dict[str, Any]) author_date = attr.ib(type=datetime.datetime) """codemeta:dateCreated if any, deposit completed_date otherwise""" commit_date = attr.ib(type=datetime.datetime) """codemeta:datePublished if any, deposit completed_date otherwise""" client = attr.ib(type=str) id = attr.ib(type=int) """Internal ID of the deposit in the deposit DB""" collection = attr.ib(type=str) """The collection in the deposit; see SWORD specification.""" author = attr.ib(type=Person) committer = attr.ib(type=Person) revision_parents = attr.ib(type=Tuple[Sha1Git, ...]) """Revisions created from previous deposits, that will be used as parents of the revision created for this deposit.""" @classmethod def from_metadata( cls, metadata: Dict[str, Any], url: str, filename: str ) -> "DepositPackageInfo": # Note: # `date` and `committer_date` are always transmitted by the deposit read api # which computes itself the values. The loader needs to use those to create the # revision. - raw_metadata_from_origin = json.dumps( - metadata["origin_metadata"]["metadata"] - ).encode() - metadata = metadata.copy() - # FIXME: this removes information from 'raw' metadata - depo = metadata.pop("deposit") - + all_metadata_raw: List[str] = metadata["metadata_raw"] + raw_info = { + "origin": metadata["origin"], + "origin_metadata": { + "metadata": json.dumps(metadata["metadata_dict"]), + "provider": metadata["provider"], + "tool": metadata["tool"], + }, + } + depo = metadata["deposit"] return cls( url=url, filename=filename, author_date=depo["author_date"], commit_date=depo["committer_date"], client=depo["client"], id=depo["id"], collection=depo["collection"], author=parse_author(depo["author"]), committer=parse_author(depo["committer"]), revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]), - raw_info=metadata, + raw_info=raw_info, revision_extrinsic_metadata=[ RawExtrinsicMetadataCore( - format="sword-v2-atom-codemeta-v2-in-json", - metadata=raw_metadata_from_origin, - ), + discovery_date=now(), + metadata=raw_metadata.encode(), + format="sword-v2-atom-codemeta-v2", + ) + for raw_metadata in all_metadata_raw ], ) class DepositLoader(PackageLoader[DepositPackageInfo]): """Load pypi origin's artifact releases into swh archive. """ visit_type = "deposit" def __init__(self, url: str, deposit_id: str): """Constructor Args: url: Origin url to associate the artifacts/metadata to deposit_id: Deposit identity """ super().__init__(url=url) config_deposit = self.config["deposit"] self.deposit_id = deposit_id self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"]) def get_versions(self) -> Sequence[str]: # only 1 branch 'HEAD' with no alias since we only have 1 snapshot # branch return ["HEAD"] def get_metadata_authority(self) -> MetadataAuthority: - provider = self.metadata()["origin_metadata"]["provider"] - assert provider["provider_type"] == "deposit_client" + provider = self.metadata()["provider"] + assert provider["provider_type"] == MetadataAuthorityType.DEPOSIT_CLIENT.value return MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=provider["provider_url"], metadata={ "name": provider["provider_name"], **(provider["metadata"] or {}), }, ) def get_metadata_fetcher(self) -> MetadataFetcher: - tool = self.metadata()["origin_metadata"]["tool"] + tool = self.metadata()["tool"] return MetadataFetcher( name=tool["name"], version=tool["version"], metadata=tool["configuration"], ) def get_package_info( self, version: str ) -> Iterator[Tuple[str, DepositPackageInfo]]: p_info = DepositPackageInfo.from_metadata( self.metadata(), url=self.url, filename="archive.zip", ) yield "HEAD", p_info def download_package( self, p_info: DepositPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Override to allow use of the dedicated deposit client """ return [self.client.archive_get(self.deposit_id, tmpdir, p_info.filename)] def build_revision( self, p_info: DepositPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: message = ( f"{p_info.client}: Deposit {p_info.id} in collection {p_info.collection}" ).encode("utf-8") return Revision( type=RevisionType.TAR, message=message, author=p_info.author, date=TimestampWithTimezone.from_dict(p_info.author_date), committer=p_info.committer, committer_date=TimestampWithTimezone.from_dict(p_info.commit_date), parents=p_info.revision_parents, directory=directory, synthetic=True, metadata={ "extrinsic": { "provider": self.client.metadata_url(self.deposit_id), "when": self.visit_date.isoformat(), "raw": p_info.raw_info, }, }, ) def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: - origin_metadata = self.metadata()["origin_metadata"] + metadata = self.metadata() + all_metadata_raw: List[str] = metadata["metadata_raw"] + origin_metadata = json.dumps( + { + "metadata": all_metadata_raw, + "provider": metadata["provider"], + "tool": metadata["tool"], + } + ).encode() return [ RawExtrinsicMetadataCore( - format="sword-v2-atom-codemeta-v2-in-json", - metadata=json.dumps(origin_metadata["metadata"]).encode(), + discovery_date=now(), + metadata=raw_meta.encode(), + format="sword-v2-atom-codemeta-v2", + ) + for raw_meta in all_metadata_raw + ] + [ + RawExtrinsicMetadataCore( + discovery_date=now(), + metadata=origin_metadata, + format="original-artifacts-json", ) ] @cached_method def metadata(self): """Returns metadata from the deposit server""" return self.client.metadata_get(self.deposit_id) def load(self) -> Dict: # First making sure the deposit is known prior to trigger a loading try: self.metadata() except ValueError: logger.error(f"Unknown deposit {self.deposit_id}, ignoring") return {"status": "failed"} # Then usual loading r = super().load() success = r["status"] != "failed" # Update deposit status try: if not success: self.client.status_update(self.deposit_id, status="failed") return r snapshot_id = hash_to_bytes(r["snapshot_id"]) snapshot = snapshot_get_all_branches(self.storage, snapshot_id) if not snapshot: return r branches = snapshot.branches logger.debug("branches: %s", branches) if not branches: return r rev_id = branches[b"HEAD"].target revision = self.storage.revision_get([rev_id])[0] if not revision: return r # update the deposit's status to success with its # revision-id and directory-id self.client.status_update( self.deposit_id, status="done", revision_id=hash_to_hex(rev_id), directory_id=hash_to_hex(revision.directory), snapshot_id=r["snapshot_id"], origin_url=self.url, ) except Exception: logger.exception("Problem when trying to update the deposit's status") return {"status": "failed"} return r def parse_author(author) -> Person: """See prior fixme """ return Person( fullname=author["fullname"].encode("utf-8"), name=author["name"].encode("utf-8"), email=author["email"].encode("utf-8"), ) class ApiClient: """Private Deposit Api client """ def __init__(self, url, auth: Optional[Mapping[str, str]]): self.base_url = url.rstrip("/") self.auth = None if not auth else (auth["username"], auth["password"]) def do(self, method: str, url: str, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in get/post/put Returns: The request's execution output """ method_fn = getattr(requests, method) if self.auth: kwargs["auth"] = self.auth return method_fn(url, *args, **kwargs) def archive_get( self, deposit_id: Union[int, str], tmpdir: str, filename: str ) -> Tuple[str, Dict]: """Retrieve deposit's archive artifact locally """ url = f"{self.base_url}/{deposit_id}/raw/" return download(url, dest=tmpdir, filename=filename, auth=self.auth) def metadata_url(self, deposit_id: Union[int, str]) -> str: return f"{self.base_url}/{deposit_id}/meta/" def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]: """Retrieve deposit's metadata artifact as json """ url = self.metadata_url(deposit_id) r = self.do("get", url) if r.ok: return r.json() msg = f"Problem when retrieving deposit metadata at {url}" logger.error(msg) raise ValueError(msg) def status_update( self, deposit_id: Union[int, str], status: str, revision_id: Optional[str] = None, directory_id: Optional[str] = None, snapshot_id: Optional[str] = None, origin_url: Optional[str] = None, ): """Update deposit's information including status, and persistent identifiers result of the loading. """ url = f"{self.base_url}/{deposit_id}/update/" payload = {"status": status} if revision_id: payload["revision_id"] = revision_id if directory_id: payload["directory_id"] = directory_id if snapshot_id: payload["snapshot_id"] = snapshot_id if origin_url: payload["origin_url"] = origin_url self.do("put", url, json=payload) diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json index 4af7553..d054b58 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json @@ -1,66 +1,62 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/some-external-id", "type": "deposit" }, - "origin_metadata": { - "metadata": { - "@xmlns": [ - "http://www.w3.org/2005/Atom" - ], - "author": [ - "some awesome author", - "another one", - "no one" - ], - "codemeta:dateCreated": "2017-10-07T15:17:08Z", - "external_identifier": "some-external-id", - "url": "https://hal-test.archives-ouvertes.fr/some-external-id" - }, - "provider": { - "provider_name": "hal", - "provider_type": "deposit_client", - "provider_url": "https://hal-test.archives-ouvertes.fr/", - "metadata": null - }, - "tool": { - "name": "swh-deposit", - "version": "0.0.1", - "configuration": { - "sword_version": "2" - } + "metadata_raw" : ["some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother one"], + "metadata_dict": { + "author": [ + "some awesome author", + "another one", + "no one" + ], + "codemeta:dateCreated": "2017-10-07T15:17:08Z", + "external_identifier": "some-external-id", + "url": "https://hal-test.archives-ouvertes.fr/some-external-id" + }, + "provider": { + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "metadata": null + }, + "tool": { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": { + "sword_version": "2" } }, "deposit": { "id": "666", "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0, "negative_utc": false }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0, "negative_utc": false }, "revision_parents": [] } } diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json index cbfab58..9d73fac 100644 --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json @@ -1,67 +1,65 @@ { "origin": { "url": "https://hal-test.archives-ouvertes.fr/some-external-id", "type": "deposit" }, - "origin_metadata": { - "metadata": { - "@xmlns": [ - "http://www.w3.org/2005/Atom" - ], - "author": [ - "some awesome author", - "another one", - "no one" - ], - "codemeta:dateCreated": "2017-10-07T15:17:08Z", - "codemeta:datePublished": "2017-10-08T15:00:00Z", - "external_identifier": "some-external-id", - "url": "https://hal-test.archives-ouvertes.fr/some-external-id" - }, - "provider": { - "provider_name": "hal", - "provider_type": "deposit_client", - "provider_url": "https://hal-test.archives-ouvertes.fr/", - "metadata": null - }, - "tool": { - "name": "swh-deposit", - "version": "0.0.1", - "configuration": { - "sword_version": "2" - } + "metadata_raw": ["some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", +"someone" + ], + "metadata_dict": { + "author": [ + "some awesome author", + "another one", + "no one" + ], + "codemeta:dateCreated": "2017-10-07T15:17:08Z", + "codemeta:datePublished": "2017-10-08T15:00:00Z", + "external_identifier": "some-external-id", + "url": "https://hal-test.archives-ouvertes.fr/some-external-id" + }, + "provider": { + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "metadata": null + }, + "tool": { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": { + "sword_version": "2" } }, "deposit": { "id": 777, "client": "hal", "collection": "hal", "author": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "author_date": { "timestamp": { "seconds": 1507389428, "microseconds": 0 }, "offset": 0, "negative_utc": false }, "committer": { "name": "Software Heritage", "fullname": "Software Heritage", "email": "robot@softwareheritage.org" }, "committer_date": { "timestamp": { "seconds": 1507474800, "microseconds": 0 }, "offset": 0, "negative_utc": false }, "revision_parents": [] } } diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py index f29d3f9..f70ae98 100644 --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -1,398 +1,429 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import re +from typing import List import attr import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.loader.package.deposit.loader import DepositLoader +from swh.loader.package.loader import now from swh.loader.package.tests.common import check_metadata_paths from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.identifiers import SWHID from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, RawExtrinsicMetadata, Snapshot, SnapshotBranch, TargetType, ) DEPOSIT_URL = "https://deposit.softwareheritage.org/1/private" @pytest.fixture def requests_mock_datadir(requests_mock_datadir): """Enhance default mock data to mock put requests as the loader does some internal update queries there. """ requests_mock_datadir.put(re.compile("https")) return requests_mock_datadir def test_deposit_init_ok(swh_config, swh_loader_config): url = "some-url" deposit_id = 999 loader = DepositLoader(url, deposit_id) # Something that does not exist assert loader.url == url assert loader.client is not None assert loader.client.base_url == swh_loader_config["deposit"]["url"] def test_deposit_loading_unknown_deposit(swh_config, requests_mock_datadir): """Loading an unknown deposit should fail no origin, no visit, no snapshot """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url" unknown_deposit_id = 667 loader = DepositLoader(url, unknown_deposit_id) # does not exist actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 0, "origin_visit": 0, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } == stats requests_mock_datadir_missing_one = requests_mock_datadir_factory( ignore_urls=[f"{DEPOSIT_URL}/666/raw/",] ) def test_deposit_loading_failure_to_retrieve_1_artifact( swh_config, requests_mock_datadir_missing_one ): """Deposit with missing artifact ends up with an uneventful/partial visit """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url-2" deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(loader.storage, url, status="partial", type="deposit") stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_revision_metadata_structure(swh_config, requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_revision_id = hash_to_bytes("637318680351f5d78856d13264faebbd91efe9bb") revision = loader.storage.revision_get([expected_revision_id])[0] assert revision is not None check_metadata_paths( revision.metadata, paths=[ ("extrinsic.provider", str), ("extrinsic.when", str), ("extrinsic.raw", dict), ("original_artifact", list), ], ) # Only 2 top-level keys now assert set(revision.metadata.keys()) == {"extrinsic", "original_artifact"} for original_artifact in revision.metadata["original_artifact"]: check_metadata_paths( original_artifact, paths=[("filename", str), ("length", int), ("checksums", dict),], ) def test_deposit_loading_ok(swh_config, requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() expected_snapshot_id = "b2b327b33dc85818bd23c3ccda8b7e675a66ecbd" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches(loader.storage, url, status="full", type="deposit") stats = get_stats(loader.storage) assert { "content": 303, "directory": 12, "origin": 1, "origin_visit": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats revision_id_hex = "637318680351f5d78856d13264faebbd91efe9bb" revision_id = hash_to_bytes(revision_id_hex) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=revision_id, target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) revision = loader.storage.revision_get([revision_id])[0] assert revision is not None # check metadata fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",) authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://hal-test.archives-ouvertes.fr/", ) # Check origin metadata orig_meta = loader.storage.raw_extrinsic_metadata_get( MetadataTargetType.ORIGIN, url, authority ) assert orig_meta.next_page_token is None - assert len(orig_meta.results) == 1 + raw_meta = loader.client.metadata_get(deposit_id) + all_metadata_raw: List[str] = raw_meta["metadata_raw"] + # 2 raw metadata xml + 1 json dict + assert len(orig_meta.results) == len(all_metadata_raw) + 1 orig_meta0 = orig_meta.results[0] assert orig_meta0.authority == authority assert orig_meta0.fetcher == fetcher # Check revision metadata revision_swhid = SWHID(object_type="revision", object_id=revision_id) - rev_meta = loader.storage.raw_extrinsic_metadata_get( + actual_rev_meta = loader.storage.raw_extrinsic_metadata_get( MetadataTargetType.REVISION, revision_swhid, authority ) - assert rev_meta.next_page_token is None - assert len(rev_meta.results) == 1 - rev_meta0 = rev_meta.results[0] - assert rev_meta0.authority == authority - assert rev_meta0.fetcher == fetcher + assert actual_rev_meta.next_page_token is None + assert len(actual_rev_meta.results) == len(all_metadata_raw) + for rev_meta in actual_rev_meta.results: + assert rev_meta.authority == authority + assert rev_meta.fetcher == fetcher + assert rev_meta.metadata.decode() in all_metadata_raw # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "done", "revision_id": revision_id_hex, "directory_id": hash_to_hex(revision.directory), "snapshot_id": expected_snapshot_id, "origin_url": url, } assert body == expected_body def test_deposit_loading_ok_2(swh_config, requests_mock_datadir): """Field dates should be se appropriately """ external_id = "some-external-id" url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 777 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() expected_snapshot_id = "3e68440fdd7c81d283f8f3aebb6f0c8657864192" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches(loader.storage, url, status="full", type="deposit") revision_id = "564d18943d71be80d0d73b43a77cfb205bcde96c" expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes(revision_id), target_type=TargetType.REVISION ) }, ) check_snapshot(expected_snapshot, storage=loader.storage) raw_meta = loader.client.metadata_get(deposit_id) # Ensure the date fields are set appropriately in the revision # Retrieve the revision revision = loader.storage.revision_get([hash_to_bytes(revision_id)])[0] assert revision assert revision.date.to_dict() == raw_meta["deposit"]["author_date"] assert revision.committer_date.to_dict() == raw_meta["deposit"]["committer_date"] read_api = f"{DEPOSIT_URL}/{deposit_id}/meta/" + provider = { + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "metadata": None, + } + tool = { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": {"sword_version": "2"}, + } assert revision.metadata == { "extrinsic": { "provider": read_api, "raw": { "origin": {"type": "deposit", "url": url,}, "origin_metadata": { - "metadata": { - "@xmlns": ["http://www.w3.org/2005/Atom"], - "author": ["some awesome author", "another one", "no one",], - "codemeta:dateCreated": "2017-10-07T15:17:08Z", - "codemeta:datePublished": "2017-10-08T15:00:00Z", - "external_identifier": "some-external-id", - "url": url, - }, - "provider": { - "metadata": None, - "provider_name": "hal", - "provider_type": "deposit_client", - "provider_url": "https://hal-test.archives-ouvertes.fr/", - }, - "tool": { - "configuration": {"sword_version": "2"}, - "name": "swh-deposit", - "version": "0.0.1", - }, + "metadata": json.dumps(raw_meta["metadata_dict"]), + "provider": provider, + "tool": tool, }, }, "when": revision.metadata["extrinsic"]["when"], # dynamic }, "original_artifact": [ { "checksums": { "sha1": "f8c63d7c890a7453498e6cf9fef215d85ec6801d", "sha256": "474bf646aeeff6d945eb752b1a9f8a40f3d81a88909ee7bd2d08cc822aa361e6", # noqa }, "filename": "archive.zip", "length": 956830, "url": "https://deposit.softwareheritage.org/1/private/777/raw/", } ], } fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",) authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://hal-test.archives-ouvertes.fr/", ) # Check the origin metadata swh side - orig_meta = loader.storage.raw_extrinsic_metadata_get( + origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get( MetadataTargetType.ORIGIN, url, authority ) - assert orig_meta.next_page_token is None - assert len(orig_meta.results) == 1 - - orig_meta0 = orig_meta.results[0] + assert origin_extrinsic_metadata.next_page_token is None + all_metadata_raw: List[str] = raw_meta["metadata_raw"] + # 1 raw metadata xml + 1 json dict + assert len(origin_extrinsic_metadata.results) == len(all_metadata_raw) + 1 + + expected_metadata = [] + for idx, raw_meta in enumerate(all_metadata_raw): + origin_meta = origin_extrinsic_metadata.results[idx] + expected_metadata.append( + RawExtrinsicMetadata( + type=MetadataTargetType.ORIGIN, + id=url, + discovery_date=origin_meta.discovery_date, + metadata=raw_meta.encode(), + format="sword-v2-atom-codemeta-v2", + authority=authority, + fetcher=fetcher, + ) + ) - expected_metadata = RawExtrinsicMetadata( - type=MetadataTargetType.ORIGIN, - id=url, - discovery_date=orig_meta0.discovery_date, - metadata=json.dumps( - { - "@xmlns": ["http://www.w3.org/2005/Atom"], - "author": ["some awesome author", "another one", "no one"], - "codemeta:dateCreated": "2017-10-07T15:17:08Z", - "codemeta:datePublished": "2017-10-08T15:00:00Z", - "external_identifier": "some-external-id", - "url": "https://hal-test.archives-ouvertes.fr/some-external-id", - } - ).encode(), - format="sword-v2-atom-codemeta-v2-in-json", - authority=authority, - fetcher=fetcher, + origin_metadata = { + "metadata": all_metadata_raw, + "provider": provider, + "tool": tool, + } + expected_metadata.append( + RawExtrinsicMetadata( + type=MetadataTargetType.ORIGIN, + id=url, + discovery_date=origin_extrinsic_metadata.results[-1].discovery_date, + metadata=json.dumps(origin_metadata).encode(), + format="original-artifacts-json", + authority=authority, + fetcher=fetcher, + ) ) - assert orig_meta0 == expected_metadata + assert len(origin_extrinsic_metadata.results) == len(expected_metadata) + for orig_meta in origin_extrinsic_metadata.results: + assert orig_meta in expected_metadata # Check the revision metadata swh side revision_swhid = SWHID(object_type="revision", object_id=revision_id) - rev_meta = loader.storage.raw_extrinsic_metadata_get( + actual_revision_metadata = loader.storage.raw_extrinsic_metadata_get( MetadataTargetType.REVISION, revision_swhid, authority ) - assert rev_meta.next_page_token is None + assert actual_revision_metadata.next_page_token is None + assert len(actual_revision_metadata.results) == len(all_metadata_raw) - assert len(rev_meta.results) == 1 - - rev_meta0 = rev_meta.results[0] - - assert rev_meta0 == attr.evolve( - expected_metadata, + rev_metadata_template = RawExtrinsicMetadata( type=MetadataTargetType.REVISION, id=revision_swhid, + format="sword-v2-atom-codemeta-v2", + authority=authority, + fetcher=fetcher, origin=url, + # to satisfy the constructor + discovery_date=now(), + metadata=b"", ) + expected_revision_metadata = [] + for idx, raw_meta in enumerate(all_metadata_raw): + rev_metadata = actual_revision_metadata.results[idx] + expected_revision_metadata.append( + attr.evolve( + rev_metadata_template, + discovery_date=rev_metadata.discovery_date, + metadata=raw_meta.encode(), + ) + ) + + assert actual_revision_metadata.results == expected_revision_metadata + # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "done", "revision_id": revision_id, "directory_id": hash_to_hex(revision.directory), "snapshot_id": expected_snapshot_id, "origin_url": url, } assert body == expected_body