diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information import datetime +from datetime import timezone import json import logging from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union @@ -33,6 +34,10 @@ logger = logging.getLogger(__name__) +def now() -> datetime.datetime: + return datetime.datetime.now(tz=timezone.utc) + + @attr.s class DepositPackageInfo(BasePackageInfo): filename = attr.ib(type=str) # instead of Optional[str] @@ -62,13 +67,16 @@ # which computes itself the values. The loader needs to use those to create the # revision. - raw_metadata_from_origin = json.dumps( - metadata["origin_metadata"]["metadata"] - ).encode() - metadata = metadata.copy() - # FIXME: this removes information from 'raw' metadata - depo = metadata.pop("deposit") - + all_raw_metadata: List[str] = metadata["raw_metadata"] + raw_info = { + "origin": metadata["origin"], + "origin_metadata": { + "metadata": all_raw_metadata, + "provider": metadata["provider"], + "tool": metadata["tool"], + }, + } + depo = metadata["deposit"] return cls( url=url, filename=filename, @@ -80,12 +88,14 @@ author=parse_author(depo["author"]), committer=parse_author(depo["committer"]), revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]), - raw_info=metadata, + raw_info=raw_info, revision_extrinsic_metadata=[ RawExtrinsicMetadataCore( - format="sword-v2-atom-codemeta-v2-in-json", - metadata=raw_metadata_from_origin, - ), + discovery_date=now(), + metadata=raw_metadata.encode(), + format="sword-v2-atom-codemeta-v2", + ) + for raw_metadata in all_raw_metadata ], ) @@ -117,8 +127,8 @@ return ["HEAD"] def get_metadata_authority(self) -> MetadataAuthority: - provider = self.metadata()["origin_metadata"]["provider"] - assert provider["provider_type"] == "deposit_client" + provider = self.metadata()["provider"] + assert provider["provider_type"] == MetadataAuthorityType.DEPOSIT_CLIENT.value return MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url=provider["provider_url"], @@ -129,7 +139,7 @@ ) def get_metadata_fetcher(self) -> MetadataFetcher: - tool = self.metadata()["origin_metadata"]["tool"] + tool = self.metadata()["tool"] return MetadataFetcher( name=tool["name"], version=tool["version"], metadata=tool["configuration"], ) @@ -177,11 +187,27 @@ ) def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: - origin_metadata = self.metadata()["origin_metadata"] + metadata = self.metadata() + all_raw_metadata: List[str] = metadata["raw_metadata"] + origin_metadata = json.dumps( + { + "metadata": all_raw_metadata, + "provider": metadata["provider"], + "tool": metadata["tool"], + } + ).encode() return [ RawExtrinsicMetadataCore( - format="sword-v2-atom-codemeta-v2-in-json", - metadata=json.dumps(origin_metadata["metadata"]).encode(), + discovery_date=now(), + metadata=raw_meta.encode(), + format="sword-v2-atom-codemeta-v2", + ) + for raw_meta in all_raw_metadata + ] + [ + RawExtrinsicMetadataCore( + discovery_date=now(), + metadata=origin_metadata, + format="original-artifacts-json", ) ] diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json @@ -3,32 +3,18 @@ "url": "https://hal-test.archives-ouvertes.fr/some-external-id", "type": "deposit" }, - "origin_metadata": { - "metadata": { - "@xmlns": [ - "http://www.w3.org/2005/Atom" - ], - "author": [ - "some awesome author", - "another one", - "no one" - ], - "codemeta:dateCreated": "2017-10-07T15:17:08Z", - "external_identifier": "some-external-id", - "url": "https://hal-test.archives-ouvertes.fr/some-external-id" - }, - "provider": { - "provider_name": "hal", - "provider_type": "deposit_client", - "provider_url": "https://hal-test.archives-ouvertes.fr/", - "metadata": null - }, - "tool": { - "name": "swh-deposit", - "version": "0.0.1", - "configuration": { - "sword_version": "2" - } + "raw_metadata" : ["some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother one"], + "provider": { + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "metadata": null + }, + "tool": { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": { + "sword_version": "2" } }, "deposit": { diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json --- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json +++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json @@ -3,33 +3,20 @@ "url": "https://hal-test.archives-ouvertes.fr/some-external-id", "type": "deposit" }, - "origin_metadata": { - "metadata": { - "@xmlns": [ - "http://www.w3.org/2005/Atom" - ], - "author": [ - "some awesome author", - "another one", - "no one" - ], - "codemeta:dateCreated": "2017-10-07T15:17:08Z", - "codemeta:datePublished": "2017-10-08T15:00:00Z", - "external_identifier": "some-external-id", - "url": "https://hal-test.archives-ouvertes.fr/some-external-id" - }, - "provider": { - "provider_name": "hal", - "provider_type": "deposit_client", - "provider_url": "https://hal-test.archives-ouvertes.fr/", - "metadata": null - }, - "tool": { - "name": "swh-deposit", - "version": "0.0.1", - "configuration": { - "sword_version": "2" - } + "raw_metadata": ["some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one", +"someone" + ], + "provider": { + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "metadata": null + }, + "tool": { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": { + "sword_version": "2" } }, "deposit": { diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -5,12 +5,14 @@ import json import re +from typing import List import attr import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.loader.package.deposit.loader import DepositLoader +from swh.loader.package.loader import now from swh.loader.package.tests.common import check_metadata_paths from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex @@ -199,21 +201,25 @@ MetadataTargetType.ORIGIN, url, authority ) assert orig_meta.next_page_token is None - assert len(orig_meta.results) == 1 + raw_meta = loader.client.metadata_get(deposit_id) + all_raw_metadata: List[str] = raw_meta["raw_metadata"] + # 2 raw metadata xml + 1 json dict + assert len(orig_meta.results) == len(all_raw_metadata) + 1 orig_meta0 = orig_meta.results[0] assert orig_meta0.authority == authority assert orig_meta0.fetcher == fetcher # Check revision metadata revision_swhid = SWHID(object_type="revision", object_id=revision_id) - rev_meta = loader.storage.raw_extrinsic_metadata_get( + actual_rev_meta = loader.storage.raw_extrinsic_metadata_get( MetadataTargetType.REVISION, revision_swhid, authority ) - assert rev_meta.next_page_token is None - assert len(rev_meta.results) == 1 - rev_meta0 = rev_meta.results[0] - assert rev_meta0.authority == authority - assert rev_meta0.fetcher == fetcher + assert actual_rev_meta.next_page_token is None + assert len(actual_rev_meta.results) == len(all_raw_metadata) + for rev_meta in actual_rev_meta.results: + assert rev_meta.authority == authority + assert rev_meta.fetcher == fetcher + assert rev_meta.metadata.decode() in all_raw_metadata # Retrieve the information for deposit status update query to the deposit urls = [ @@ -278,31 +284,26 @@ read_api = f"{DEPOSIT_URL}/{deposit_id}/meta/" + provider = { + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "metadata": None, + } + tool = { + "name": "swh-deposit", + "version": "0.0.1", + "configuration": {"sword_version": "2"}, + } assert revision.metadata == { "extrinsic": { "provider": read_api, "raw": { "origin": {"type": "deposit", "url": url,}, "origin_metadata": { - "metadata": { - "@xmlns": ["http://www.w3.org/2005/Atom"], - "author": ["some awesome author", "another one", "no one",], - "codemeta:dateCreated": "2017-10-07T15:17:08Z", - "codemeta:datePublished": "2017-10-08T15:00:00Z", - "external_identifier": "some-external-id", - "url": url, - }, - "provider": { - "metadata": None, - "provider_name": "hal", - "provider_type": "deposit_client", - "provider_url": "https://hal-test.archives-ouvertes.fr/", - }, - "tool": { - "configuration": {"sword_version": "2"}, - "name": "swh-deposit", - "version": "0.0.1", - }, + "metadata": raw_meta["raw_metadata"], + "provider": provider, + "tool": tool, }, }, "when": revision.metadata["extrinsic"]["when"], # dynamic @@ -328,54 +329,84 @@ ) # Check the origin metadata swh side - orig_meta = loader.storage.raw_extrinsic_metadata_get( + origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get( MetadataTargetType.ORIGIN, url, authority ) - assert orig_meta.next_page_token is None - assert len(orig_meta.results) == 1 - - orig_meta0 = orig_meta.results[0] + assert origin_extrinsic_metadata.next_page_token is None + all_raw_metadata: List[str] = raw_meta["raw_metadata"] + # 1 raw metadata xml + 1 json dict + assert len(origin_extrinsic_metadata.results) == len(all_raw_metadata) + 1 + + expected_metadata = [] + for idx, raw_meta in enumerate(all_raw_metadata): + origin_meta = origin_extrinsic_metadata.results[idx] + expected_metadata.append( + RawExtrinsicMetadata( + type=MetadataTargetType.ORIGIN, + id=url, + discovery_date=origin_meta.discovery_date, + metadata=raw_meta.encode(), + format="sword-v2-atom-codemeta-v2", + authority=authority, + fetcher=fetcher, + ) + ) - expected_metadata = RawExtrinsicMetadata( - type=MetadataTargetType.ORIGIN, - id=url, - discovery_date=orig_meta0.discovery_date, - metadata=json.dumps( - { - "@xmlns": ["http://www.w3.org/2005/Atom"], - "author": ["some awesome author", "another one", "no one"], - "codemeta:dateCreated": "2017-10-07T15:17:08Z", - "codemeta:datePublished": "2017-10-08T15:00:00Z", - "external_identifier": "some-external-id", - "url": "https://hal-test.archives-ouvertes.fr/some-external-id", - } - ).encode(), - format="sword-v2-atom-codemeta-v2-in-json", - authority=authority, - fetcher=fetcher, + origin_metadata = { + "metadata": all_raw_metadata, + "provider": provider, + "tool": tool, + } + expected_metadata.append( + RawExtrinsicMetadata( + type=MetadataTargetType.ORIGIN, + id=url, + discovery_date=origin_extrinsic_metadata.results[-1].discovery_date, + metadata=json.dumps(origin_metadata).encode(), + format="original-artifacts-json", + authority=authority, + fetcher=fetcher, + ) ) - assert orig_meta0 == expected_metadata + assert len(origin_extrinsic_metadata.results) == len(expected_metadata) + for orig_meta in origin_extrinsic_metadata.results: + assert orig_meta in expected_metadata # Check the revision metadata swh side revision_swhid = SWHID(object_type="revision", object_id=revision_id) - rev_meta = loader.storage.raw_extrinsic_metadata_get( + actual_revision_metadata = loader.storage.raw_extrinsic_metadata_get( MetadataTargetType.REVISION, revision_swhid, authority ) - assert rev_meta.next_page_token is None + assert actual_revision_metadata.next_page_token is None + assert len(actual_revision_metadata.results) == len(all_raw_metadata) - assert len(rev_meta.results) == 1 - - rev_meta0 = rev_meta.results[0] - - assert rev_meta0 == attr.evolve( - expected_metadata, + rev_metadata_template = RawExtrinsicMetadata( type=MetadataTargetType.REVISION, id=revision_swhid, + format="sword-v2-atom-codemeta-v2", + authority=authority, + fetcher=fetcher, origin=url, + # to satisfy the constructor + discovery_date=now(), + metadata=b"", ) + expected_revision_metadata = [] + for idx, raw_meta in enumerate(all_raw_metadata): + rev_metadata = actual_revision_metadata.results[idx] + expected_revision_metadata.append( + attr.evolve( + rev_metadata_template, + discovery_date=rev_metadata.discovery_date, + metadata=raw_meta.encode(), + ) + ) + + assert actual_revision_metadata.results == expected_revision_metadata + # Retrieve the information for deposit status update query to the deposit urls = [ m