diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import datetime
+from datetime import timezone
import json
import logging
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union
@@ -33,6 +34,10 @@
logger = logging.getLogger(__name__)
+def now() -> datetime.datetime:
+ return datetime.datetime.now(tz=timezone.utc)
+
+
@attr.s
class DepositPackageInfo(BasePackageInfo):
filename = attr.ib(type=str) # instead of Optional[str]
@@ -62,13 +67,16 @@
# which computes itself the values. The loader needs to use those to create the
# revision.
- raw_metadata_from_origin = json.dumps(
- metadata["origin_metadata"]["metadata"]
- ).encode()
- metadata = metadata.copy()
- # FIXME: this removes information from 'raw' metadata
- depo = metadata.pop("deposit")
-
+ all_raw_metadata: List[str] = metadata["raw_metadata"]
+ raw_info = {
+ "origin": metadata["origin"],
+ "origin_metadata": {
+ "metadata": all_raw_metadata,
+ "provider": metadata["provider"],
+ "tool": metadata["tool"],
+ },
+ }
+ depo = metadata["deposit"]
return cls(
url=url,
filename=filename,
@@ -80,12 +88,14 @@
author=parse_author(depo["author"]),
committer=parse_author(depo["committer"]),
revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]),
- raw_info=metadata,
+ raw_info=raw_info,
revision_extrinsic_metadata=[
RawExtrinsicMetadataCore(
- format="sword-v2-atom-codemeta-v2-in-json",
- metadata=raw_metadata_from_origin,
- ),
+ discovery_date=now(),
+ metadata=raw_metadata.encode(),
+ format="sword-v2-atom-codemeta-v2",
+ )
+ for raw_metadata in all_raw_metadata
],
)
@@ -117,8 +127,8 @@
return ["HEAD"]
def get_metadata_authority(self) -> MetadataAuthority:
- provider = self.metadata()["origin_metadata"]["provider"]
- assert provider["provider_type"] == "deposit_client"
+ provider = self.metadata()["provider"]
+ assert provider["provider_type"] == MetadataAuthorityType.DEPOSIT_CLIENT.value
return MetadataAuthority(
type=MetadataAuthorityType.DEPOSIT_CLIENT,
url=provider["provider_url"],
@@ -129,7 +139,7 @@
)
def get_metadata_fetcher(self) -> MetadataFetcher:
- tool = self.metadata()["origin_metadata"]["tool"]
+ tool = self.metadata()["tool"]
return MetadataFetcher(
name=tool["name"], version=tool["version"], metadata=tool["configuration"],
)
@@ -177,11 +187,27 @@
)
def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
- origin_metadata = self.metadata()["origin_metadata"]
+ metadata = self.metadata()
+ all_raw_metadata: List[str] = metadata["raw_metadata"]
+ origin_metadata = json.dumps(
+ {
+ "metadata": all_raw_metadata,
+ "provider": metadata["provider"],
+ "tool": metadata["tool"],
+ }
+ ).encode()
return [
RawExtrinsicMetadataCore(
- format="sword-v2-atom-codemeta-v2-in-json",
- metadata=json.dumps(origin_metadata["metadata"]).encode(),
+ discovery_date=now(),
+ metadata=raw_meta.encode(),
+ format="sword-v2-atom-codemeta-v2",
+ )
+ for raw_meta in all_raw_metadata
+ ] + [
+ RawExtrinsicMetadataCore(
+ discovery_date=now(),
+ metadata=origin_metadata,
+ format="original-artifacts-json",
)
]
diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json
--- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json
+++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json
@@ -3,32 +3,18 @@
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
"type": "deposit"
},
- "origin_metadata": {
- "metadata": {
- "@xmlns": [
- "http://www.w3.org/2005/Atom"
- ],
- "author": [
- "some awesome author",
- "another one",
- "no one"
- ],
- "codemeta:dateCreated": "2017-10-07T15:17:08Z",
- "external_identifier": "some-external-id",
- "url": "https://hal-test.archives-ouvertes.fr/some-external-id"
- },
- "provider": {
- "provider_name": "hal",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- "metadata": null
- },
- "tool": {
- "name": "swh-deposit",
- "version": "0.0.1",
- "configuration": {
- "sword_version": "2"
- }
+ "raw_metadata" : ["some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother one"],
+ "provider": {
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ "metadata": null
+ },
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {
+ "sword_version": "2"
}
},
"deposit": {
diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json
--- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json
+++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json
@@ -3,33 +3,20 @@
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
"type": "deposit"
},
- "origin_metadata": {
- "metadata": {
- "@xmlns": [
- "http://www.w3.org/2005/Atom"
- ],
- "author": [
- "some awesome author",
- "another one",
- "no one"
- ],
- "codemeta:dateCreated": "2017-10-07T15:17:08Z",
- "codemeta:datePublished": "2017-10-08T15:00:00Z",
- "external_identifier": "some-external-id",
- "url": "https://hal-test.archives-ouvertes.fr/some-external-id"
- },
- "provider": {
- "provider_name": "hal",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- "metadata": null
- },
- "tool": {
- "name": "swh-deposit",
- "version": "0.0.1",
- "configuration": {
- "sword_version": "2"
- }
+ "raw_metadata": ["some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one",
+"someone"
+ ],
+ "provider": {
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ "metadata": null
+ },
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {
+ "sword_version": "2"
}
},
"deposit": {
diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py
--- a/swh/loader/package/deposit/tests/test_deposit.py
+++ b/swh/loader/package/deposit/tests/test_deposit.py
@@ -5,12 +5,14 @@
import json
import re
+from typing import List
import attr
import pytest
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.loader.package.deposit.loader import DepositLoader
+from swh.loader.package.loader import now
from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes, hash_to_hex
@@ -199,21 +201,25 @@
MetadataTargetType.ORIGIN, url, authority
)
assert orig_meta.next_page_token is None
- assert len(orig_meta.results) == 1
+ raw_meta = loader.client.metadata_get(deposit_id)
+ all_raw_metadata: List[str] = raw_meta["raw_metadata"]
+ # one entry per raw metadata xml document + 1 json dict
+ assert len(orig_meta.results) == len(all_raw_metadata) + 1
orig_meta0 = orig_meta.results[0]
assert orig_meta0.authority == authority
assert orig_meta0.fetcher == fetcher
# Check revision metadata
revision_swhid = SWHID(object_type="revision", object_id=revision_id)
- rev_meta = loader.storage.raw_extrinsic_metadata_get(
+ actual_rev_meta = loader.storage.raw_extrinsic_metadata_get(
MetadataTargetType.REVISION, revision_swhid, authority
)
- assert rev_meta.next_page_token is None
- assert len(rev_meta.results) == 1
- rev_meta0 = rev_meta.results[0]
- assert rev_meta0.authority == authority
- assert rev_meta0.fetcher == fetcher
+ assert actual_rev_meta.next_page_token is None
+ assert len(actual_rev_meta.results) == len(all_raw_metadata)
+ for rev_meta in actual_rev_meta.results:
+ assert rev_meta.authority == authority
+ assert rev_meta.fetcher == fetcher
+ assert rev_meta.metadata.decode() in all_raw_metadata
# Retrieve the information for deposit status update query to the deposit
urls = [
@@ -278,31 +284,26 @@
read_api = f"{DEPOSIT_URL}/{deposit_id}/meta/"
+ provider = {
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ "metadata": None,
+ }
+ tool = {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {"sword_version": "2"},
+ }
assert revision.metadata == {
"extrinsic": {
"provider": read_api,
"raw": {
"origin": {"type": "deposit", "url": url,},
"origin_metadata": {
- "metadata": {
- "@xmlns": ["http://www.w3.org/2005/Atom"],
- "author": ["some awesome author", "another one", "no one",],
- "codemeta:dateCreated": "2017-10-07T15:17:08Z",
- "codemeta:datePublished": "2017-10-08T15:00:00Z",
- "external_identifier": "some-external-id",
- "url": url,
- },
- "provider": {
- "metadata": None,
- "provider_name": "hal",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- },
- "tool": {
- "configuration": {"sword_version": "2"},
- "name": "swh-deposit",
- "version": "0.0.1",
- },
+ "metadata": raw_meta["raw_metadata"],
+ "provider": provider,
+ "tool": tool,
},
},
"when": revision.metadata["extrinsic"]["when"], # dynamic
@@ -328,54 +329,84 @@
)
# Check the origin metadata swh side
- orig_meta = loader.storage.raw_extrinsic_metadata_get(
+ origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get(
MetadataTargetType.ORIGIN, url, authority
)
- assert orig_meta.next_page_token is None
- assert len(orig_meta.results) == 1
-
- orig_meta0 = orig_meta.results[0]
+ assert origin_extrinsic_metadata.next_page_token is None
+ all_raw_metadata: List[str] = raw_meta["raw_metadata"]
+ # one entry per raw metadata xml document + 1 json dict
+ assert len(origin_extrinsic_metadata.results) == len(all_raw_metadata) + 1
+
+ expected_metadata = []
+ for idx, raw_meta in enumerate(all_raw_metadata):
+ origin_meta = origin_extrinsic_metadata.results[idx]
+ expected_metadata.append(
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.ORIGIN,
+ id=url,
+ discovery_date=origin_meta.discovery_date,
+ metadata=raw_meta.encode(),
+ format="sword-v2-atom-codemeta-v2",
+ authority=authority,
+ fetcher=fetcher,
+ )
+ )
- expected_metadata = RawExtrinsicMetadata(
- type=MetadataTargetType.ORIGIN,
- id=url,
- discovery_date=orig_meta0.discovery_date,
- metadata=json.dumps(
- {
- "@xmlns": ["http://www.w3.org/2005/Atom"],
- "author": ["some awesome author", "another one", "no one"],
- "codemeta:dateCreated": "2017-10-07T15:17:08Z",
- "codemeta:datePublished": "2017-10-08T15:00:00Z",
- "external_identifier": "some-external-id",
- "url": "https://hal-test.archives-ouvertes.fr/some-external-id",
- }
- ).encode(),
- format="sword-v2-atom-codemeta-v2-in-json",
- authority=authority,
- fetcher=fetcher,
+ origin_metadata = {
+ "metadata": all_raw_metadata,
+ "provider": provider,
+ "tool": tool,
+ }
+ expected_metadata.append(
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.ORIGIN,
+ id=url,
+ discovery_date=origin_extrinsic_metadata.results[-1].discovery_date,
+ metadata=json.dumps(origin_metadata).encode(),
+ format="original-artifacts-json",
+ authority=authority,
+ fetcher=fetcher,
+ )
)
- assert orig_meta0 == expected_metadata
+ assert len(origin_extrinsic_metadata.results) == len(expected_metadata)
+ for orig_meta in origin_extrinsic_metadata.results:
+ assert orig_meta in expected_metadata
# Check the revision metadata swh side
revision_swhid = SWHID(object_type="revision", object_id=revision_id)
- rev_meta = loader.storage.raw_extrinsic_metadata_get(
+ actual_revision_metadata = loader.storage.raw_extrinsic_metadata_get(
MetadataTargetType.REVISION, revision_swhid, authority
)
- assert rev_meta.next_page_token is None
+ assert actual_revision_metadata.next_page_token is None
+ assert len(actual_revision_metadata.results) == len(all_raw_metadata)
- assert len(rev_meta.results) == 1
-
- rev_meta0 = rev_meta.results[0]
-
- assert rev_meta0 == attr.evolve(
- expected_metadata,
+ rev_metadata_template = RawExtrinsicMetadata(
type=MetadataTargetType.REVISION,
id=revision_swhid,
+ format="sword-v2-atom-codemeta-v2",
+ authority=authority,
+ fetcher=fetcher,
origin=url,
+ # placeholders to satisfy the constructor; overridden per entry via attr.evolve below
+ discovery_date=now(),
+ metadata=b"",
)
+ expected_revision_metadata = []
+ for idx, raw_meta in enumerate(all_raw_metadata):
+ rev_metadata = actual_revision_metadata.results[idx]
+ expected_revision_metadata.append(
+ attr.evolve(
+ rev_metadata_template,
+ discovery_date=rev_metadata.discovery_date,
+ metadata=raw_meta.encode(),
+ )
+ )
+
+ assert actual_revision_metadata.results == expected_revision_metadata
+
# Retrieve the information for deposit status update query to the deposit
urls = [
m