Page MenuHomeSoftware Heritage

D4233.id14941.diff
No OneTemporary

D4233.id14941.diff

diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
--- a/swh/storage/migrate_extrinsic_metadata.py
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -426,13 +426,20 @@
assert metadata["@xmlns"] == ATOM_NS
assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
format = NEW_DEPOSIT_FORMAT
- else:
- assert "{http://www.w3.org/2005/Atom}id" in metadata
+ elif "{http://www.w3.org/2005/Atom}id" in metadata:
assert (
"{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata
or "{http://www.w3.org/2005/Atom}author" in metadata
)
format = OLD_DEPOSIT_FORMAT
+ else:
+ # new format introduced in
+ # https://forge.softwareheritage.org/D4065
+ # it's the same as the first case, but with the @xmlns
+ # declarations stripped
+ assert "id" in metadata
+ assert "codemeta:author" in metadata
+ format = NEW_DEPOSIT_FORMAT
metadata_entries.append((date, format, metadata))
if discovery_date is None:
@@ -722,18 +729,28 @@
actual_metadata = metadata["extrinsic"]["raw"]["origin_metadata"][
"metadata"
]
+ if isinstance(actual_metadata, str):
+ # new format (metadata is a JSON-encoded string) introduced in
+ # https://forge.softwareheritage.org/D4105
+ actual_metadata = json.loads(actual_metadata)
if "@xmlns" in actual_metadata:
assert actual_metadata["@xmlns"] == ATOM_NS
assert actual_metadata["@xmlns:codemeta"] in (
CODEMETA_NS,
[CODEMETA_NS],
)
- else:
- assert "{http://www.w3.org/2005/Atom}id" in actual_metadata
+ elif "{http://www.w3.org/2005/Atom}id" in actual_metadata:
assert (
"{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author"
in actual_metadata
)
+ else:
+ # new format introduced in
+ # https://forge.softwareheritage.org/D4065
+ # it's the same as the first case, but with the @xmlns
+ # declarations stripped
+ assert "id" in actual_metadata
+ assert "codemeta:author" in actual_metadata
(origin, discovery_date) = handle_deposit_row(
row, discovery_date, origin, storage, deposit_cur, dry_run
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
@@ -543,6 +543,167 @@
]
+def test_deposit_2_with_json_in_json_and_no_xmlns():
+ """New formats introduced in https://forge.softwareheritage.org/D4105 ,
+ where the raw metadata is itself JSON-encoded inside the metadata JSON tree,
+ and in https://forge.softwareheritage.org/D4065 , where the @xmlns declarations
+ are stripped before being sent to the deposit DB."""
+ extrinsic_metadata = {
+ "id": "hal-02960679",
+ "author": {"name": "HAL", "email": "hal@ccsd.cnrs.fr"},
+ "client": "hal",
+ "codemeta:url": "https://hal.archives-ouvertes.fr/hal-02960679",
+ "codemeta:name": "Compressive Spectral Clustering Toolbox",
+ "codemeta:author": [
+ {"codemeta:name": "Nicolas Tremblay", "codemeta:affiliation": "PANAMA"},
+ {"codemeta:name": "Gilles Puy", "codemeta:affiliation": "PANAMA"},
+ {"codemeta:name": "R{\\'e}mi Gribonval", "codemeta:affiliation": "PANAMA"},
+ {"codemeta:name": "Pierre Vandergheynst"},
+ ],
+ # ...
+ }
+
+ original_artifacts = [
+ {
+ "url": "https://deposit.softwareheritage.org/1/private/1037/raw/",
+ "length": 4546913,
+ "filename": "archive.zip",
+ "checksums": {
+ "sha1": "01a0069c626a383de9a17ace40ecfd588e5c4f26",
+ "sha256": "c780a6de91286c70ceecc69fe0c6d201d3fe944aa89e193f3a89ae85dc25c3b1",
+ },
+ }
+ ]
+
+ row = {
+ "id": b"J\x9dc{\xa5\x07\xa2\xb93e%\x04(\xe6\xe3\xf0!\xf1\x94\xd0",
+ "date": datetime.datetime(2016, 1, 29, 0, 0, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2020, 10, 8, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"hal: Deposit 1037 in collection hal",
+ "metadata": {
+ "extrinsic": {
+ "raw": {
+ "origin": {
+ "url": "https://hal.archives-ouvertes.fr/hal-02960679",
+ "type": "deposit",
+ },
+ "origin_metadata": {
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.2.0",
+ "configuration": {"sword_version": "2"},
+ },
+ "metadata": json.dumps(extrinsic_metadata),
+ "provider": {
+ "metadata": {},
+ "provider_url": "https://hal.archives-ouvertes.fr/",
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ },
+ },
+ },
+ "when": "2020-10-09T13:38:25.888646+00:00",
+ "provider": "https://deposit.softwareheritage.org/1/private/1037/meta/",
+ },
+ "original_artifact": original_artifacts,
+ },
+ }
+
+ swhid = (
+ "swh:1:dir:8bfdf74037ae1c51335995891c6226e0f85e46e2"
+ ";origin=https://hal.archives-ouvertes.fr/hal-02960679"
+ ";visit=swh:1:snp:bc4a2ddf84dd0cc13d74e1970a1471c2574ed6aa"
+ ";anchor=swh:1:rev:4a9d637ba507a2b93365250428e6e3f021f194d0"
+ ";path=/"
+ )
+ deposit_rows = [
+ {
+ "deposit.id": 1037,
+ "deposit.external_id": "hal-02960679",
+ "deposit.swhid_context": swhid,
+ "deposit.status": "done",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2020, 10, 9, 13, 38, 8, 269611, tzinfo=datetime.timezone.utc,
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ {
+ "deposit.id": 1037,
+ "deposit.external_id": "hal-02960679",
+ "deposit.swhid_context": swhid,
+ "deposit.status": "done",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2020, 10, 9, 13, 38, 7, 394544, tzinfo=datetime.timezone.utc,
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ ]
+
+ origin_url = "https://hal.archives-ouvertes.fr/hal-02960679"
+
+ storage = Mock()
+
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:4a9d637ba507a2b93365250428e6e3f021f194d0"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 10, 9, 13, 38, 7, 394544, tzinfo=datetime.timezone.utc
+ ),
+ authority=HAL_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:4a9d637ba507a2b93365250428e6e3f021f194d0"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 10, 9, 13, 38, 25, 888646, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
def test_deposit_3_and_wrong_external_id_in_metadata():
extrinsic_metadata = {
"title": "VTune Perf tool",

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:30 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226190

Event Timeline