Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345757
D4233.id14941.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
D4233.id14941.diff
View Options
diff --git a/swh/storage/migrate_extrinsic_metadata.py b/swh/storage/migrate_extrinsic_metadata.py
--- a/swh/storage/migrate_extrinsic_metadata.py
+++ b/swh/storage/migrate_extrinsic_metadata.py
@@ -426,13 +426,20 @@
assert metadata["@xmlns"] == ATOM_NS
assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
format = NEW_DEPOSIT_FORMAT
- else:
- assert "{http://www.w3.org/2005/Atom}id" in metadata
+ elif "{http://www.w3.org/2005/Atom}id" in metadata:
assert (
"{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author" in metadata
or "{http://www.w3.org/2005/Atom}author" in metadata
)
format = OLD_DEPOSIT_FORMAT
+ else:
+ # new format introduced in
+ # https://forge.softwareheritage.org/D4065
+ # it's the same as the first case, but with the @xmlns
+ # declarations stripped
+ assert "id" in metadata
+ assert "codemeta:author" in metadata
+ format = NEW_DEPOSIT_FORMAT
metadata_entries.append((date, format, metadata))
if discovery_date is None:
@@ -722,18 +729,28 @@
actual_metadata = metadata["extrinsic"]["raw"]["origin_metadata"][
"metadata"
]
+ if isinstance(actual_metadata, str):
+ # new format introduced in
+ # https://forge.softwareheritage.org/D4105
+ actual_metadata = json.loads(actual_metadata)
if "@xmlns" in actual_metadata:
assert actual_metadata["@xmlns"] == ATOM_NS
assert actual_metadata["@xmlns:codemeta"] in (
CODEMETA_NS,
[CODEMETA_NS],
)
- else:
- assert "{http://www.w3.org/2005/Atom}id" in actual_metadata
+ elif "{http://www.w3.org/2005/Atom}id" in actual_metadata:
assert (
"{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author"
in actual_metadata
)
+ else:
+ # new format introduced in
+ # https://forge.softwareheritage.org/D4065
+ # it's the same as the first case, but with the @xmlns
+ # declarations stripped
+ assert "id" in actual_metadata
+ assert "codemeta:author" in actual_metadata
(origin, discovery_date) = handle_deposit_row(
row, discovery_date, origin, storage, deposit_cur, dry_run
diff --git a/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
--- a/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
+++ b/swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py
@@ -543,6 +543,167 @@
]
+def test_deposit_2_with_json_in_json_and_no_xmlns():
+ """New formats introduced in https://forge.softwareheritage.org/D4105 ,
+ where the raw metadata is itself JSON-encoded inside the metadata JSON tree
+ and https://forge.softwareheritage.org/D4065 where the @xmlns declarations
+ are stripped before being sent to the deposit DB"""
+ extrinsic_metadata = {
+ "id": "hal-02960679",
+ "author": {"name": "HAL", "email": "hal@ccsd.cnrs.fr"},
+ "client": "hal",
+ "codemeta:url": "https://hal.archives-ouvertes.fr/hal-02960679",
+ "codemeta:name": "Compressive Spectral Clustering Toolbox",
+ "codemeta:author": [
+ {"codemeta:name": "Nicolas Tremblay", "codemeta:affiliation": "PANAMA"},
+ {"codemeta:name": "Gilles Puy", "codemeta:affiliation": "PANAMA"},
+ {"codemeta:name": "R{\\'e}mi Gribonval", "codemeta:affiliation": "PANAMA"},
+ {"codemeta:name": "Pierre Vandergheynst"},
+ ],
+ # ...
+ }
+
+ original_artifacts = [
+ {
+ "url": "https://deposit.softwareheritage.org/1/private/1037/raw/",
+ "length": 4546913,
+ "filename": "archive.zip",
+ "checksums": {
+ "sha1": "01a0069c626a383de9a17ace40ecfd588e5c4f26",
+ "sha256": "c780a6de91286c70ceecc69fe0c6d201d3fe944aa89e193f3a89ae85dc25c3b1",
+ },
+ }
+ ]
+
+ row = {
+ "id": b"J\x9dc{\xa5\x07\xa2\xb93e%\x04(\xe6\xe3\xf0!\xf1\x94\xd0",
+ "date": datetime.datetime(2016, 1, 29, 0, 0, tzinfo=datetime.timezone.utc),
+ "committer_date": datetime.datetime(
+ 2020, 10, 8, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ "type": "tar",
+ "message": b"hal: Deposit 1037 in collection hal",
+ "metadata": {
+ "extrinsic": {
+ "raw": {
+ "origin": {
+ "url": "https://hal.archives-ouvertes.fr/hal-02960679",
+ "type": "deposit",
+ },
+ "origin_metadata": {
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.2.0",
+ "configuration": {"sword_version": "2"},
+ },
+ "metadata": json.dumps(extrinsic_metadata),
+ "provider": {
+ "metadata": {},
+ "provider_url": "https://hal.archives-ouvertes.fr/",
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ },
+ },
+ },
+ "when": "2020-10-09T13:38:25.888646+00:00",
+ "provider": "https://deposit.softwareheritage.org/1/private/1037/meta/",
+ },
+ "original_artifact": original_artifacts,
+ },
+ }
+
+ swhid = (
+ "swh:1:dir:8bfdf74037ae1c51335995891c6226e0f85e46e2"
+ ";origin=https://hal.archives-ouvertes.fr/hal-02960679"
+ ";visit=swh:1:snp:bc4a2ddf84dd0cc13d74e1970a1471c2574ed6aa"
+ ";anchor=swh:1:rev:4a9d637ba507a2b93365250428e6e3f021f194d0"
+ ";path=/"
+ )
+ deposit_rows = [
+ {
+ "deposit.id": 1037,
+ "deposit.external_id": "hal-02960679",
+ "deposit.swhid_context": swhid,
+ "deposit.status": "done",
+ "deposit_request.metadata": None,
+ "deposit_request.date": datetime.datetime(
+ 2020, 10, 9, 13, 38, 8, 269611, tzinfo=datetime.timezone.utc,
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ {
+ "deposit.id": 1037,
+ "deposit.external_id": "hal-02960679",
+ "deposit.swhid_context": swhid,
+ "deposit.status": "done",
+ "deposit_request.metadata": extrinsic_metadata,
+ "deposit_request.date": datetime.datetime(
+ 2020, 10, 9, 13, 38, 7, 394544, tzinfo=datetime.timezone.utc,
+ ),
+ "deposit_client.provider_url": "https://hal.archives-ouvertes.fr/",
+ "deposit_collection.name": "hal",
+ "auth_user.username": "hal",
+ },
+ ]
+
+ origin_url = "https://hal.archives-ouvertes.fr/hal-02960679"
+
+ storage = Mock()
+
+ def origin_get(urls):
+ assert urls == [origin_url]
+ return [Origin(url=origin_url)]
+
+ storage.origin_get.side_effect = origin_get
+ deposit_cur = get_mock_deposit_cur(deposit_rows)
+ handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)
+
+ deposit_cur.execute.assert_called_once()
+ deposit_cur.__iter__.assert_called_once()
+
+ assert storage.method_calls == [
+ call.origin_get([origin_url]),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:4a9d637ba507a2b93365250428e6e3f021f194d0"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 10, 9, 13, 38, 7, 394544, tzinfo=datetime.timezone.utc
+ ),
+ authority=HAL_AUTHORITY,
+ fetcher=FETCHER,
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(extrinsic_metadata).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ call.raw_extrinsic_metadata_add(
+ [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=parse_swhid(
+ "swh:1:rev:4a9d637ba507a2b93365250428e6e3f021f194d0"
+ ),
+ discovery_date=datetime.datetime(
+ 2020, 10, 9, 13, 38, 25, 888646, tzinfo=datetime.timezone.utc
+ ),
+ authority=SWH_AUTHORITY,
+ fetcher=FETCHER,
+ format="original-artifacts-json",
+ metadata=json.dumps(original_artifacts).encode(),
+ origin=origin_url,
+ ),
+ ]
+ ),
+ ]
+
+
def test_deposit_3_and_wrong_external_id_in_metadata():
extrinsic_metadata = {
"title": "VTune Perf tool",
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 3:30 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226190
Attached To
D4233: migrate_extrinsic_metadata: add support for the new deposit metadata formats introduced in late September.
Event Timeline
Log In to Comment