diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
index aeb2823c..77d5198f 100644
--- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
@@ -1,471 +1,459 @@
# Copyright (C) 2017-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse_lazy as reverse
from rest_framework import status
from swh.deposit import __version__, utils
from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA, SE_IRI, SWH_PERSON
from swh.deposit.models import Deposit
from swh.deposit.parsers import parse_xml
PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc"
def private_get_raw_url_endpoints(collection, deposit):
"""There are 2 endpoints to check (one with collection, one without)"""
deposit_id = deposit if isinstance(deposit, int) else deposit.id
return [
reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]),
reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]),
]
def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata):
# update deposit's metadata
response = authenticated_client.post(
reverse(SE_IRI, args=[collection.name, deposit.id]),
content_type="application/atom+xml;type=entry",
data=metadata,
HTTP_SLUG=deposit.external_id,
HTTP_IN_PROGRESS=True,
)
assert response.status_code == status.HTTP_201_CREATED
return deposit
def test_read_metadata(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private metadata read api to existing deposit should return metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
deposit.save()
metadata_xml_atoms = [
atom_dataset[atom_key] for atom_key in ["entry-data2", "entry-data3"]
]
metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
for atom_xml in metadata_xml_atoms:
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, atom_xml,
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
"metadata_raw": metadata_xml_atoms,
"metadata_dict": utils.merge(*metadata_xml_raws),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [],
"release_notes": "This is the release of October 7th, 2017.",
},
}
def test_read_metadata_revision_with_parent(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private read metadata to a deposit (with parent) returns metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
deposit.save()
metadata_xml_atoms = [
atom_dataset[atom_key] for atom_key in ["entry-data2", "entry-data3"]
]
metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
for atom_xml in metadata_xml_atoms:
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, atom_xml,
)
rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa"
swhid = "swh:1:rev:%s" % rev_id
fake_parent = Deposit(
swhid=swhid, client=deposit.client, collection=deposit.collection
)
fake_parent.save()
deposit.parent = fake_parent
deposit.save()
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
"metadata_raw": metadata_xml_atoms,
"metadata_dict": utils.merge(*metadata_xml_raws),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [rev_id],
"release_notes": "This is the release of October 7th, 2017.",
},
}
def test_read_metadata_3(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""date(Created|Published) provided, uses author/committer date
"""
deposit = partial_deposit
deposit.external_id = "hal-01243065"
deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
deposit.save()
# add metadata to the deposit with datePublished and dateCreated
codemeta_entry_data = (
atom_dataset["metadata"]
% """
    <codemeta:dateCreated>2015-04-06T17:08:47+02:00</codemeta:dateCreated>
    <codemeta:datePublished>2017-05-03T16:08:47+02:00</codemeta:datePublished>
"""
)
metadata_xml_atoms = [
atom_dataset["entry-data2"],
atom_dataset["entry-data3"],
codemeta_entry_data,
]
metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
for atom_xml in metadata_xml_atoms:
update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, atom_xml,
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
},
"metadata_raw": metadata_xml_atoms,
"metadata_dict": utils.merge(*metadata_xml_raws),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
- "negative_utc": False,
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1493820527},
},
"author_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
"release_notes": "This is the release of October 7th, 2017.",
},
}
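As a sanity check on the expected values above: the datePublished value 2017-05-03T16:08:47+02:00 is exactly what the committer_date asserts, converted to epoch seconds plus a minute offset. A standalone sketch using only the standard library (names here are illustrative, not part of the test suite):

    from datetime import datetime

    dt = datetime.fromisoformat("2017-05-03T16:08:47+02:00")
    assert int(dt.timestamp()) == 1493820527            # seconds since the Unix epoch
    assert dt.utcoffset().total_seconds() // 60 == 120  # offset east of UTC, in minutes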
def test_read_metadata_4(
authenticated_client, deposit_collection, atom_dataset, partial_deposit
):
"""dateCreated/datePublished not provided, revision uses complete_date
"""
deposit = partial_deposit
codemeta_entry_data = atom_dataset["metadata"] % ""
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
# will use the deposit completed date as fallback date
deposit.complete_date = "2016-04-06"
deposit.save()
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {"type": "deposit", "url": None,},
"metadata_raw": [codemeta_entry_data],
"metadata_dict": parse_xml(codemeta_entry_data),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1459900800},
},
"author_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1459900800},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
"release_notes": None,
},
}
def test_read_metadata_5(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""dateCreated/datePublished provided, revision uses author/committer
date
If multiple dateCreated provided, the first occurrence (of
dateCreated) is selected. If multiple datePublished provided,
the first occurrence (of datePublished) is selected.
"""
deposit = partial_deposit
# add metadata to the deposit with multiple datePublished/dateCreated
codemeta_entry_data = (
atom_dataset["metadata"]
% """
    <codemeta:dateCreated>2015-04-06T17:08:47+02:00</codemeta:dateCreated>
    <codemeta:datePublished>2017-05-03T16:08:47+02:00</codemeta:datePublished>
    <codemeta:dateCreated>2016-04-06T17:08:47+02:00</codemeta:dateCreated>
    <codemeta:datePublished>2018-05-03T16:08:47+02:00</codemeta:datePublished>
"""
)
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
},
"metadata_raw": [codemeta_entry_data],
"metadata_dict": parse_xml(codemeta_entry_data),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
- "negative_utc": False,
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1493820527},
},
"author_date": {
- "negative_utc": False,
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1428332927},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
"release_notes": None,
},
}
def test_access_to_nonexisting_deposit_returns_404_response(
authenticated_client, deposit_collection,
):
"""Read unknown collection should return a 404 response
"""
    unknown_id = 999
    # check that no deposit with this id exists
    assert not Deposit.objects.filter(pk=unknown_id).exists()
for url in private_get_raw_url_endpoints(deposit_collection, unknown_id):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_404_NOT_FOUND
msg = "Deposit %s does not exist" % unknown_id
assert msg in response.content.decode("utf-8")
def test_read_metadata_multiple_release_notes(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private metadata read api to existing deposit should return metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
deposit.save()
metadata_xml_atoms = [
atom_dataset[atom_key] for atom_key in ["entry-data-multiple-release-notes"]
]
metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
for atom_xml in metadata_xml_atoms:
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, atom_xml,
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
"metadata_raw": metadata_xml_atoms,
"metadata_dict": utils.merge(*metadata_xml_raws),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
- "negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [],
"release_notes": (
"This is the release of October 7th, 2017.\n\n"
"It fixes some bugs."
),
},
}
diff --git a/swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_meta b/swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_meta
index 0451c714..01419e3d 100644
--- a/swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_meta
+++ b/swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_meta
@@ -1,69 +1,67 @@
{
"branch_name": "master",
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id"
},
"origin_metadata": {
"metadata": {
"@xmlns": ["http://www.w3.org/2005/Atom"],
"author": [
"some awesome author",
"another one",
"no one"
],
"codemeta:dateCreated": "2017-10-07T15:17:08Z",
"external_identifier": "some-external-id",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id"
},
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/"
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": "0.0.1"
}
},
"revision": {
"author": {
"name": "Software Heritage",
"fullname": "Software Heritage",
"email": "robot@softwareheritage.org"
},
"committer": {
"name": "Software Heritage",
"fullname": "Software Heritage",
"email": "robot@softwareheritage.org"
},
"committer_date": {
- "negative_utc": "false",
"offset": 0,
"timestamp": {
"microseconds": 0,
"seconds": 1507389428
}
},
"date": {
- "negative_utc": "false",
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428}
},
"message": "test: Deposit 999 in collection test",
"metadata": {
"@xmlns": ["http://www.w3.org/2005/Atom"],
"author": ["some awesome author",
"another one",
"no one"],
"codemeta:dateCreated": "2017-10-07T15:17:08Z",
"external_identifier": "some-external-id",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id"
},
"synthetic": "true",
"type": "tar",
"parents": []
}
}
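With "negative_utc" removed, the date dicts in this fixture (and in the API responses tested above) reduce to a timestamp plus a minute offset. An illustrative sketch of the resulting shape:

    normalized_date = {
        "timestamp": {"seconds": 1507389428, "microseconds": 0},
        "offset": 0,  # minutes east of UTC
    }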
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
index a06e02ce..5402a5b7 100644
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -1,276 +1,273 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.deposit import utils
from swh.deposit.parsers import parse_xml
from swh.model.exceptions import ValidationError
from swh.model.swhids import CoreSWHID, QualifiedSWHID
@pytest.fixture
def xml_with_origin_reference():
xml_data = """
"""
return xml_data.strip()
def test_merge():
"""Calling utils.merge on dicts should merge without losing information
"""
d0 = {"author": "someone", "license": [["gpl2"]], "a": 1}
d1 = {
"author": ["author0", {"name": "author1"}],
"license": [["gpl3"]],
"b": {"1": "2"},
}
d2 = {"author": map(lambda x: x, ["else"]), "license": "mit", "b": {"2": "3",}}
d3 = {
"author": (v for v in ["no one"]),
}
actual_merge = utils.merge(d0, d1, d2, d3)
expected_merge = {
"a": 1,
"license": [["gpl2"], ["gpl3"], "mit"],
"author": ["someone", "author0", {"name": "author1"}, "else", "no one"],
"b": {"1": "2", "2": "3",},
}
assert actual_merge == expected_merge
def test_merge_2():
d0 = {"license": "gpl2", "runtime": {"os": "unix derivative"}}
d1 = {"license": "gpl3", "runtime": "GNU/Linux"}
expected = {
"license": ["gpl2", "gpl3"],
"runtime": [{"os": "unix derivative"}, "GNU/Linux"],
}
actual = utils.merge(d0, d1)
assert actual == expected
def test_merge_edge_cases():
input_dict = {
"license": ["gpl2", "gpl3"],
"runtime": [{"os": "unix derivative"}, "GNU/Linux"],
}
# against empty dict
actual = utils.merge(input_dict, {})
assert actual == input_dict
# against oneself
actual = utils.merge(input_dict, input_dict, input_dict)
assert actual == input_dict
def test_merge_one_dict():
"""Merge one dict should result in the same dict value
"""
input_and_expected = {"anything": "really"}
actual = utils.merge(input_and_expected)
assert actual == input_and_expected
def test_merge_raise():
"""Calling utils.merge with any no dict argument should raise
"""
d0 = {"author": "someone", "a": 1}
d1 = ["not a dict"]
with pytest.raises(ValueError):
utils.merge(d0, d1)
with pytest.raises(ValueError):
utils.merge(d1, d0)
with pytest.raises(ValueError):
utils.merge(d1)
assert utils.merge(d0) == d0
def test_normalize_date_0():
"""When date is a list, choose the first date and normalize it
"""
actual_date = utils.normalize_date(["2017-10-12", "date1"])
assert actual_date == {
"timestamp": {"microseconds": 0, "seconds": 1507766400},
- "negative_utc": False,
"offset": 0,
}
def test_normalize_date_1():
"""Providing a date in a reasonable format, everything is fine
"""
actual_date = utils.normalize_date("2018-06-11 17:02:02")
assert actual_date == {
"timestamp": {"microseconds": 0, "seconds": 1528736522},
- "negative_utc": False,
"offset": 0,
}
def test_normalize_date_doing_irrelevant_stuff():
"""Providing a date with only the year results in a reasonable date
"""
actual_date = utils.normalize_date("2017")
assert actual_date == {
"timestamp": {"seconds": 1483228800, "microseconds": 0},
"offset": 0,
- "negative_utc": False,
}
@pytest.mark.parametrize(
"swhid,expected_metadata_context",
[
("swh:1:cnt:51b5c8cc985d190b5a7ef4878128ebfdc2358f49", {"origin": None},),
(
"swh:1:snp:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=http://blah",
{"origin": "http://blah", "path": None},
),
(
"swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path",
{"origin": None, "path": b"/path"},
),
(
"swh:1:rev:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;visit=swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa
{
"origin": None,
"path": None,
"snapshot": CoreSWHID.from_string(
"swh:1:snp:41b5c8cc985d190b5a7ef4878128ebfdc2358f49"
),
},
),
(
"swh:1:rel:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49", # noqa
{
"origin": None,
"path": None,
"directory": CoreSWHID.from_string(
"swh:1:dir:41b5c8cc985d190b5a7ef4878128ebfdc2358f49"
),
},
),
],
)
def test_compute_metadata_context(swhid: str, expected_metadata_context):
assert expected_metadata_context == utils.compute_metadata_context(
QualifiedSWHID.from_string(swhid)
)
def test_parse_swh_reference_origin(xml_with_origin_reference):
url = "https://url"
xml_data = xml_with_origin_reference.format(url=url)
metadata = parse_xml(xml_data)
actual_origin = utils.parse_swh_reference(metadata)
assert actual_origin == url
@pytest.fixture
def xml_with_empty_reference():
xml_data = """
{swh_reference}
"""
return xml_data.strip()
@pytest.mark.parametrize(
"xml_ref",
[
"",
"",
"",
"""""",
],
)
def test_parse_swh_reference_empty(xml_with_empty_reference, xml_ref):
xml_body = xml_with_empty_reference.format(swh_reference=xml_ref)
metadata = utils.parse_xml(xml_body)
assert utils.parse_swh_reference(metadata) is None
@pytest.fixture
def xml_with_swhid(atom_dataset):
return atom_dataset["entry-data-with-swhid"]
@pytest.mark.parametrize(
"swhid",
[
"swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/", # noqa
"swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:dir:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
"swh:1:rev:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
"swh:1:rel:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:rel:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
"swh:1:snp:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;anchor=swh:1:snp:9c5de20cfb54682370a398fcc733e829903c8cba", # noqa
"swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49",
],
)
def test_parse_swh_reference_swhid(swhid, xml_with_swhid):
xml_data = xml_with_swhid.format(swhid=swhid)
metadata = utils.parse_xml(xml_data)
actual_swhid = utils.parse_swh_reference(metadata)
assert actual_swhid is not None
expected_swhid = QualifiedSWHID.from_string(swhid)
assert actual_swhid == expected_swhid
@pytest.mark.parametrize(
"invalid_swhid",
[
# incorrect length
"swh:1:cnt:31b5c8cc985d190b5a7ef4878128ebfdc235" # noqa
# visit qualifier should be a core SWHID with type,
"swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:rev:0175049fc45055a3824a1675ac06e3711619a55a", # noqa
# anchor qualifier should be a core SWHID with type one of
"swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;anchor=swh:1:cnt:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/", # noqa
"swh:1:rev:c4993c872593e960dc84e4430dbbfbc34fd706d0;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:snp:b5f505b005435fa5c4fa4c279792bd7b17167c04", # noqa
],
)
def test_parse_swh_reference_invalid_swhid(invalid_swhid, xml_with_swhid):
"""Unparsable swhid should raise
"""
xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid)
metadata = utils.parse_xml(xml_invalid_swhid)
with pytest.raises(ValidationError):
utils.parse_swh_reference(metadata)
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
index f56f6b2f..d1130075 100644
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -1,257 +1,254 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from types import GeneratorType
from typing import Any, Dict, Optional, Union
import iso8601
import xmltodict
from swh.model.exceptions import ValidationError
from swh.model.model import TimestampWithTimezone
from swh.model.swhids import ExtendedSWHID, ObjectType, QualifiedSWHID
logger = logging.getLogger(__name__)
def parse_xml(stream, encoding="utf-8"):
namespaces = {
"http://www.w3.org/2005/Atom": "atom",
"http://www.w3.org/2007/app": "app",
"http://purl.org/dc/terms/": "dc",
"https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta",
"http://purl.org/net/sword/terms/": "sword",
"https://www.softwareheritage.org/schema/2018/deposit": "swh",
}
data = xmltodict.parse(
stream,
encoding=encoding,
namespaces=namespaces,
process_namespaces=True,
dict_constructor=dict,
)
if "atom:entry" in data:
data = data["atom:entry"]
return data
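parse_xml collapses the namespace URIs above to short prefixes, so keys in the parsed dict look like "atom:title" or "codemeta:dateCreated", and a top-level atom:entry wrapper is unwrapped. A minimal sketch (illustrative input, not taken from the test data):

    data = parse_xml('<entry xmlns="http://www.w3.org/2005/Atom"><title>t</title></entry>')
    assert data == {"atom:title": "t"}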
def merge(*dicts):
"""Given an iterator of dicts, merge them losing no information.
Args:
*dicts: arguments are all supposed to be dict to merge into one
Returns:
dict merged without losing information
"""
def _extend(existing_val, value):
"""Given an existing value and a value (as potential lists), merge
them together without repetition.
"""
if isinstance(value, (list, map, GeneratorType)):
vals = value
else:
vals = [value]
for v in vals:
if v in existing_val:
continue
existing_val.append(v)
return existing_val
d = {}
for data in dicts:
if not isinstance(data, dict):
raise ValueError("dicts is supposed to be a variable arguments of dict")
for key, value in data.items():
existing_val = d.get(key)
if not existing_val:
d[key] = value
continue
if isinstance(existing_val, (list, map, GeneratorType)):
new_val = _extend(existing_val, value)
elif isinstance(existing_val, dict):
if isinstance(value, dict):
new_val = merge(existing_val, value)
else:
new_val = _extend([existing_val], value)
else:
new_val = _extend([existing_val], value)
d[key] = new_val
return d
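A minimal usage sketch of the merge semantics (values chosen for illustration): colliding scalar values are promoted to a list, while non-colliding keys pass through unchanged.

    merged = merge({"author": "a"}, {"author": "b", "license": "gpl3"})
    assert merged == {"author": ["a", "b"], "license": "gpl3"}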
def normalize_date(date):
"""Normalize date fields as expected by swh workers.
If date is a list, elect arbitrarily the first element of that
list
If date is (then) a string, parse it through
dateutil.parser.parse to extract a datetime.
Then normalize it through
:class:`swh.model.model.TimestampWithTimezone`
Returns
The swh date object
"""
if isinstance(date, list):
date = date[0]
if isinstance(date, str):
date = iso8601.parse_date(date)
- d = TimestampWithTimezone.from_dict(date).to_dict()
+ tstz = TimestampWithTimezone.from_dict(date)
- # Workaround while we migrate from storing offsets as (int, bool) to bytes.
- # When the migration is done, remove this pop().
- # offset_bytes will also need to be converted to a string (which is fine because
- # it is always a well-formed offset)
- d.pop("offset_bytes", None)
-
- return d
+ return {
+ "timestamp": tstz.timestamp.to_dict(),
+ "offset": tstz.offset,
+ }
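For illustration, the dict now returned by normalize_date (epoch value matching the test expectations above):

    assert normalize_date("2017-10-07T15:17:08Z") == {
        "timestamp": {"seconds": 1507389428, "microseconds": 0},
        "offset": 0,
    }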
def compute_metadata_context(swhid_reference: QualifiedSWHID) -> Dict[str, Any]:
"""Given a SWHID object, determine the context as a dict.
"""
metadata_context: Dict[str, Any] = {"origin": None}
if swhid_reference.qualifiers():
metadata_context = {
"origin": swhid_reference.origin,
"path": swhid_reference.path,
}
snapshot = swhid_reference.visit
if snapshot:
metadata_context["snapshot"] = snapshot
anchor = swhid_reference.anchor
if anchor:
metadata_context[anchor.object_type.name.lower()] = anchor
return metadata_context
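For instance, mirroring one of the parametrized cases in test_utils.py, a qualified SWHID carrying only a path qualifier yields:

    ctx = compute_metadata_context(
        QualifiedSWHID.from_string(
            "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path"
        )
    )
    assert ctx == {"origin": None, "path": b"/path"}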
ALLOWED_QUALIFIERS_NODE_TYPE = (
ObjectType.SNAPSHOT,
ObjectType.REVISION,
ObjectType.RELEASE,
ObjectType.DIRECTORY,
)
def parse_swh_reference(metadata: Dict,) -> Optional[Union[QualifiedSWHID, str]]:
"""Parse swh reference within the metadata dict (or origin) reference if found,
None otherwise.
.. code-block:: xml
or:
.. code-block:: xml
Args:
metadata: result of parsing an Atom document with :func:`parse_xml`
Raises:
ValidationError in case the swhid referenced (if any) is invalid
Returns:
Either swhid or origin reference if any. None otherwise.
""" # noqa
swh_deposit = metadata.get("swh:deposit")
if not swh_deposit:
return None
swh_reference = swh_deposit.get("swh:reference")
if not swh_reference:
return None
swh_origin = swh_reference.get("swh:origin")
if swh_origin:
url = swh_origin.get("@url")
if url:
return url
swh_object = swh_reference.get("swh:object")
if not swh_object:
return None
swhid = swh_object.get("@swhid")
if not swhid:
return None
swhid_reference = QualifiedSWHID.from_string(swhid)
if swhid_reference.qualifiers():
anchor = swhid_reference.anchor
if anchor:
if anchor.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE:
error_msg = (
"anchor qualifier should be a core SWHID with type one of "
f"{', '.join(t.name.lower() for t in ALLOWED_QUALIFIERS_NODE_TYPE)}"
)
raise ValidationError(error_msg)
visit = swhid_reference.visit
if visit:
if visit.object_type != ObjectType.SNAPSHOT:
raise ValidationError(
f"visit qualifier should be a core SWHID with type snp, "
f"not {visit.object_type.value}"
)
if (
visit
and anchor
and visit.object_type == ObjectType.SNAPSHOT
and anchor.object_type == ObjectType.SNAPSHOT
):
            logger.warning(
"SWHID use of both anchor and visit targeting "
f"a snapshot: {swhid_reference}"
)
raise ValidationError(
"'anchor=swh:1:snp:' is not supported when 'visit' is also provided."
)
return swhid_reference
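Typical call sequence, as exercised by the tests (xml_document stands in for any Atom entry carrying a swh:reference):

    metadata = parse_xml(xml_document)
    reference = parse_swh_reference(metadata)  # QualifiedSWHID, origin URL str, or None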
def extended_swhid_from_qualified(swhid: QualifiedSWHID) -> ExtendedSWHID:
"""Used to get the target of a metadata object from a ,
as the latter uses a QualifiedSWHID."""
return ExtendedSWHID.from_string(str(swhid).split(";")[0])
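Concretely, the qualifiers are dropped and only the core part is kept:

    q = QualifiedSWHID.from_string(
        "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49;path=/path"
    )
    assert str(extended_swhid_from_qualified(q)) == (
        "swh:1:dir:51b5c8cc985d190b5a7ef4878128ebfdc2358f49"
    )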
def to_header_link(link: str, link_name: str) -> str:
"""Build a single header link.
>>> link_next = to_header_link("next-url", "next")
>>> link_next
'; rel="next"'
>>> ','.join([link_next, to_header_link("prev-url", "prev")])
'; rel="next",; rel="prev"'
"""
return f'<{link}>; rel="{link_name}"'