diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index 60522743..31c9ca5d 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,222 +1,220 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from contextlib import contextmanager
import os
import shutil
import tempfile
from typing import Any, Dict, Optional, Tuple
from xml.etree import ElementTree
from rest_framework import status
from swh.core import tarball
from swh.deposit.utils import NAMESPACES, normalize_date
from swh.model.hashutil import hash_to_hex
from swh.model.model import MetadataAuthorityType
from swh.model.swhids import CoreSWHID
from . import APIPrivateView, DepositReadMixin
from ...config import ARCHIVE_TYPE, SWH_PERSON
from ...models import Deposit
-from ...utils import parse_xml
from ..common import APIGet
@contextmanager
def aggregate_tarballs(extraction_dir, archive_paths):
"""Aggregate multiple tarballs into one and returns this new archive's
path.
Args:
extraction_dir (path): Path to use for the tarballs computation
archive_paths ([str]): Deposit's archive paths
    Returns:
        The aggregated archive's path. The enclosing temporary directory is
        cleaned up when the context manager exits.
    """
# rebuild one zip archive from (possibly) multiple ones
os.makedirs(extraction_dir, 0o755, exist_ok=True)
dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir)
# root folder to build an aggregated tarball
aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate")
os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)
# uncompress in a temporary location all archives
for archive_path in archive_paths:
tarball.uncompress(archive_path, aggregated_tarball_rootdir)
# Aggregate into one big tarball the multiple smaller ones
temp_tarpath = shutil.make_archive(
aggregated_tarball_rootdir, "tar", aggregated_tarball_rootdir
)
# can already clean up temporary directory
shutil.rmtree(aggregated_tarball_rootdir)
try:
yield temp_tarpath
finally:
shutil.rmtree(dir_path)
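# Illustrative usage sketch, not part of the patch: ``aggregate_tarballs`` is a
# context manager, so a hypothetical caller would consume the aggregated tarball
# path inside a ``with`` block and rely on the ``finally`` clause above to clean
# up the temporary directory (``stream_tarball`` below is a made-up consumer):
#
#   with aggregate_tarballs("/tmp/swh-deposit", archive_paths) as tarpath:
#       stream_tarball(tarpath)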
class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin):
"""Dedicated class to read a deposit's raw archives content.
Only GET is supported.
"""
def __init__(self):
super().__init__()
self.extraction_dir = self.config["extraction_dir"]
if not os.path.exists(self.extraction_dir):
os.makedirs(self.extraction_dir)
def process_get(
self, request, collection_name: str, deposit: Deposit
) -> Tuple[int, Any, str]:
"""Build a unique tarball from the multiple received and stream that
content to the client.
Args:
request (Request):
collection_name: Collection owning the deposit
deposit: Deposit concerned by the reading
        Returns:
            Tuple of (status, content stream, content type)
"""
archive_paths = [
r.archive.path
for r in self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)
]
return (
status.HTTP_200_OK,
aggregate_tarballs(self.extraction_dir, archive_paths),
"swh/generator",
)
class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin):
"""Class in charge of aggregating metadata on a deposit.
"""
def _parse_dates(
self, deposit: Deposit, metadata: ElementTree.Element
) -> Tuple[dict, dict]:
"""Normalize the date to use as a tuple of author date, committer date
from the incoming metadata.
Returns:
Tuple of author date, committer date. Those dates are
swh normalized.
"""
commit_date_elt = metadata.find("codemeta:datePublished", namespaces=NAMESPACES)
author_date_elt = metadata.find("codemeta:dateCreated", namespaces=NAMESPACES)
author_date: Any
commit_date: Any
if author_date_elt is None and commit_date_elt is None:
author_date = commit_date = deposit.complete_date
elif commit_date_elt is None:
author_date = commit_date = author_date_elt.text # type: ignore
elif author_date_elt is None:
author_date = commit_date = commit_date_elt.text
else:
author_date = author_date_elt.text
commit_date = commit_date_elt.text
return (normalize_date(author_date), normalize_date(commit_date))
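    # Illustrative note, not part of the patch: ``normalize_date`` turns an ISO
    # 8601 string into the swh date dict consumed by the loader, e.g. (seconds
    # value taken from the tests below; the input string itself is an assumption):
    #
    #   normalize_date("2017-10-07T15:17:08Z")
    #   # -> {"timestamp": {"seconds": 1507389428, "microseconds": 0}, "offset": 0}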
def metadata_read(self, deposit: Deposit) -> Dict[str, Any]:
"""Read and aggregate multiple deposit information into one unified dictionary.
Args:
deposit: Deposit to retrieve information from
Returns:
Dictionary of deposit information read by the deposit loader, with the
following keys:
**origin** (Dict): Information about the origin
**metadata_raw** (str): List of raw metadata received for the
deposit
**metadata_dict** (Dict): Deposit aggregated metadata into one dict
**provider** (Dict): the metadata provider information about the
deposit client
**tool** (Dict): the deposit information
**deposit** (Dict): deposit information relevant to build the revision
(author_date, committer_date, etc...)
"""
raw_metadata = self._metadata_get(deposit)
author_date: Optional[dict]
commit_date: Optional[dict]
if raw_metadata:
metadata_tree = ElementTree.fromstring(raw_metadata)
author_date, commit_date = self._parse_dates(deposit, metadata_tree)
else:
author_date = commit_date = None
if deposit.parent:
parent_swhid = deposit.parent.swhid
assert parent_swhid is not None
swhid = CoreSWHID.from_string(parent_swhid)
parent_revision = hash_to_hex(swhid.object_id)
parents = [parent_revision]
else:
parents = []
        release_notes: Optional[str] = None
        if raw_metadata:
            release_notes_elements = metadata_tree.findall(
                "codemeta:releaseNotes", namespaces=NAMESPACES
            )
            if release_notes_elements:
                release_notes = "\n\n".join(
                    element.text
                    for element in release_notes_elements
                    if element.text
                )
return {
"origin": {"type": "deposit", "url": deposit.origin_url},
"provider": {
"provider_name": deposit.client.last_name,
"provider_url": deposit.client.provider_url,
"provider_type": MetadataAuthorityType.DEPOSIT_CLIENT.value,
"metadata": {},
},
"tool": self.tool,
"metadata_raw": raw_metadata,
- "metadata_dict": parse_xml(raw_metadata),
"deposit": {
"id": deposit.id,
"client": deposit.client.username,
"collection": deposit.collection.name,
"author": SWH_PERSON,
"author_date": author_date,
"committer": SWH_PERSON,
"committer_date": commit_date,
"revision_parents": parents,
"release_notes": release_notes,
},
}
def process_get(
self, request, collection_name: str, deposit: Deposit
) -> Tuple[int, Dict, str]:
data = self.metadata_read(deposit)
return status.HTTP_200_OK, data if data else {}, "application/json"
diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
index 42a9fc38..c2bd7bb8 100644
--- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
@@ -1,433 +1,426 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse_lazy as reverse
from rest_framework import status
from swh.deposit import __version__
from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA, SE_IRI, SWH_PERSON
from swh.deposit.models import Deposit
-from swh.deposit.parsers import parse_xml
PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc"
def private_get_raw_url_endpoints(collection, deposit):
"""There are 2 endpoints to check (one with collection, one without)"""
deposit_id = deposit if isinstance(deposit, int) else deposit.id
return [
reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]),
reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]),
]
def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata):
# update deposit's metadata
response = authenticated_client.post(
reverse(SE_IRI, args=[collection.name, deposit.id]),
content_type="application/atom+xml;type=entry",
data=metadata,
HTTP_SLUG=deposit.external_id,
HTTP_IN_PROGRESS=True,
)
assert response.status_code == status.HTTP_201_CREATED
return deposit
def test_read_metadata(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private metadata read api to existing deposit should return metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
deposit.save()
metadata_xml_raw = atom_dataset["entry-data2"]
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, metadata_xml_raw,
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
"metadata_raw": metadata_xml_raw,
- "metadata_dict": parse_xml(metadata_xml_raw),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [],
"release_notes": "This is the release of October 7th, 2017.",
},
}
def test_read_metadata_revision_with_parent(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private read metadata to a deposit (with parent) returns metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
deposit.save()
metadata_xml_raw = atom_dataset["entry-data2"]
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, metadata_xml_raw,
)
rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa"
swhid = "swh:1:rev:%s" % rev_id
fake_parent = Deposit(
swhid=swhid, client=deposit.client, collection=deposit.collection
)
fake_parent.save()
deposit.parent = fake_parent
deposit.save()
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
"metadata_raw": metadata_xml_raw,
- "metadata_dict": parse_xml(metadata_xml_raw),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [rev_id],
"release_notes": "This is the release of October 7th, 2017.",
},
}
def test_read_metadata_3(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""date(Created|Published) provided, uses author/committer date
"""
deposit = partial_deposit
deposit.external_id = "hal-01243065"
deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
deposit.save()
metadata_xml_raw = atom_dataset["entry-data3"]
update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, metadata_xml_raw,
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
},
"metadata_raw": metadata_xml_raw,
- "metadata_dict": parse_xml(metadata_xml_raw),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1493820527},
},
"author_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
"release_notes": "This is the release of October 7th, 2017.",
},
}
def test_read_metadata_4(
authenticated_client, deposit_collection, atom_dataset, partial_deposit
):
"""dateCreated/datePublished not provided, revision uses complete_date
"""
deposit = partial_deposit
codemeta_entry_data = atom_dataset["metadata"] % ""
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
# will use the deposit completed date as fallback date
deposit.complete_date = "2016-04-06"
deposit.save()
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {"type": "deposit", "url": None,},
"metadata_raw": codemeta_entry_data,
- "metadata_dict": parse_xml(codemeta_entry_data),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1459900800},
},
"author_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1459900800},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
"release_notes": None,
},
}
def test_read_metadata_5(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""dateCreated/datePublished provided, revision uses author/committer
date
If multiple dateCreated provided, the first occurrence (of
dateCreated) is selected. If multiple datePublished provided,
the first occurrence (of datePublished) is selected.
"""
deposit = partial_deposit
# add metadata to the deposit with multiple datePublished/dateCreated
codemeta_entry_data = (
atom_dataset["metadata"]
% """
  <codemeta:dateCreated>2015-04-06T17:08:47+02:00</codemeta:dateCreated>
  <codemeta:datePublished>2017-05-03T16:08:47+02:00</codemeta:datePublished>
  <codemeta:dateCreated>2016-04-06T17:08:47+02:00</codemeta:dateCreated>
  <codemeta:datePublished>2018-05-03T16:08:47+02:00</codemeta:datePublished>
"""
)
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
},
"metadata_raw": codemeta_entry_data,
- "metadata_dict": parse_xml(codemeta_entry_data),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1493820527},
},
"author_date": {
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1428332927},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
"release_notes": None,
},
}
def test_access_to_nonexisting_deposit_returns_404_response(
authenticated_client, deposit_collection,
):
"""Read unknown collection should return a 404 response
"""
unknown_id = 999
try:
Deposit.objects.get(pk=unknown_id)
except Deposit.DoesNotExist:
assert True
for url in private_get_raw_url_endpoints(deposit_collection, unknown_id):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_404_NOT_FOUND
msg = "Deposit %s does not exist" % unknown_id
assert msg in response.content.decode("utf-8")
def test_read_metadata_multiple_release_notes(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private metadata read api to existing deposit should return metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.origin_url = f"https://hal-test.archives-ouvertes.fr/{deposit.external_id}"
deposit.save()
metadata_xml_raw = atom_dataset["entry-data-multiple-release-notes"]
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, metadata_xml_raw,
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["content-type"] == "application/json"
actual_data = response.json()
assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
"metadata_raw": metadata_xml_raw,
- "metadata_dict": parse_xml(metadata_xml_raw),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [],
"release_notes": (
"This is the release of October 7th, 2017.\n\n"
"It fixes some bugs."
),
},
}