diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index c783d5f5..ab9f6f28 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,207 +1,200 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from contextlib import contextmanager
import os
import shutil
import tempfile
from typing import Any, Dict, Tuple
from rest_framework import status
from swh.core import tarball
from swh.deposit.utils import normalize_date
from swh.model import identifiers
from swh.model.model import MetadataAuthorityType
from . import APIPrivateView, DepositReadMixin
from ...config import ARCHIVE_TYPE, SWH_PERSON
from ...models import Deposit
from ..common import APIGet
@contextmanager
def aggregate_tarballs(extraction_dir, archive_paths):
    """Aggregate multiple archives into a single zip and yield its path.

    Every archive is uncompressed into one shared directory which is then
    re-packed as a single zip archive. The temporary working directory that
    holds the zip is removed when the context exits, so the yielded path is
    only usable inside the ``with`` block.

    Args:
        extraction_dir (path): Directory under which the temporary working
            directory is created (it is created when missing)
        archive_paths ([str]): Deposit's archive paths

    Yields:
        Path to the aggregated zip archive

    """
    os.makedirs(extraction_dir, 0o755, exist_ok=True)
    dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir)
    # Everything past this point runs under the ``finally`` below so that the
    # working directory is removed even when extraction or archiving fails.
    try:
        # root folder to build an aggregated tarball
        aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate")
        os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)

        # uncompress in a temporary location all archives
        for archive_path in archive_paths:
            tarball.uncompress(archive_path, aggregated_tarball_rootdir)

        # Aggregate into one big tarball the multiple smaller ones
        temp_tarpath = shutil.make_archive(
            aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir
        )

        # the uncompressed tree is no longer needed once the zip exists
        shutil.rmtree(aggregated_tarball_rootdir)

        yield temp_tarpath
    finally:
        shutil.rmtree(dir_path)
class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin):
    """Dedicated class to read a deposit's raw archives content.

    Only GET is supported.

    """

    def __init__(self):
        super().__init__()
        self.extraction_dir = self.config["extraction_dir"]
        # exist_ok avoids the check-then-create race when several instances
        # are initialized at the same time on a fresh extraction directory.
        os.makedirs(self.extraction_dir, exist_ok=True)

    def process_get(
        self, request, collection_name: str, deposit_id: int
    ) -> Tuple[int, Any, str]:
        """Build a unique tarball from the multiple received and hand it back
        for delivery to the client.

        Args:
            request (Request):
            collection_name: Collection owning the deposit
            deposit_id: Deposit concerned by the reading

        Returns:
            Tuple status, context manager yielding the aggregated archive
            path, content-type

        """
        archive_paths = [
            r.archive.path
            for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE)
        ]
        return (
            status.HTTP_200_OK,
            aggregate_tarballs(self.extraction_dir, archive_paths),
            "swh/generator",
        )
class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin):
"""Class in charge of aggregating metadata on a deposit.
"""
def _normalize_dates(self, deposit, metadata):
    """Compute the normalized (author date, committer date) pair of a deposit.

    The author date is read from ``codemeta:dateCreated`` and the committer
    date from ``codemeta:datePublished``. When only one of the two is
    provided it is used for both; when none is provided, the deposit's
    ``complete_date`` is used for both.

    Args:
        deposit (Deposit): Deposit model representation
        metadata (Dict): Metadata dict representation

    Returns:
        Tuple of author date, committer date, both passed through
        ``normalize_date``.

    """
    published = metadata.get("codemeta:datePublished")
    created = metadata.get("codemeta:dateCreated")

    if not published and not created:
        # no date in the metadata: fall back on the deposit's completion date
        author_date = commit_date = deposit.complete_date
    else:
        # each date falls back on the other one when missing
        author_date = created or published
        commit_date = published or created

    return normalize_date(author_date), normalize_date(commit_date)
def metadata_read(self, deposit: Deposit) -> Dict[str, Any]:
"""Read and aggregate multiple deposit information into one unified dictionary.
Args:
- deposit: Deposit concerned by the data aggregation.
+ deposit: Deposit to retrieve information from
Returns:
Dictionary of deposit information read by the deposit loader, with the
following keys:
**origin** (Dict): Information about the origin
- **origin_metadata (Dict): Metadata about the origin to load
+ **metadata_raw** (List[str]): List of raw metadata received for the
+ deposit
- **metadata_raw** (List[str]): List of raw metadata received for the
- deposit
+ **metadata_dict** (Dict): Deposit aggregated metadata into one dict
- **metadata_dict** (Dict): Deposit aggregated metadata into one dict
+ **provider** (Dict): the metadata provider information about the
+ deposit client
- **provider** (Dict): the metadata provider information about the
- deposit client
-
- **tool** (Dict): the deposit information
+ **tool** (Dict): the deposit information
**deposit** (Dict): deposit information relevant to build the revision
(author_date, committer_date, etc...)
"""
metadata, raw_metadata = self._metadata_get(deposit)
- # Read information metadata
- data = {"origin": {"type": "deposit", "url": deposit.origin_url,}}
-
author_date, commit_date = self._normalize_dates(deposit, metadata)
if deposit.parent:
parent_swhid = deposit.parent.swhid
assert parent_swhid is not None
swhid = identifiers.parse_swhid(parent_swhid)
parent_revision = swhid.object_id
parents = [parent_revision]
else:
parents = []
- data["origin_metadata"] = {
- # metadata provider
+ return {
+ "origin": {"type": "deposit", "url": deposit.origin_url},
"provider": {
"provider_name": deposit.client.last_name,
"provider_url": deposit.client.provider_url,
"provider_type": MetadataAuthorityType.DEPOSIT_CLIENT.value,
"metadata": {},
},
"tool": self.tool,
"metadata_raw": raw_metadata,
"metadata_dict": metadata,
+ "deposit": {
+ "id": deposit.id,
+ "client": deposit.client.username,
+ "collection": deposit.collection.name,
+ "author": SWH_PERSON,
+ "author_date": author_date,
+ "committer": SWH_PERSON,
+ "committer_date": commit_date,
+ "revision_parents": parents,
+ },
}
- data["deposit"] = {
- "id": deposit.id,
- "client": deposit.client.username,
- "collection": deposit.collection.name,
- "author": SWH_PERSON,
- "author_date": author_date,
- "committer": SWH_PERSON,
- "committer_date": commit_date,
- "revision_parents": parents,
- }
-
- return data
def process_get(
    self, request, collection_name: str, deposit_id: int
) -> Tuple[int, Dict, str]:
    """Read the metadata of the deposit identified by ``deposit_id``.

    Args:
        request (Request):
        collection_name: Collection owning the deposit
        deposit_id: Primary key of the deposit to read

    Returns:
        Tuple status, deposit information dict (empty dict when nothing was
        read), content-type

    """
    deposit = Deposit.objects.get(pk=deposit_id)
    payload = self.metadata_read(deposit) or {}
    return status.HTTP_200_OK, payload, "application/json"
diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
index ba5d2a87..fad948b7 100644
--- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
@@ -1,424 +1,398 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse
from rest_framework import status
from swh.deposit import __version__, utils
from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON
from swh.deposit.models import Deposit
from swh.deposit.parsers import parse_xml
# Name of the private metadata url variant reversed with the deposit id only
# ("-nc" presumably stands for "no collection" -- TODO confirm against the urls).
PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc"
def private_get_raw_url_endpoints(collection, deposit):
    """Return the 2 private metadata urls to check for a deposit.

    ``deposit`` is either a deposit id (int) or an object exposing ``id``.
    The first url includes the collection name, the second one does not.

    """
    if isinstance(deposit, int):
        deposit_id = deposit
    else:
        deposit_id = deposit.id

    url_with_collection = reverse(
        PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]
    )
    url_without_collection = reverse(
        PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]
    )
    return [url_with_collection, url_without_collection]
def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata):
    """Post an atom metadata entry on the deposit's edit url.

    The request is sent with the deposit's external id as slug and keeps the
    deposit in progress. The response must be a 201; the deposit received as
    argument is returned as is.

    """
    edit_url = reverse(EDIT_SE_IRI, args=[collection.name, deposit.id])
    response = authenticated_client.post(
        edit_url,
        content_type="application/atom+xml;type=entry",
        data=metadata,
        HTTP_SLUG=deposit.external_id,
        HTTP_IN_PROGRESS=True,
    )
    assert response.status_code == status.HTTP_201_CREATED
    return deposit
def test_read_metadata(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private metadata read api to existing deposit should return metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.save()
metadata_xml_atoms = [
atom_dataset[atom_key] for atom_key in ["entry-data2", "entry-data3"]
]
metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
for atom_xml in metadata_xml_atoms:
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, atom_xml,
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
- data = response.json()
- assert data == {
+ actual_data = response.json()
+ assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
- "origin_metadata": {
- "metadata_raw": metadata_xml_atoms,
- "metadata_dict": utils.merge(*metadata_xml_raws),
- "provider": {
- "metadata": {},
- "provider_name": "",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- },
- "tool": {
- "configuration": {"sword_version": "2"},
- "name": "swh-deposit",
- "version": __version__,
- },
+ "metadata_raw": metadata_xml_atoms,
+ "metadata_dict": utils.merge(*metadata_xml_raws),
+ "provider": {
+ "metadata": {},
+ "provider_name": "",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ },
+ "tool": {
+ "configuration": {"sword_version": "2"},
+ "name": "swh-deposit",
+ "version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [],
},
}
def test_read_metadata_revision_with_parent(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private read metadata to a deposit (with parent) returns metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.save()
metadata_xml_atoms = [
atom_dataset[atom_key] for atom_key in ["entry-data2", "entry-data3"]
]
metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
for atom_xml in metadata_xml_atoms:
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, atom_xml,
)
rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa"
swhid = "swh:1:rev:%s" % rev_id
fake_parent = Deposit(
swhid=swhid, client=deposit.client, collection=deposit.collection
)
fake_parent.save()
deposit.parent = fake_parent
deposit.save()
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
- data = response.json()
- assert data == {
+ actual_data = response.json()
+ assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
- "origin_metadata": {
- "metadata_raw": metadata_xml_atoms,
- "metadata_dict": utils.merge(*metadata_xml_raws),
- "provider": {
- "metadata": {},
- "provider_name": "",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- },
- "tool": {
- "configuration": {"sword_version": "2"},
- "name": "swh-deposit",
- "version": __version__,
- },
+ "metadata_raw": metadata_xml_atoms,
+ "metadata_dict": utils.merge(*metadata_xml_raws),
+ "provider": {
+ "metadata": {},
+ "provider_name": "",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ },
+ "tool": {
+ "configuration": {"sword_version": "2"},
+ "name": "swh-deposit",
+ "version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [rev_id],
},
}
def test_read_metadata_3(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""date(Created|Published) provided, uses author/committer date
"""
deposit = partial_deposit
deposit.external_id = "hal-01243065"
deposit.save()
# add metadata to the deposit with datePublished and dateCreated
codemeta_entry_data = (
atom_dataset["metadata"]
% """
2015-04-06T17:08:47+02:00
2017-05-03T16:08:47+02:00
"""
)
metadata_xml_atoms = [
atom_dataset["entry-data2"],
atom_dataset["entry-data3"],
codemeta_entry_data,
]
metadata_xml_raws = [parse_xml(xml) for xml in metadata_xml_atoms]
for atom_xml in metadata_xml_atoms:
update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, atom_xml,
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
- data = response.json()
- assert data == {
+ actual_data = response.json()
+ assert actual_data == {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
},
- "origin_metadata": {
- "metadata_raw": metadata_xml_atoms,
- "metadata_dict": utils.merge(*metadata_xml_raws),
- "provider": {
- "metadata": {},
- "provider_name": "",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- },
- "tool": {
- "configuration": {"sword_version": "2"},
- "name": "swh-deposit",
- "version": __version__,
- },
+ "metadata_raw": metadata_xml_atoms,
+ "metadata_dict": utils.merge(*metadata_xml_raws),
+ "provider": {
+ "metadata": {},
+ "provider_name": "",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ },
+ "tool": {
+ "configuration": {"sword_version": "2"},
+ "name": "swh-deposit",
+ "version": __version__,
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"negative_utc": False,
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1493820527},
},
"author_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
},
}
def test_read_metadata_4(
authenticated_client, deposit_collection, atom_dataset, partial_deposit
):
"""dateCreated/datePublished not provided, revision uses complete_date
"""
deposit = partial_deposit
codemeta_entry_data = atom_dataset["metadata"] % ""
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
# will use the deposit completed date as fallback date
deposit.complete_date = "2016-04-06"
deposit.save()
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
- data = response.json()
-
- expected_origin = {
- "type": "deposit",
- "url": "https://hal-test.archives-ouvertes.fr/%s" % (deposit.external_id),
- }
+ actual_data = response.json()
- expected_origin_metadata = {
+ assert actual_data == {
+ "origin": {
+ "type": "deposit",
+ "url": "https://hal-test.archives-ouvertes.fr/external-id-partial",
+ },
"metadata_raw": [codemeta_entry_data],
"metadata_dict": parse_xml(codemeta_entry_data),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
- }
-
- expected_deposit_info = {
- "author": SWH_PERSON,
- "committer": SWH_PERSON,
- "committer_date": {
- "negative_utc": False,
- "offset": 0,
- "timestamp": {"microseconds": 0, "seconds": 1459900800},
- },
- "author_date": {
- "negative_utc": False,
- "offset": 0,
- "timestamp": {"microseconds": 0, "seconds": 1459900800},
+ "deposit": {
+ "author": SWH_PERSON,
+ "committer": SWH_PERSON,
+ "committer_date": {
+ "negative_utc": False,
+ "offset": 0,
+ "timestamp": {"microseconds": 0, "seconds": 1459900800},
+ },
+ "author_date": {
+ "negative_utc": False,
+ "offset": 0,
+ "timestamp": {"microseconds": 0, "seconds": 1459900800},
+ },
+ "client": deposit_collection.name,
+ "id": deposit.id,
+ "collection": deposit_collection.name,
+ "revision_parents": [],
},
- "client": deposit_collection.name,
- "id": deposit.id,
- "collection": deposit_collection.name,
- "revision_parents": [],
}
- expected_meta = {
- "origin": expected_origin,
- "origin_metadata": expected_origin_metadata,
- "deposit": expected_deposit_info,
- }
-
- assert data == expected_meta
-
def test_read_metadata_5(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""dateCreated/datePublished provided, revision uses author/committer
date
If multiple dateCreated provided, the first occurrence (of
dateCreated) is selected. If multiple datePublished provided,
the first occurrence (of datePublished) is selected.
"""
deposit = partial_deposit
# add metadata to the deposit with multiple datePublished/dateCreated
codemeta_entry_data = (
atom_dataset["metadata"]
% """
2015-04-06T17:08:47+02:00
2017-05-03T16:08:47+02:00
2016-04-06T17:08:47+02:00
2018-05-03T16:08:47+02:00
"""
)
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
- data = response.json()
+ actual_data = response.json()
- expected_origin = {
- "type": "deposit",
- "url": "https://hal-test.archives-ouvertes.fr/external-id-partial",
- }
-
- expected_origin_metadata = {
+ assert actual_data == {
+ "origin": {
+ "type": "deposit",
+ "url": "https://hal-test.archives-ouvertes.fr/external-id-partial",
+ },
"metadata_raw": [codemeta_entry_data],
"metadata_dict": parse_xml(codemeta_entry_data),
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
"version": __version__,
},
- }
-
- expected_deposit_info = {
- "author": SWH_PERSON,
- "committer": SWH_PERSON,
- "committer_date": {
- "negative_utc": False,
- "offset": 120,
- "timestamp": {"microseconds": 0, "seconds": 1493820527},
- },
- "author_date": {
- "negative_utc": False,
- "offset": 120,
- "timestamp": {"microseconds": 0, "seconds": 1428332927},
+ "deposit": {
+ "author": SWH_PERSON,
+ "committer": SWH_PERSON,
+ "committer_date": {
+ "negative_utc": False,
+ "offset": 120,
+ "timestamp": {"microseconds": 0, "seconds": 1493820527},
+ },
+ "author_date": {
+ "negative_utc": False,
+ "offset": 120,
+ "timestamp": {"microseconds": 0, "seconds": 1428332927},
+ },
+ "client": deposit_collection.name,
+ "id": deposit.id,
+ "collection": deposit_collection.name,
+ "revision_parents": [],
},
- "client": deposit_collection.name,
- "id": deposit.id,
- "collection": deposit_collection.name,
- "revision_parents": [],
}
- expected_meta = {
- "origin": expected_origin,
- "origin_metadata": expected_origin_metadata,
- "deposit": expected_deposit_info,
- }
-
- assert data == expected_meta
-
def test_access_to_nonexisting_deposit_returns_404_response(
    authenticated_client, deposit_collection,
):
    """Read unknown deposit should return a 404 response

    """
    unknown_id = 999

    # Precondition: the deposit must really be unknown, otherwise the 404
    # checks below would be testing the wrong thing.
    assert not Deposit.objects.filter(pk=unknown_id).exists()

    for url in private_get_raw_url_endpoints(deposit_collection, unknown_id):
        response = authenticated_client.get(url)
        assert response.status_code == status.HTTP_404_NOT_FOUND
        msg = "Deposit with id %s does not exist" % unknown_id
        assert msg in response.content.decode("utf-8")