class APIList(ListAPIView, APIPrivateView):
    """Deposit request class to list the deposit's status per page.

    HTTP verbs supported: GET

    Optional query parameters read by this view:

    - ``username``: only list the deposits of the client with that username
    - ``exclude``: skip deposits whose ``external_id`` starts with that value

    Paging is delegated to ``DefaultPagination``.

    """

    serializer_class = DepositSerializer
    pagination_class = DefaultPagination

    def get_queryset(self):
        """Retrieve the deposits to list (with some optional filtering).

        The queryset is returned unevaluated on purpose: the paginator counts
        and slices it, so only the rows of the requested page are fetched.

        Returns:
            Deposit queryset ordered by ascending id.

        """
        params = self.request.query_params
        exclude_like = params.get("exclude")
        username = params.get("username")

        if username:
            deposits_qs = Deposit.objects.select_related("client").filter(
                client__username=username
            )
        else:
            deposits_qs = Deposit.objects.all()

        if exclude_like:
            # sql injection: A priori, nothing to worry about, django does it for
            # queryset
            # https://docs.djangoproject.com/en/3.0/topics/security/#sql-injection-protection  # noqa
            deposits_qs = deposits_qs.exclude(external_id__startswith=exclude_like)

        return deposits_qs.order_by("id")

    def paginate_queryset(self, queryset):
        """Return the requested page, its deposits enriched with raw metadata.

        Enriching here rather than in get_queryset keeps the number of extra
        queries bounded by the page size instead of the size of the table.

        Args:
            queryset: deposits to paginate, as returned by get_queryset

        Returns:
            The deposits of the requested page, or None when the parent
            implementation reports that pagination is disabled (in which case
            nothing is enriched).

        """
        page = super().paginate_queryset(queryset)
        if page is None:
            return None

        for deposit in page:
            self._attach_raw_metadata(deposit)
        return page

    @staticmethod
    def _attach_raw_metadata(deposit):
        """Set on deposit the raw metadata of its latest 'metadata' request, if any."""
        # Only the newest metadata request matters: fetch that single value
        # instead of loading every request row of the deposit.
        raw_meta = (
            deposit.depositrequest_set.filter(type="metadata")
            .order_by("-id")
            .values_list("raw_metadata", flat=True)
            .first()
        )
        # enrich deposit with raw metadata when we have some
        if raw_meta:
            deposit.set_raw_metadata(raw_meta)
""" def to_representation(self, value): return convert_status_detail(value) class DepositSerializer(serializers.ModelSerializer): status_detail = StatusDetailField() + raw_metadata = _UnvalidatedField() class Meta: model = Deposit fields = "__all__" diff --git a/swh/deposit/models.py b/swh/deposit/models.py index 28db7802..d8336b6f 100644 --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -1,248 +1,256 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Generated from: # cd swh_deposit && \ # python3 -m manage inspectdb import datetime from typing import Optional from django.contrib.auth.models import User, UserManager from django.contrib.postgres.fields import ArrayField, JSONField from django.db import models from django.utils.timezone import now from swh.auth.django.models import OIDCUser from .config import ( ARCHIVE_TYPE, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, METADATA_TYPE, ) class Dbversion(models.Model): """Db version """ version = models.IntegerField(primary_key=True) release = models.DateTimeField(default=now, null=True) description = models.TextField(blank=True, null=True) class Meta: db_table = "dbversion" app_label = "deposit" def __str__(self): return str( { "version": self.version, "release": self.release, "description": self.description, } ) """Possible status""" DEPOSIT_STATUS = [ (DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_PARTIAL), ("expired", "expired"), (DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED), (DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED), (DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED), ("loading", "loading"), (DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS), (DEPOSIT_STATUS_LOAD_FAILURE, 
class Deposit(models.Model):
    """Deposit reception table.

    Besides the database columns, an instance carries a transient
    ``raw_metadata`` attribute (not a model field) which is only filled
    when deposits are listed.

    """

    id = models.BigAutoField(primary_key=True)

    # First deposit reception date
    reception_date = models.DateTimeField(auto_now_add=True)
    # Date when the deposit is deemed complete and ready for loading
    complete_date = models.DateTimeField(null=True)
    # collection concerned by the deposit
    collection = models.ForeignKey("DepositCollection", models.DO_NOTHING)
    # Deprecated: Deposit's external identifier
    external_id = models.TextField(null=True)
    # URL of the origin of this deposit, null if this is a metadata-only deposit
    origin_url = models.TextField(null=True)
    # Deposit client
    client = models.ForeignKey("DepositClient", models.DO_NOTHING)
    # SWH's loading result identifier
    swhid = models.TextField(blank=True, null=True)
    swhid_context = models.TextField(blank=True, null=True)
    # Deposit's status regarding loading
    status = models.TextField(choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL)
    status_detail = JSONField(null=True)
    # deposit can have one parent
    parent = models.ForeignKey("self", on_delete=models.PROTECT, null=True)
    check_task_id = models.TextField(
        blank=True, null=True, verbose_name="Scheduler's associated checking task id"
    )
    load_task_id = models.TextField(
        blank=True, null=True, verbose_name="Scheduler's associated loading task id"
    )
    # Plain attribute (no database column), see set_raw_metadata
    raw_metadata: Optional[str] = None

    class Meta:
        db_table = "deposit"
        app_label = "deposit"

    def __str__(self):
        """Return the string form of a dict summarizing the deposit.

        The status detail is only part of the summary for rejected deposits.

        """
        d = {
            "id": self.id,
            "reception_date": self.reception_date,
            "collection": self.collection.name,
            "external_id": self.external_id,
            "origin_url": self.origin_url,
            "client": self.client.username,
            "status": self.status,
        }
        # Plain equality: `in (DEPOSIT_STATUS_REJECTED)` was a substring test
        # against a string (the parentheses do not build a tuple), so an empty
        # or partial status value would have matched as well.
        if self.status == DEPOSIT_STATUS_REJECTED:
            d["status_detail"] = self.status_detail

        return str(d)

    def set_raw_metadata(self, raw_metadata: str) -> None:
        """Set the raw metadata coming from a 'metadata' typed deposit request.

        This is specifically used during listing.

        Args:
            raw_metadata: raw metadata to expose on this deposit instance

        """
        self.raw_metadata = raw_metadata
""" reception_date = instance.deposit.reception_date assert isinstance(reception_date, datetime.datetime) folder = reception_date.strftime("%Y%m%d-%H%M%S.%f") return f"client_{instance.deposit.client.id}/{folder}/{filename}" REQUEST_TYPES = [(ARCHIVE_TYPE, ARCHIVE_TYPE), (METADATA_TYPE, METADATA_TYPE)] class DepositRequest(models.Model): """Deposit request associated to one deposit. """ id = models.BigAutoField(primary_key=True) # Deposit concerned by the request deposit = models.ForeignKey(Deposit, models.DO_NOTHING) date = models.DateTimeField(auto_now_add=True) # Deposit request information on the data to inject # this can be null when type is 'archive' metadata = JSONField(null=True) raw_metadata = models.TextField(null=True) # this can be null when type is 'metadata' archive = models.FileField(null=True, upload_to=client_directory_path) type = models.CharField(max_length=8, choices=REQUEST_TYPES, null=True) class Meta: db_table = "deposit_request" app_label = "deposit" def __str__(self): meta = None if self.metadata: from json import dumps meta = dumps(self.metadata) archive_name = None if self.archive: archive_name = self.archive.name return str( { "id": self.id, "deposit": self.deposit, "metadata": meta, "archive": archive_name, } ) class DepositCollection(models.Model): id = models.BigAutoField(primary_key=True) # Human readable name for the collection type e.g HAL, arXiv, etc... 
name = models.TextField() class Meta: db_table = "deposit_collection" app_label = "deposit" def __str__(self): return str({"id": self.id, "name": self.name}) diff --git a/swh/deposit/tests/api/test_deposit_private_list.py b/swh/deposit/tests/api/test_deposit_private_list.py index ce17fdef..27897b4a 100644 --- a/swh/deposit/tests/api/test_deposit_private_list.py +++ b/swh/deposit/tests/api/test_deposit_private_list.py @@ -1,130 +1,160 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse_lazy as reverse from rest_framework import status from swh.deposit.api.converters import convert_status_detail -from swh.deposit.config import ( - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_LOAD_SUCCESS, - DEPOSIT_STATUS_PARTIAL, - PRIVATE_LIST_DEPOSITS, -) +from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS, PRIVATE_LIST_DEPOSITS from swh.deposit.models import DepositClient from swh.deposit.tests.conftest import internal_create_deposit STATUS_DETAIL = { "url": { "summary": "At least one compatible url field. 
def _latest_raw_metadata(deposit):
    """Return the raw metadata of the most recent 'metadata' request of deposit.

    Uses the same ordering as the listing endpoint (latest request first) so
    the expectation stays right when a deposit has several metadata requests.

    """
    requests = deposit.depositrequest_set.filter(type="metadata").order_by("-id")
    return requests[0].raw_metadata


def test_deposit_list(
    partial_deposit_with_metadata,
    partial_deposit_only_metadata,
    partial_deposit,
    authenticated_client,
):
    """Deposit list api should return all deposits in a paginated way"""
    partial_deposit_with_metadata.status_detail = STATUS_DETAIL
    partial_deposit_with_metadata.save()
    deposit1 = partial_deposit_with_metadata
    deposit2 = partial_deposit_only_metadata
    deposit3 = partial_deposit

    main_url = reverse(PRIVATE_LIST_DEPOSITS)
    url = f"{main_url}?page_size=1"
    response = authenticated_client.get(url)

    assert response.status_code == status.HTTP_200_OK
    data_p1 = response.json()
    assert data_p1["count"] == 3  # total nb of deposits
    expected_next_p1 = f"{main_url}?page=2&page_size=1"
    assert data_p1["next"].endswith(expected_next_p1)
    assert data_p1["previous"] is None
    assert len(data_p1["results"]) == 1  # page of size 1
    deposit_d = data_p1["results"][0]
    assert deposit_d["id"] == deposit1.id
    assert deposit_d["status"] == deposit1.status
    expected_status_detail = convert_status_detail(STATUS_DETAIL)
    assert deposit_d["status_detail"] == expected_status_detail
    assert deposit_d["raw_metadata"] is not None
    assert deposit_d["raw_metadata"] == _latest_raw_metadata(deposit1)

    # then 2nd page
    response2 = authenticated_client.get(data_p1["next"])

    assert response2.status_code == status.HTTP_200_OK
    data_p2 = response2.json()

    assert data_p2["count"] == 3  # total nb of deposits
    expected_next_p2 = f"{main_url}?page=3&page_size=1"
    assert data_p2["next"].endswith(expected_next_p2)
    assert data_p2["previous"].endswith(url)
    assert len(data_p2["results"]) == 1  # page of size 1

    deposit2_d = data_p2["results"][0]
    assert deposit2_d["id"] == deposit2.id
    assert deposit2_d["status"] == deposit2.status
    assert deposit2_d["raw_metadata"] is not None
    assert deposit2_d["raw_metadata"] == _latest_raw_metadata(deposit2)

    # then 3rd (and last) page
    response3 = authenticated_client.get(data_p2["next"])

    assert response3.status_code == status.HTTP_200_OK
    data_p3 = response3.json()

    assert data_p3["count"] == 3  # total nb of deposits
    assert data_p3["next"] is None, "No more page beyond that point"

    assert data_p3["previous"] == data_p1["next"]
    assert len(data_p3["results"]) == 1  # page of size 1

    deposit3_d = data_p3["results"][0]
    assert deposit3_d["id"] == deposit3.id
    assert deposit3_d["status"] == deposit3.status
    assert not deposit3.depositrequest_set.filter(
        type="metadata"
    ), "No metadata type request for that deposit"
    # hence no raw metadata set for that deposit
    assert deposit3_d["raw_metadata"] is None, "no raw metadata for that deposit"


def test_deposit_list_exclude(partial_deposit, deposited_deposit, authenticated_client):
    """Exclusion pattern on external_id should be respected"""
    partial_deposit.status_detail = STATUS_DETAIL
    partial_deposit.save()

    main_url = reverse(PRIVATE_LIST_DEPOSITS)

    # Testing exclusion pattern
    exclude_pattern = "external-id"
    assert partial_deposit.external_id.startswith(exclude_pattern)
    assert deposited_deposit.external_id.startswith(exclude_pattern)
    url = f"{main_url}?page_size=1&exclude={exclude_pattern}"
    response = authenticated_client.get(url)

    assert response.status_code == status.HTTP_200_OK
    data = response.json()
    assert data["count"] == 0

    url = f"{main_url}?page_size=1&exclude=dummy"  # that won't exclude anything
    response = authenticated_client.get(url)

    assert response.status_code == status.HTTP_200_OK
    data = response.json()
    assert data["count"] == 2


def test_deposit_list_for_username(
    authenticated_client,
    deposit_another_collection,
    completed_deposit,
    deposit_user,
    deposit_another_user,
):
    """Filtering on username should only list the deposits of that client"""
    # create a new deposit with a user different from deposit_user,
    # the one that created completed_deposit
    internal_create_deposit(
        client=deposit_another_user,
        collection=deposit_another_collection,
        external_id="external-id-bar",
        status=DEPOSIT_STATUS_LOAD_SUCCESS,
    )

    for user in (deposit_user, deposit_another_user):
        # check deposit filtering by username
        url = f"{reverse(PRIVATE_LIST_DEPOSITS)}?username={user.username}"
        json_response = authenticated_client.get(url).json()

        assert len(json_response["results"]) == 1

        deposit_client = DepositClient.objects.all().get(
            id=json_response["results"][0]["client"]
        )
        assert deposit_client.username == user.username