diff --git a/PKG-INFO b/PKG-INFO index 78cb3cde..96b7393e 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,11 +1,11 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 0.0.42 +Version: 0.0.43 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN Provides-Extra: loader diff --git a/bin/Makefile b/bin/Makefile index 1a1984e4..03834dc2 100644 --- a/bin/Makefile +++ b/bin/Makefile @@ -1,45 +1,45 @@ DEPOSIT_ID=1 ARCHIVE=../../swh-deposit.zip ARCHIVE2=../../swh-model.zip STATUS=false PARTIAL_STATUS=true -UPDATE_STATUS='success' +UPDATE_STATUS='done' ATOM_ENTRY=../../atom-entry.xml EXTERNAL_ID='external-id' create-archives: 7z a $(ARCHIVE) $(FOLDER) 7z a $(ARCHIVE2) $(FOLDER2) new: ./create_deposit.sh $(ARCHIVE) $(STATUS) new-complete: ./create_deposit_with_metadata.sh $(ARCHIVE) $(ATOM_ENTRY) $(STATUS) $(EXTERNAL_ID) new-partial: make new STATUS=$(PARTIAL_STATUS) ARCHIVE=$(ARCHIVE) update: ./update-deposit-with-another-archive.sh $(DEPOSIT_ID) $(ARCHIVE2) $(STATUS) update-partial: make update DEPOSIT_ID=$(DEPOSIT_ID) ARCHIVE2=$(ARCHIVE2) STATUS=$(PARTIAL_STATUS) replace: ./replace-deposit-archive.sh $(ARCHIVE2) $(DEPOSIT_ID) download: ./download-deposit-archive.sh $(DEPOSIT_ID) status: ./status.sh $(DEPOSIT_ID) service-document: ./service-document.sh home: ./home.sh update-status: ./update-status.sh $(DEPOSIT_ID) $(UPDATE_STATUS) diff --git a/bin/create_deposit.sh b/bin/create_deposit.sh index c883306e..183f114a 100755 --- a/bin/create_deposit.sh +++ b/bin/create_deposit.sh @@ -1,21 +1,21 @@ #!/usr/bin/env bash . ./default-setup ARCHIVE=${1-'../../deposit.zip'} NAME=$(basename ${ARCHIVE}) MD5=$(md5sum ${ARCHIVE} | cut -f 1 -d' ') PROGRESS=${2-'false'} +TYPE=${3-'application/zip'} curl -i -u "$CREDS" \ -X POST \ --data-binary @${ARCHIVE} \ -H "In-Progress: ${PROGRESS}" \ -H "Content-MD5: ${MD5}" \ -H "Content-Disposition: attachment; filename=${NAME}" \ -H 'Slug: external-id' \ - -H 'Packaging: http://purl.org/net/sword/package/SimpleZip' \ - -H 'Content-type: application/zip' \ + -H "Content-type: ${TYPE}" \ ${SERVER}/1/${COLLECTION}/ diff --git a/bin/update-status.sh b/bin/update-status.sh index 5be1cafb..c5925a41 100755 --- a/bin/update-status.sh +++ b/bin/update-status.sh @@ -1,12 +1,12 @@ #!/usr/bin/env bash . ./default-setup DEPOSIT_ID=${1-1} -UPDATE_STATUS=${2-'success'} +UPDATE_STATUS=${2-'done'} curl -i \ -X PUT \ -H 'Content-Type: application/json' \ -d "{\"status\": \"${UPDATE_STATUS}\"}" \ ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/update/ diff --git a/debian/control b/debian/control index 004b80cf..6503a32f 100644 --- a/debian/control +++ b/debian/control @@ -1,41 +1,46 @@ Source: swh-deposit Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-setuptools, python3-all, python3-nose, python3-django-nose, python3-vcversioner, python3-swh.core (>= 0.0.36~), + python3-swh.model (>= 0.0.21~), python3-swh.loader.core (>= 0.0.27~), python3-swh.loader.tar (>= 0.0.32~), python3-swh.scheduler (>= 0.0.19~), python3-django, python3-click, python3-vcversioner, python3-djangorestframework, python3-djangorestframework-xml, - python3-requests + python3-requests, + patool Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/source/swh-deposit/ Package: python3-swh.deposit Architecture: all Depends: python3-swh.core (>= 0.0.36~), + python3-swh.model (>= 0.0.21~), python3-swh.scheduler (>= 0.0.19~), + patool, ${misc:Depends}, ${python3:Depends} Description: Software Heritage Deposit Server Package: python3-swh.deposit.loader Conflict: python3-swh.deposit.injection Architecture: all Depends: python3-swh.core (>= 0.0.36~), + python3-swh.model (>= 0.0.21~), python3-swh.loader.core (>= 0.0.27~), python3-swh.loader.tar (>= 0.0.32~), python3-swh.scheduler (>= 0.0.19~), python3-requests, ${misc:Depends}, ${python3:Depends} Description: Software Heritage Deposit Loader diff --git a/docs/getting-started.md b/docs/getting-started.md index f7d3eab1..83a1435a 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -1,332 +1,333 @@ # Getting Started This is a getting started to demonstrate the deposit api use case with a shell client. The api is rooted at https://deposit.softwareheritage.org. For more details, see the [main documentation](./index.html). ## Requirements You need to be referenced on SWH's client list to have: - a credential (needed for the basic authentication step). - an associated collection [Contact us for more information.](https://www.softwareheritage.org/contact/) ## Demonstration For the rest of the document, we will: - reference `` as the client and `` as its associated authentication password. - use curl as example on how to request the api. - present the main deposit use cases. The use cases are: - one single deposit step: The user posts in one query (one deposit) a software source code archive and associated metadata (deposit is - finalized with status `ready-for-checks`). + finalized with status `deposited`). This will demonstrate the multipart query. - another 3-steps deposit (which can be extended as more than 2 steps): 1. Create an incomplete deposit (status `partial`) 2. Update a deposit (and finalize it, so the status becomes - `ready-for-checks`) + `deposited`) 3. Check the deposit's state This will demonstrate the stateful nature of the sword protocol. Those use cases share a common part, they must start by requesting the `service document iri` (internationalized resource identifier) for information about the collection's location. ### Common part - Start with the service document First, to determine the *collection iri* onto which deposit data, the client needs to ask the server where is its *collection* located. That is the role of the *service document iri*. For example: ``` Shell curl -i --user : https://deposit.softwareheritage.org/1/servicedocument/ ``` If everything went well, you should have received a response similar to this: ``` Shell HTTP/1.0 200 OK Server: WSGIServer/0.2 CPython/3.5.3 Content-Type: application/xml 2.0 209715200 The Software Heritage (SWH) Archive Software Collection application/zip + application/x-tar Collection Policy Software Heritage Archive Collect, Preserve, Share false http://purl.org/net/sword/package/SimpleZip https://deposit.softwareheritage.org/1// ``` Explaining the response: - `HTTP/1.0 200 OK`: the query is successful and returns a body response - `Content-Type: application/xml`: The body response is in xml format - `body response`: it is a service document describing that the client `` has a collection named ``. That collection is available at the *collection iri* `/1//` (through POST query). At this level, if something went wrong, this should be authentication related. So the response would have been a 401 Unauthorized access. Something like: ``` Shell curl -i https://deposit.softwareheritage.org/1// HTTP/1.0 401 Unauthorized Server: WSGIServer/0.2 CPython/3.5.3 Content-Type: application/xml WWW-Authenticate: Basic realm="" X-Frame-Options: SAMEORIGIN Access to this api needs authentication processing failed ``` ### Single deposit A single deposit translates to a multipart deposit request. This means, in swh's deposit's terms, sending exactly one POST query with: -- 1 archive (`content-type application/zip`) +- 1 archive (content-type `application/zip` or `application/x-tar`) - 1 atom xml content (`content-type: application/atom+xml;type=entry`) The supported archive, for now are limited to zip files. Those archives are expected to contain some form of software source code. The atom entry content is some xml defining metadata about that software. Example of minimal atom entry file: ``` XML Title urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2005-10-07T17:17:08Z Contributor The abstract The abstract Access Rights Alternative Title Date Available Bibliographic Citation Contributor Description Has Part Has Version Identifier Is Part Of Publisher References Rights Holder Source Title Type ``` Once the files are ready for deposit, we want to do the actual deposit in one shot. For this, we need to provide: - the contents and their associated correct content-types - either the header `In-Progress` to false (meaning, it's finished after this query) or nothing (the server will assume it's not in progress if not present). - Optionally, the `Slug` header, which is a reference to a unique identifier the client knows about and wants to provide us. You can do this with the following command: ``` Shell curl -i --user : \ -F "file=@deposit.zip;type=application/zip;filename=payload" \ -F "atom=@atom-entry.xml;type=application/atom+xml;charset=UTF-8" \ -H 'In-Progress: false' \ -H 'Slug: some-external-id' \ -XPOST https://deposit.softwareheritage.org/1// ``` You just posted a deposit to the collection https://deposit.softwareheritage.org/1//. If everything went well, you should have received a response similar to this: ``` Shell HTTP/1.0 201 Created Server: WSGIServer/0.2 CPython/3.5.3 Location: /1//10/metadata/ Content-Type: application/xml 9 Sept. 26, 2017, 10:11 a.m. payload - ready-for-checks + deposited http://purl.org/net/sword/package/SimpleZip ``` Explaining this response: - `HTTP/1.0 201 Created`: the deposit is successful - `Location: /1//10/metadata/`: the EDIT-SE-IRI through which we can update a deposit - body response: it is a deposit receipt detailing all endpoints available to manipulate the deposit (update, replace, delete, etc...) It also explains the deposit identifier to be 9 (which is useful for the remaining example). -Note: As the deposit is in `ready-for-checks` status, you cannot -actually update anything after this query. Well, the client can try, -but it will be answered with a 403 forbidden answer. +Note: As the deposit is in `deposited` status, you cannot actually +update anything after this query. Well, the client can try, but it +will be answered with a 403 forbidden answer. ### Multi-steps deposit #### Create a deposit We will use the collection IRI again as the starting point. We need to explicitely give to the server information about: - the deposit's completeness (through header `In-Progress` to true, as we want to do in multiple steps now). - archive's md5 hash (through header `Content-MD5`) - upload's type (through the headers `Content-Disposition` and `Content-Type`) The following command: ``` Shell curl -i --user : \ - --data-binary @swh/deposit.zip \ + --data-binary @swh/deposit.tar.gz \ -H 'In-Progress: true' \ -H 'Content-MD5: 0faa1ecbf9224b9bf48a7c691b8c2b6f' \ - -H 'Content-Disposition: attachment; filename=[deposit.zip]' \ + -H 'Content-Disposition: attachment; filename=[deposit.tar.gz]' \ -H 'Slug: some-external-id' \ -H 'Packaging: http://purl.org/net/sword/package/SimpleZIP' \ -H 'Content-type: application/zip' \ -XPOST https://deposit.softwareheritage.org/1// ``` The expected answer is the same as the previous sample. #### Update deposit's metadata To update a deposit, we can either add some more archives, some more metadata or replace existing ones. As we don't have defined metadata yet (except for the `slug` header), we can add some to the `EDIT-SE-IRI` endpoint (/1//10/metadata/). That information is extracted from the deposit receipt sample. Using here the same atom-entry.xml file presented in previous chapter. For example, here is the command to update deposit metadata: ``` Shell curl -i --user : --data-binary @atom-entry.xml \ -H 'In-Progress: true' \ -H 'Slug: some-external-id' \ -H 'Content-Type: application/atom+xml;type=entry' \ -XPOST https://deposit.softwareheritage.org/1//10/metadata/ HTTP/1.0 201 Created Server: WSGIServer/0.2 CPython/3.5.3 Location: /1//10/metadata/ Content-Type: application/xml 10 Sept. 26, 2017, 10:32 a.m. None partial http://purl.org/net/sword/package/SimpleZip ``` #### Check the deposit's state You need to check the STATE-IRI endpoint (/1//10/status/). ``` Shell curl -i --user : https://deposit.softwareheritage.org/1//10/status/ HTTP/1.0 200 OK Date: Wed, 27 Sep 2017 08:25:53 GMT Content-Type: application/xml ``` Response: ``` XML 9 - ready-for-checks + deposited deposit is fully received and ready for loading ``` diff --git a/docs/spec-api.md b/docs/spec-api.md index 2ee1b06c..b57785d3 100644 --- a/docs/spec-api.md +++ b/docs/spec-api.md @@ -1,801 +1,810 @@ # API Specification This is [Software Heritage](https://www.softwareheritage.org)'s [SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server implementation. **S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository **D**eposit) is an interoperability standard for digital file deposit. This implementation will permit interaction between a client (a repository) and a server (SWH repository) to permit deposits of software source code archives and associated metadata. -*Note:* In the following document, we will use the `archive` or -`software source code archive` interchangeably. +*Note:* + - In the following document, we will use the `archive` or `software + source code archive` interchangeably. + - The supported archive formats are: + - zip: common zip archive (no multi-disk zip files). + - tar: tar archive without compression or optionally any of the + following compression algorithm gzip (.tar.gz, .tgz), bzip2 + (.tar.bz2) , or lzma (.tar.lzma) + ## Collection SWORD defines a `collection` concept. In SWH's case, this collection refers to a group of deposits. A `deposit` is some form of software source code archive(s) associated with metadata. *Note:* It may be multiple archives if one archive is too big and must be splitted into multiple smaller ones. ### Example As part of the [HAL](https://hal.archives-ouvertes.fr/)-[SWH](https://www.softwareheritage.org) collaboration, we define a `HAL collection` to which the `hal` client will have access to. ## Limitations We will not have a fully compliant SWORD 2.0 protocol at first, so voluntary implementation shortcomings can exist, for example, only zip tarballs will be accepted. Other more permanent limitations exists: - upload limitation of 100Mib - no mediation ## Endpoints Here are the defined endpoints this document will refer to from this point on: - `/1/servicedocument/` *service document iri* (a.k.a [SD-IRI](#sd-iri-the-service-document-iri)) *Goal:* For a client to discover its collection's location - `/1//` *collection iri* (a.k.a [COL-IRI](#col-iri-the-collection-iri)) *Goal:*: create deposit to a collection - `/1///media/` *update iri* (a.k.a [EM-IRI](#em-iri-the-atom-edit-media-iri)) *Goal:*: Add or replace archive(s) to a deposit - `/1///metadata/` *update iri* (a.k.a [EDIT-IRI](#edit-iri-the-atom-entry-edit-iri) merged with [SE-IRI](#se-iri-the-sword-edit-iri)) *Goal:*: Add or replace metadata (and optionally archive(s) to a deposit - `/1///status/` *state iri* (a.k.a [STATE-IRI](#state-iri-the-sword-statement-iri)) *Goal:*: Display deposit's status in regards to loading - `/1///content/` *content iri* (a.k.a [CONT-FILE-IRI](#cont-iri-the-content-iri)) *Goal:*: Display information on the content's representation in the sword server ## Use cases ### Deposit creation From client's deposit repository server to SWH's repository server: [1.] The client requests for the server's abilities and its associated collection (GET query to the *SD/service document uri*) [2.] The server answers the client with the service document which gives the *collection uri* (also known as *COL/collection IRI*). [3.] The client sends a deposit (optionally a zip archive, some metadata or both) through the *collection uri*. This can be done in: - one POST request (metadata + archive). - one POST request (metadata or archive) + other PUT or POST request to the *update uris* (*edit-media iri* or *edit iri*) [3.1.] Server validates the client's input or returns detailed error if any [3.2.] Server stores information received (metadata or software archive source code or both) [4.] The server notifies the client it acknowledged the client's request. An `http 201 Created` response with a deposit receipt in the body response is sent back. That deposit receipt will hold the necessary information to eventually complete the deposit later on if it was incomplete (also known as status `partial`). #### Schema representation ![](/images/deposit-create-chart.png) ### Updating an existing deposit [5.] Client updates existing deposit through the *update uris* (one or more POST or PUT requests to either the *edit-media iri* or *edit iri*). [5.1.] Server validates the client's input or returns detailed error if any [5.2.] Server stores information received (metadata or software archive source code or both) This would be the case for example if the client initially posted a `partial` deposit (e.g. only metadata with no archive, or an archive without metadata, or a splitted archive because the initial one exceeded the limit size imposed by swh repository deposit) #### Schema representation ![](/images/deposit-update-chart.png) ### Deleting deposit (or associated archive, or associated metadata) [6.] Deposit deletion is possible as long as the deposit is still in `partial` state. [6.1.] Server validates the client's input or returns detailed error if any [6.2.] Server actually delete information according to request #### Schema representation ![](/images/deposit-delete-chart.png) ### Client asks for operation status [7.] Operation status can be read through a GET query to the *state iri*. ### Server: Triggering deposit checks -Once the status `ready-for-checks` is reached for a deposit, checks -for the associated archive(s) and metadata will be triggered. If -those checks fail, the status is changed to `rejected` and nothing -more happens there. Otherwise, the status is changed to -`ready-for-load`. +Once the status `deposited` is reached for a deposit, checks for the +associated archive(s) and metadata will be triggered. If those checks +fail, the status is changed to `rejected` and nothing more happens +there. Otherwise, the status is changed to `verified`. ### Server: Triggering deposit load -Once the status `ready-for-load` is reached for a deposit, loading the +Once the status `verified` is reached for a deposit, loading the deposit with its associated metadata will be triggered. -The loading will result on status update, either `success` or -`failure` (depending on the loading's status). +The loading will result on status update, either `done` or `failed` +(depending on the loading's status). This is described in the [loading document](./spec-loading.html). ## API overview API access is over HTTPS. The API is protected through basic authentication. The API endpoints are rooted at [https://deposit.softwareheritage.org/1/](https://deposit.softwareheritage.org/1/). Data is sent and received as XML (as specified in the SWORD 2.0 specification). In the following chapters, we will described the different endpoints [through the use cases described previously.](#use-cases) ### [2] Service document Endpoint: GET /1/servicedocument/ This is the starting endpoint for the client to discover its initial collection. The answer to this query will describes: - the server's abilities - connected client's collection information Also known as: [SD-IRI - The Service Document IRI](#sd-iri-the-service-document-iri). #### Sample request ``` Shell GET https://deposit.softwareheritage.org/1/servicedocument/ HTTP/1.1 Host: deposit.softwareheritage.org ``` The server returns its abilities with the service document in xml format: - protocol sword version v2 -- accepted mime types: application/zip +- accepted mime types: application/zip (zip), application/x-tar (tar + archive with any of the following optional compression algorithm + gzip, bzip2, or lzma) - upload max size accepted. Beyond that point, it's expected the client splits its tarball into multiple ones - the collection the client can act upon (swh supports only one software collection per client) - mediation is not supported - etc... The current answer for example for the [hal archive](https://hal.archives-ouvertes.fr/) is: ``` XML 2.0 20971520 The Software Heritage (SWH) archive SWH Software Archive application/zip + application/x-tar Collection Policy Software Heritage Archive false false Collect, Preserve, Share http://purl.org/net/sword/package/SimpleZip https://deposit.softwareheritage.org/1/hal/ ``` ### [3|5] Deposit creation/update The client can send deposit creation/update through a series of deposit requests to the following endpoints: - *collection iri* (COL-IRI) to initialize a deposit - *update iris* (EM-IRI, EDIT-SE-IRI) to complete/finalize a deposit The deposit creation/update can also happens in one request. The deposit request can contain: - an archive holding the software source code (binary upload) - an envelop with metadata describing information regarding a deposit (atom entry deposit) - or both (multipart deposit, exactly one archive and one envelop). #### Request Types ##### Binary deposit The client can deposit a binary archive, supplying the following headers: - Content-Type (text): accepted mimetype - Content-Length (int): tarball size - Content-MD5 (text): md5 checksum hex encoded of the tarball - Content-Disposition (text): attachment; filename=[filename] ; the filename parameter must be text (ascii) - Packaging (IRI): http://purl.org/net/sword/package/SimpleZip - In-Progress (bool): true to specify it's not the last request, false to specify it's a final request and the server can go on with processing the request's information (if not provided, this is considered false, so final). This is a single zip archive deposit. Almost no metadata is associated with the archive except for the unique external identifier. *Note:* This kind of deposit should be `partial` (In-Progress: True) as almost no metadata can be associated with the uploaded archive. ##### API endpoints concerned POST /1// Create a first deposit with one archive PUT /1///media/ Replace existing archives POST /1///media/ Add new archive ##### Sample request ``` Shell curl -i -u hal: \ --data-binary @swh/deposit.zip \ -H 'In-Progress: false' -H 'Content-MD5: 0faa1ecbf9224b9bf48a7c691b8c2b6f' \ -H 'Content-Disposition: attachment; filename=[deposit.zip]' \ -H 'Slug: some-external-id' \ -H 'Packaging: http://purl.org/net/sword/package/SimpleZIP' \ -H 'Content-type: application/zip' \ -XPOST https://deposit.softwareheritage.org/1/hal/ ``` #### Atom entry deposit The client can deposit an xml body holding metadata information on the deposit. *Note:* This kind of deposit is mostly expected to be `partial` (In-Progress: True) since no archive will be associated to those metadata. ##### API endpoints concerned POST /1// Create a first atom deposit entry PUT /1///metadata/ Replace existing metadata POST /1///metadata/ Add new metadata to deposit ##### Sample request Sample query: ``` Shell curl -i -u hal: --data-binary @atom-entry.xml \ -H 'In-Progress: false' \ -H 'Slug: some-external-id' \ -H 'Content-Type: application/atom+xml;type=entry' \ -XPOST https://deposit.softwareheritage.org/1/hal/ HTTP/1.0 201 Created Date: Tue, 26 Sep 2017 10:32:35 GMT Server: WSGIServer/0.2 CPython/3.5.3 Vary: Accept, Cookie Allow: GET, POST, PUT, DELETE, HEAD, OPTIONS Location: /1/hal/10/metadata/ X-Frame-Options: SAMEORIGIN Content-Type: application/xml 10 Sept. 26, 2017, 10:32 a.m. None - ready-for-checks + deposited http://purl.org/net/sword/package/SimpleZip ``` Sample body: ``` XML Title urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2005-10-07T17:17:08Z Contributor The abstract The abstract Access Rights Alternative Title Date Available Bibliographic Citation # noqa Contributor Description Has Part Has Version Identifier Is Part Of Publisher References Rights Holder Source Title Type ``` #### One request deposit / Multipart deposit The one request deposit is a single request containing both the metadata (as atom entry attachment) and the archive (as payload attachment). Thus, it is a multipart deposit. Client provides: - Content-Disposition (text): header of type 'attachment' on the Entry Part with a name parameter set to 'atom' - Content-Disposition (text): header of type 'attachment' on the Media Part with a name parameter set to payload and a filename parameter (the filename will be expressed in ASCII). - Content-MD5 (text): md5 checksum hex encoded of the tarball - Packaging (text): http://purl.org/net/sword/package/SimpleZip (packaging format used on the Media Part) - In-Progress (bool): true|false; true means `partial` upload and we can expect other requests in the future, false means the deposit is done. - add metadata formats or foreign markup to the atom:entry element ##### API endpoints concerned POST /1// Create a full deposit (metadata + archive) PUT /1///metadata/ Replace existing metadata and archive POST /1///metadata/ Add new metadata and archive to deposit ##### Sample request Sample query: ``` Shell curl -i -u hal: \ -F "file=@../deposit.json;type=application/zip;filename=payload" \ -F "atom=@../atom-entry.xml;type=application/atom+xml;charset=UTF-8" \ -H 'In-Progress: false' \ -H 'Slug: some-external-id' \ -XPOST https://deposit.softwareheritage.org/1/hal/ HTTP/1.0 201 Created Date: Tue, 26 Sep 2017 10:11:55 GMT Server: WSGIServer/0.2 CPython/3.5.3 Vary: Accept, Cookie Allow: GET, POST, PUT, DELETE, HEAD, OPTIONS Location: /1/hal/9/metadata/ X-Frame-Options: SAMEORIGIN Content-Type: application/xml 9 Sept. 26, 2017, 10:11 a.m. payload - ready-for-checks + deposited http://purl.org/net/sword/package/SimpleZip ``` Sample content: ``` XML POST deposit HTTP/1.1 Host: deposit.softwareheritage.org Content-Length: [content length] Content-Type: multipart/related; boundary="===============1605871705=="; type="application/atom+xml" In-Progress: false MIME-Version: 1.0 Media Post --===============1605871705== Content-Type: application/atom+xml; charset="utf-8" Content-Disposition: attachment; name="atom" MIME-Version: 1.0 Title hal-or-other-archive-id 2005-10-07T17:17:08Z Contributor The abstract Access Rights Alternative Title Date Available Bibliographic Citation # noqa Contributor Description Has Part Has Version Identifier Is Part Of Publisher References Rights Holder Source Title Type --===============1605871705== Content-Type: application/zip Content-Disposition: attachment; name=payload; filename=[filename] Packaging: http://purl.org/net/sword/package/SimpleZip Content-MD5: [md5-digest] MIME-Version: 1.0 [...binary package data...] --===============1605871705==-- ``` ## Deposit Creation - server point of view The server receives the request(s) and does minimal checking on the input prior to any saving operations. ### [3|5|6.1] Validation of the header and body request Any kind of errors can happen, here is the list depending on the situation: - common errors: - 401 (unauthenticated) if a client does not provide credential or provide wrong ones - 403 (forbidden) if a client tries access to a collection it does not own - 404 (not found) if a client tries access to an unknown collection - 404 (not found) if a client tries access to an unknown deposit - 415 (unsupported media type) if a wrong media type is provided to the endpoint - archive/binary deposit: - 403 (forbidden) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or hash provided mismatch the reality of the archive. - 415 (unsupported media type) if a wrong media type is provided - multipart deposit: - 412 (precondition failed) if the md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided - Atom entry deposit: - 400 (bad request) if the request's body is empty (for creation only) ### [3|5|6.2] Server uploads the content in a temporary location Using an objstorage, the server stores the archive in a temporary location. It's deemed temporary the time the deposit is completed -(status becomes `ready-for-checks`) and the loading finishes. +(status becomes `deposited`) and the loading finishes. The server also persists requests' information in a database. ### [4] Servers answers the client If everything went well, the server answers either with a 200, 201 or 204 response (depending on the actual endpoint) A `http 200` response is returned for GET endpoints. A `http 201 Created` response is returned for POST endpoints. The body holds the deposit receipt. The headers holds the EDIT-IRI in the Location header of the response. A `http 204 No Content` response is returned for PUT, DELETE endpoints. If something went wrong, the server answers with one of the [error status code and associated message mentioned](#possible errors)). ### [5] Deposit Update The client previously deposited a `partial` document (through an archive, metadata, or both). The client wants to update information for that previous deposit (possibly in multiple steps as well). The important thing to note here is that, as long as the deposit is in status `partial`, the loading did not start. Thus, the client can update information (replace or add new archive, new metadata, even delete) for that same `partial` deposit. -When the deposit status changes to `ready-for-checks`, the client can +When the deposit status changes to `deposited`, the client can no longer change the deposit's information (a 403 will be returned in that case). Then aggregation of all those deposit's information will later be used for the actual loading. Providing the collection name, and the identifier of the previous deposit id received from the deposit receipt, the client executes a POST or PUT request on the *update iris*. After validation of the body request, the server: - uploads such content in a temporary location - answers the client an `http 204 (No content)`. In the Location header of the response lies an iri to permit further update. - Asynchronously, the server will inject the archive uploaded and the associated metadata. An operation status endpoint *state iri* permits the client to query the loading operation status. #### Possible update endpoints PUT /1///media/ Replace existing archives for the deposit POST /1///media/ Add new archives to the deposit PUT /1///metadata/ Replace existing metadata (and possible archives) POST /1///metadata/ Add new metadata ### [6] Deposit Removal As long as the deposit's status remains `partial`, it's possible to remove the deposit entirely or remove only the deposit's archive(s). If the deposit has been removed, further querying that deposit will return a *404* response. If the deposit's archive(s) has been removed, we can still ensue other query to update that deposit. ### Operation Status Providing a collection name and a deposit id, the client asks the operation status of a prior deposit. URL: GET /1///status/ This returns: - *201* response with the actual status - *404* if the deposit does not exist (or no longer does) ## Possible errors ### sword:ErrorContent IRI: `http://purl.org/net/sword/error/ErrorContent` The supplied format is not the same as that identified in the Packaging header and/or that supported by the server Associated HTTP Associated HTTP status: *415 (Unsupported Media Type)* ### sword:ErrorChecksumMismatch IRI: `http://purl.org/net/sword/error/ErrorChecksumMismatch` Checksum sent does not match the calculated checksum. Associated HTTP status: *412 Precondition Failed* ### sword:ErrorBadRequest IRI: `http://purl.org/net/sword/error/ErrorBadRequest` Some parameters sent with the POST/PUT were not understood. Associated HTTP status: *400 Bad Request* ### sword:MediationNotAllowed IRI: `http://purl.org/net/sword/error/MediationNotAllowed` Used where a client has attempted a mediated deposit, but this is not supported by the server. Associated HTTP status: *412 Precondition Failed* ### sword:MethodNotAllowed IRI: `http://purl.org/net/sword/error/MethodNotAllowed` Used when the client has attempted one of the HTTP update verbs (POST, PUT, DELETE) but the server has decided not to respond to such requests on the specified resource at that time. Associated HTTP Status: *405 Method Not Allowed* ### sword:MaxUploadSizeExceeded IRI: `http://purl.org/net/sword/error/MaxUploadSizeExceeded` Used when the client has attempted to supply to the server a file which exceeds the server's maximum upload size limit Associated HTTP Status: *413 (Request Entity Too Large)* ### sword:Unauthorized IRI: `http://purl.org/net/sword/error/ErrorUnauthorized` The access to the api is through authentication. Associated HTTP status: *401* ### sword:Forbidden IRI: `http://purl.org/net/sword/error/ErrorForbidden` The action is forbidden (access to another collection for example). Associated HTTP status: *403* ## Nomenclature SWORD uses IRI notion, Internationalized Resource Identifier. In this chapter, we will describe SWH's IRIs. ### SD-IRI - The Service Document IRI The Service Document IRI. This is the IRI from which the client can discover its collection IRI. HTTP verbs supported: *GET* ### Col-IRI - The Collection IRI The software collection associated to one user. The SWORD Collection IRI is the IRI to which the initial deposit will take place, and which is listed in the Service Document. Following our previous example, this is: https://deposit.softwareheritage.org/1/hal/. HTTP verbs supported: *POST* ### Cont-IRI - The Content IRI This is the endpoint which permits the client to retrieve representations of the object as it resides in the SWORD server. This will display information about the content and its associated metadata. HTTP verbs supported: *GET* *Note:* We also refer to it as *Cont-File-IRI*. ### EM-IRI - The Atom Edit Media IRI This is the endpoint to upload other related archives for the same deposit. It is used to change a `partial` deposit in regards of archives, in particular: - replace existing archives with new ones - add new archives - delete archives from a deposit Example use case: A first archive to put exceeds the deposit's limit size. The client can thus split the archives in multiple ones. Post a first `partial` archive to the Col-IRI (with In-Progress: True). Then, in order to complete the deposit, POST the other remaining archives to the EM-IRI (the last one with the In-Progress header to False). HTTP verbs supported: *POST*, *PUT*, *DELETE* ### Edit-IRI - The Atom Entry Edit IRI This is the endpoint to change a `partial` deposit in regards of metadata. In particular: - replace existing metadata (and archives) with new ones - add new metadata (and archives) - delete deposit HTTP verbs supported: *POST*, *PUT*, *DELETE* *Note:* We also refer to it as *Edit-SE-IRI*. ### SE-IRI - The SWORD Edit IRI The sword specification permits to merge this with EDIT-IRI, so we did. *Note:* We also refer to it as *Edit-SE-IRI*. ### State-IRI - The SWORD Statement IRI This is the IRI which can be used to retrieve a description of the object from the sword server, including the structure of the object and its state. This will be used as the operation status endpoint. HTTP verbs supported: *GET* ## Sources - [SWORD v2 specification](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) - [arxiv documentation](https://arxiv.org/help/submit_sword) - [Dataverse example](http://guides.dataverse.org/en/4.3/api/sword.html) - [SWORD used on HAL](https://api.archives-ouvertes.fr/docs/sword) - [xml examples for CCSD](https://github.com/CCSDForge/HAL/tree/master/Sword) diff --git a/docs/spec-loading.md b/docs/spec-loading.md index 1dea1c64..ef8780eb 100644 --- a/docs/spec-loading.md +++ b/docs/spec-loading.md @@ -1,222 +1,221 @@ # Loading specification (draft) This part discusses the deposit loading part on the server side. ## Tarball Loading The `swh-loader-tar` module is already able to inject tarballs in swh with very limited metadata (mainly the origin). The loading of the deposit will use the deposit's associated data: - the metadata - the archive(s) We will use the `synthetic` revision notion. To that revision will be associated the metadata. Those will be included in the hash computation, thus resulting in a unique identifier. ### Loading mapping Some of those metadata will also be included in the `origin_metadata` table. ``` origin | https://hal.inria.fr/hal-id | ------------------------------------|----------------------------------------| origin_visit | 1 :reception_date | origin_metadata | aggregated metadata | occurrence & occurrence_history | branch: client's version n° (e.g hal) | revision | synthetic_revision (tarball) | directory | upper level of the uncompressed archive| ``` ### Questions raised concerning loading - A deposit has one origin, yet an origin can have multiple deposits? No, an origin can have multiple requests for the same deposit. Which should end up in one single deposit (when the client pushes its final request saying deposit 'done' through the header In-Progress). Only update of existing 'partial' deposit is permitted. Other than that, the deposit 'update' operation. To create a new version of a software (already deposited), the client must prior to this create a new deposit. Illustration First deposit loading: HAL's deposit 01535619 = SWH's deposit **01535619-1** + 1 origin with url:https://hal.inria.fr/medihal-01535619 + 1 synthetic revision + 1 directory HAL's update on deposit 01535619 = SWH's deposit **01535619-2** (*with HAL updates can only be on the metadata and a new version is required if the content changes) + 1 origin with url:https://hal.inria.fr/medihal-01535619 + new synthetic revision (with new metadata) + same directory HAL's deposit 01535619-v2 = SWH's deposit **01535619-v2-1** + same origin + new revision + new directory ## Technical details ### Requirements - one dedicated database to store the deposit's state - swh-deposit - one dedicated temporary objstorage to store archives before loading - one client to test the communication with SWORD protocol ### Deposit reception schema - SWORD imposes the use of basic authentication, so we need a way to authenticate client. Also, a client can access collections: **deposit_client** table: - id (bigint): Client's identifier - username (str): Client's username - password (pass): Client's crypted password - collections ([id]): List of collections the client can access - Collections group deposits together: **deposit_collection** table: - id (bigint): Collection's identifier - name (str): Collection's human readable name - A deposit is the main object the repository is all about: **deposit** table: - id (bigint): deposit's identifier - reception_date (date): First deposit's reception date - complete_data (date): Date when the deposit is deemed complete and ready for loading - collection (id): The collection the deposit belongs to - external id (text): client's internal identifier (e.g hal's id, etc...). - client_id (id) : Client which did the deposit - swh_id (str) : swh identifier result once the loading is complete - status (enum): The deposit's current status - As mentioned, a deposit can have a status, whose possible values are: ``` text - 'partial', -- the deposit is new or partially received since it - -- can be done in multiple requests - 'expired', -- deposit has been there too long and is now deemed - -- ready to be garbage collected - 'ready-for-checks' -- ready for checks to ensure data coherency - 'ready-for-load', -- deposit is fully received, checked, and ready for loading - 'loading', -- loading is ongoing on swh's side - 'success', -- loading is successful - 'failure' -- loading is a failure + 'partial', -- the deposit is new or partially received since it + -- can be done in multiple requests + 'expired', -- deposit has been there too long and is now deemed + -- ready to be garbage collected + 'deposited' -- deposit complete, it is ready to be checked to ensure data consistency + 'verified', -- deposit is fully received, checked, and ready for loading + 'loading', -- loading is ongoing on swh's side + 'done', -- loading is successful + 'failed' -- loading is a failure ``` A deposit is stateful and can be made in multiple requests: **deposit_request** table: - id (bigint): identifier - type (id): deposit request's type (possible values: 'archive', 'metadata') - deposit_id (id): deposit whose request belongs to - metadata: metadata associated to the request - date (date): date of the requests Information sent along a request are stored in a `deposit_request` row. They can be either of type `metadata` (atom entry, multipart's atom entry part) or of type `archive` (binary upload, multipart's binary upload part). -When the deposit is complete (status `ready-for-checks`), those -`metadata` and `archive` deposit requests will be read and -aggregated. They will then be sent as parameters to the loading -routine. +When the deposit is complete (status `deposited`), those `metadata` +and `archive` deposit requests will be read and aggregated. They will +then be sent as parameters to the loading routine. During loading, some of those metadata are kept in the `origin_metadata` table and some other are stored in the `revision` table (see [metadata loading](#metadata-loading)). The only update actions occurring on the deposit table are in regards of: - status changing: - - `partial` -> {`expired`/`ready-for-checks`}, - - `ready-for-checks` -> {`rejected`/`ready-for-load`}, - - `ready-for-load` -> `loading` - - `loading` -> {`success`/`failure`} + - `partial` -> {`expired`/`deposited`}, + - `deposited` -> {`rejected`/`verified`}, + - `verified` -> `loading` + - `loading` -> {`done`/`failed`} - `complete_date` when the deposit is finalized (when the status is - changed to `ready-for-checks`) + changed to `deposited`) - `swh-id` is populated once we have the loading result #### SWH Identifier returned The synthetic revision id e.g: 47dc6b4636c7f6cba0df83e3d5490bf4334d987e ### Scheduling loading All `archive` and `metadata` deposit requests should be aggregated before loading. The loading should be scheduled via the scheduler's api. -Only `ready-for-checks` deposit are concerned by the loading. +Only `deposited` deposit are concerned by the loading. When the loading is done and successful, the deposit entry is updated: -- `status` is updated to `success` +- `status` is updated to `done` - `swh-id` is populated with the resulting hash (cf. [swh identifier](#swh-identifier-returned)) - `complete_date` is updated to the loading's finished time When the loading is failed, the deposit entry is updated: -- `status` is updated to `failure` +- `status` is updated to `failed` - `swh-id` and `complete_data` remains as is *Note:* As a further improvement, we may prefer having a retry policy with graceful delays for further scheduling. ### Metadata loading - the metadata received with the deposit should be kept in the `origin_metadata` table before translation as part of the loading process and an indexation process should be scheduled. - provider_id and tool_id are resolved by the prepare_metadata method in the loader-core - the origin_metadata entry is sent to storage by the send_origin_metadata in the loader-core origin_metadata table: ``` id bigint PK origin bigint discovery_date date provider_id bigint FK // (from provider table) tool_id bigint FK // indexer_configuration_id tool used for extraction metadata jsonb // before translation ``` diff --git a/requirements-swh.txt b/requirements-swh.txt index 0f21a5f2..842fb4e4 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,5 @@ swh.core >= 0.0.36 swh.loader.tar >= 0.0.32 swh.loader.core >= 0.0.27 swh.scheduler >= 0.0.19 +swh.model >= 0.0.21 diff --git a/requirements.txt b/requirements.txt index 29cf6c92..ac9fbfe5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ vcversioner - click Django djangorestframework djangorestframework-xml diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO index 78cb3cde..96b7393e 100644 --- a/swh.deposit.egg-info/PKG-INFO +++ b/swh.deposit.egg-info/PKG-INFO @@ -1,11 +1,11 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 0.0.42 +Version: 0.0.43 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN Provides-Extra: loader diff --git a/swh.deposit.egg-info/SOURCES.txt b/swh.deposit.egg-info/SOURCES.txt index 118f62fd..b0b158e4 100644 --- a/swh.deposit.egg-info/SOURCES.txt +++ b/swh.deposit.egg-info/SOURCES.txt @@ -1,142 +1,143 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile Makefile.local README.md requirements-swh.txt requirements.txt setup.py version.txt bin/Makefile bin/content.sh bin/create_deposit.sh bin/create_deposit_atom.sh bin/create_deposit_with_metadata.sh bin/default-setup bin/download-deposit-archive.sh bin/home.sh bin/replace-deposit-archive.sh bin/service-document.sh bin/status.sh bin/update-deposit-with-another-archive.sh bin/update-status.sh debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/dev-info.md docs/getting-started.md docs/index.rst docs/metadata.md docs/spec-api.md docs/spec-loading.md docs/sys-info.md docs/_static/.placeholder docs/_templates/.placeholder docs/images/deposit-create-chart.png docs/images/deposit-delete-chart.png docs/images/deposit-update-chart.png resources/deposit/server.yml swh/__init__.py swh/manage.py swh.deposit.egg-info/PKG-INFO swh.deposit.egg-info/SOURCES.txt swh.deposit.egg-info/dependency_links.txt swh.deposit.egg-info/requires.txt swh.deposit.egg-info/top_level.txt swh/deposit/__init__.py swh/deposit/apps.py swh/deposit/auth.py swh/deposit/config.py swh/deposit/create_user.py swh/deposit/errors.py swh/deposit/models.py swh/deposit/parsers.py swh/deposit/signals.py swh/deposit/urls.py swh/deposit/wsgi.py swh/deposit/api/__init__.py swh/deposit/api/common.py swh/deposit/api/deposit.py swh/deposit/api/deposit_content.py swh/deposit/api/deposit_status.py swh/deposit/api/deposit_update.py swh/deposit/api/service_document.py swh/deposit/api/urls.py swh/deposit/api/private/__init__.py swh/deposit/api/private/deposit_check.py swh/deposit/api/private/deposit_read.py swh/deposit/api/private/deposit_update_status.py swh/deposit/api/private/urls.py swh/deposit/fixtures/__init__.py swh/deposit/fixtures/deposit_data.yaml swh/deposit/loader/__init__.py swh/deposit/loader/checker.py swh/deposit/loader/client.py swh/deposit/loader/loader.py swh/deposit/loader/scheduler.py swh/deposit/loader/tasks.py swh/deposit/migrations/0001_initial.py swh/deposit/migrations/0002_depositrequest_archive.py swh/deposit/migrations/0003_temporaryarchive.py swh/deposit/migrations/0004_delete_temporaryarchive.py swh/deposit/migrations/0005_auto_20171019_1436.py swh/deposit/migrations/0006_depositclient_url.py swh/deposit/migrations/0007_auto_20171129_1609.py swh/deposit/migrations/0008_auto_20171130_1513.py swh/deposit/migrations/0009_deposit_parent.py swh/deposit/migrations/0010_auto_20180110_0953.py +swh/deposit/migrations/0011_auto_20180115_1510.py swh/deposit/migrations/__init__.py swh/deposit/settings/__init__.py swh/deposit/settings/common.py swh/deposit/settings/development.py swh/deposit/settings/production.py swh/deposit/settings/testing.py swh/deposit/static/robots.txt swh/deposit/static/css/bootstrap-responsive.min.css swh/deposit/static/css/style.css swh/deposit/static/img/arrow-up-small.png swh/deposit/static/img/swh-logo-deposit.png swh/deposit/static/img/swh-logo-deposit.svg swh/deposit/static/img/icons/swh-logo-32x32.png swh/deposit/static/img/icons/swh-logo-deposit-180x180.png swh/deposit/static/img/icons/swh-logo-deposit-192x192.png swh/deposit/static/img/icons/swh-logo-deposit-270x270.png swh/deposit/templates/__init__.py swh/deposit/templates/homepage.html swh/deposit/templates/layout.html swh/deposit/templates/deposit/__init__.py swh/deposit/templates/deposit/content.xml swh/deposit/templates/deposit/deposit_receipt.xml swh/deposit/templates/deposit/error.xml swh/deposit/templates/deposit/service_document.xml swh/deposit/templates/deposit/status.xml swh/deposit/templates/rest_framework/api.html swh/deposit/tests/__init__.py swh/deposit/tests/common.py swh/deposit/tests/api/__init__.py swh/deposit/tests/api/test_common.py swh/deposit/tests/api/test_deposit.py swh/deposit/tests/api/test_deposit_atom.py swh/deposit/tests/api/test_deposit_binary.py swh/deposit/tests/api/test_deposit_check.py swh/deposit/tests/api/test_deposit_delete.py swh/deposit/tests/api/test_deposit_multipart.py swh/deposit/tests/api/test_deposit_read_archive.py swh/deposit/tests/api/test_deposit_read_metadata.py swh/deposit/tests/api/test_deposit_status.py swh/deposit/tests/api/test_deposit_update.py swh/deposit/tests/api/test_deposit_update_status.py swh/deposit/tests/api/test_service_document.py swh/deposit/tests/loader/__init__.py swh/deposit/tests/loader/common.py swh/deposit/tests/loader/test_checker.py swh/deposit/tests/loader/test_client.py swh/deposit/tests/loader/test_loader.py \ No newline at end of file diff --git a/swh.deposit.egg-info/requires.txt b/swh.deposit.egg-info/requires.txt index f58d6757..27956dd7 100644 --- a/swh.deposit.egg-info/requires.txt +++ b/swh.deposit.egg-info/requires.txt @@ -1,14 +1,15 @@ Django click djangorestframework djangorestframework-xml swh.core>=0.0.36 swh.loader.core>=0.0.27 swh.loader.tar>=0.0.32 +swh.model>=0.0.21 swh.scheduler>=0.0.19 vcversioner [loader] requests swh.loader.core>=0.0.25 swh.scheduler>=0.0.19 diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index f5367763..5706d610 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,858 +1,863 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib from abc import ABCMeta, abstractmethod from django.core.urlresolvers import reverse from django.http import HttpResponse from django.shortcuts import render from django.utils import timezone from rest_framework import status from rest_framework.authentication import BasicAuthentication from rest_framework.permissions import IsAuthenticated, AllowAny from rest_framework.views import APIView from swh.model import hashutil from ..config import SWHDefaultConfig, EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI from ..config import ARCHIVE_KEY, METADATA_KEY, STATE_IRI -from ..config import DEPOSIT_STATUS_READY_FOR_CHECKS, DEPOSIT_STATUS_PARTIAL +from ..config import DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL from ..config import DEPOSIT_STATUS_LOAD_SUCCESS from ..errors import MAX_UPLOAD_SIZE_EXCEEDED, BAD_REQUEST, ERROR_CONTENT from ..errors import CHECKSUM_MISMATCH, make_error_dict, MEDIATION_NOT_ALLOWED from ..errors import make_error_response_from_dict, FORBIDDEN from ..errors import NOT_FOUND, make_error_response, METHOD_NOT_ALLOWED from ..models import Deposit, DepositRequest, DepositCollection from ..models import DepositRequestType, DepositClient from ..parsers import parse_xml ACCEPT_PACKAGINGS = ['http://purl.org/net/sword/package/SimpleZip'] -ACCEPT_CONTENT_TYPES = ['application/zip'] +ACCEPT_ARCHIVE_CONTENT_TYPES = ['application/zip', 'application/x-tar'] class SWHAPIView(APIView): """Mixin intended as a based API view to enforce the basic authentication check """ authentication_classes = (BasicAuthentication, ) permission_classes = (IsAuthenticated, ) class SWHPrivateAPIView(SWHAPIView): """Mixin intended as private api (so no authentication) based API view (for the private ones). """ authentication_classes = () permission_classes = (AllowAny, ) class SWHBaseDeposit(SWHDefaultConfig, SWHAPIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ def __init__(self): super().__init__() deposit_request_types = DepositRequestType.objects.all() self.deposit_request_types = { type.name: type for type in deposit_request_types } def _read_headers(self, req): """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted). Args: req (Request): Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = req._request.META content_type = req.content_type content_length = meta.get('CONTENT_LENGTH') if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get('HTTP_IN_PROGRESS', False) content_disposition = meta.get('HTTP_CONTENT_DISPOSITION') if isinstance(in_progress, str): in_progress = in_progress.lower() == 'true' content_md5sum = meta.get('HTTP_CONTENT_MD5') if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) packaging = meta.get('HTTP_PACKAGING') slug = meta.get('HTTP_SLUG') on_behalf_of = meta.get('HTTP_ON_BEHALF_OF') metadata_relevant = meta.get('HTTP_METADATA_RELEVANT') return { 'content-type': content_type, 'content-length': content_length, 'in-progress': in_progress, 'content-disposition': content_disposition, 'content-md5sum': content_md5sum, 'packaging': packaging, 'slug': slug, 'on-behalf-of': on_behalf_of, 'metadata-relevant': metadata_relevant, } def _compute_md5(self, filehandler): """Compute uploaded file's md5 sum. Args: filehandler (InMemoryUploadedFile): the file to compute the md5 hash Returns: the md5 checksum (str) """ h = hashlib.md5() for chunk in filehandler: h.update(chunk) return h.digest() def _deposit_put(self, deposit_id=None, in_progress=False, external_id=None): """Save/Update a deposit in db. Args: deposit_id (int): deposit identifier in_progress (dict): The deposit's status external_id (str): The external identifier to associate to the deposit Returns: The Deposit instance saved or updated. """ if in_progress is False: complete_date = timezone.now() - status_type = DEPOSIT_STATUS_READY_FOR_CHECKS + status_type = DEPOSIT_STATUS_DEPOSITED else: complete_date = None status_type = DEPOSIT_STATUS_PARTIAL if not deposit_id: try: # find a deposit parent (same external id, status load # to success) deposit_parent = Deposit.objects.filter( external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS).order_by('-id')[0:1].get() # noqa except Deposit.DoesNotExist: deposit_parent = None deposit = Deposit(collection=self._collection, external_id=external_id, complete_date=complete_date, status=status_type, client=self._client, parent=deposit_parent) else: deposit = Deposit.objects.get(pk=deposit_id) # update metadata deposit.complete_date = complete_date deposit.status = status_type deposit.save() return deposit def _deposit_request_put(self, deposit, deposit_request_data, replace_metadata=False, replace_archives=False): """Save a deposit request with metadata attached to a deposit. Args: deposit (Deposit): The deposit concerned by the request deposit_request_data (dict): The dictionary with at most 2 deposit request types (archive, metadata) to associate to the deposit replace_metadata (bool): Flag defining if we add or update existing metadata to the deposit replace_archives (bool): Flag defining if we add or update archives to existing deposit Returns: None """ if replace_metadata: DepositRequest.objects.filter( deposit=deposit, type=self.deposit_request_types[METADATA_KEY]).delete() if replace_archives: DepositRequest.objects.filter( deposit=deposit, type=self.deposit_request_types[ARCHIVE_KEY]).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=self.deposit_request_types[ARCHIVE_KEY], deposit=deposit, archive=archive_file) deposit_request.save() metadata = deposit_request_data.get(METADATA_KEY) if metadata: deposit_request = DepositRequest( type=self.deposit_request_types[METADATA_KEY], deposit=deposit, metadata=metadata) deposit_request.save() assert deposit_request is not None def _delete_archives(self, collection_name, deposit_id): """Delete archives reference from the deposit id. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, 'The deposit %s does not exist' % deposit_id) DepositRequest.objects.filter( deposit=deposit, type=self.deposit_request_types[ARCHIVE_KEY]).delete() return {} def _delete_deposit(self, collection_name, deposit_id): """Delete deposit reference. Args: collection_name (str): Client's name deposit_id (id): The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, 'The deposit %s does not exist' % deposit_id) if deposit.collection.name != collection_name: summary = 'Cannot delete a deposit from another collection' description = "Deposit %s does not belong to the collection %s" % ( deposit_id, collection_name) return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} def _check_preconditions_on(self, filehandler, md5sum, content_length=None): """Check preconditions on provided file are respected. That is the length and/or the md5sum hash match the file's content. Args: filehandler (InMemoryUploadedFile): The file to check md5sum (hex str): md5 hash expected from the file's content content_length (int): the expected length if provided. Returns: Either none if no error or a dictionary with a key error detailing the problem. """ if content_length: if content_length > self.config['max_upload_size']: return make_error_dict( MAX_UPLOAD_SIZE_EXCEEDED, 'Upload size limit exceeded (max %s bytes).' % self.config['max_upload_size'], 'Please consider sending the archive in ' 'multiple steps.') length = filehandler.size if length != content_length: return make_error_dict(status.HTTP_412_PRECONDITION_FAILED, 'Wrong length') if md5sum: _md5sum = self._compute_md5(filehandler) if _md5sum != md5sum: return make_error_dict( CHECKSUM_MISMATCH, 'Wrong md5 hash', 'The checksum sent %s and the actual checksum ' '%s does not match.' % (hashutil.hash_to_hex(md5sum), hashutil.hash_to_hex(_md5sum))) return None def _binary_upload(self, req, headers, collection_name, deposit_id=None, replace_metadata=False, replace_archives=False): """Binary upload routine. Other than such a request, a 415 response is returned. Args: req (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers['content-length'] if not content_length: return make_error_dict( BAD_REQUEST, 'CONTENT_LENGTH header is mandatory', 'For archive deposit, the ' 'CONTENT_LENGTH header must be sent.') content_disposition = headers['content-disposition'] if not content_disposition: return make_error_dict( BAD_REQUEST, 'CONTENT_DISPOSITION header is mandatory', 'For archive deposit, the ' 'CONTENT_DISPOSITION header must be sent.') packaging = headers['packaging'] if packaging and packaging not in ACCEPT_PACKAGINGS: return make_error_dict( BAD_REQUEST, 'Only packaging %s is supported' % ACCEPT_PACKAGINGS, 'The packaging provided %s is not supported' % packaging) filehandler = req.FILES['file'] precondition_status_response = self._check_preconditions_on( filehandler, headers['content-md5sum'], content_length) if precondition_status_response: return precondition_status_response external_id = headers['slug'] # actual storage of data archive_metadata = filehandler deposit = self._deposit_put(deposit_id=deposit_id, in_progress=headers['in-progress'], external_id=external_id) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives) return { 'deposit_id': deposit.id, 'deposit_date': deposit.reception_date, 'status': deposit.status, 'archive': filehandler.name, } def _multipart_upload(self, req, headers, collection_name, deposit_id=None, replace_metadata=False, replace_archives=False): """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: req (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ external_id = headers['slug'] content_types_present = set() data = { - 'application/zip': None, # expected archive + 'application/zip': None, # expected either zip + 'application/x-tar': None, # or x-tar 'application/atom+xml': None, } for key, value in req.FILES.items(): fh = value if fh.content_type in content_types_present: return make_error_dict( ERROR_CONTENT, - 'Only 1 application/zip archive and 1 ' - 'atom+xml entry is supported (as per sword2.0 ' + 'Only 1 application/zip (or application/x-tar) archive ' + 'and 1 atom+xml entry is supported (as per sword2.0 ' 'specification)', - 'You provided more than 1 application/zip ' + 'You provided more than 1 application/(zip|x-tar) ' 'or more than 1 application/atom+xml content-disposition ' 'header in the multipart deposit') content_types_present.add(fh.content_type) data[fh.content_type] = fh if len(content_types_present) != 2: return make_error_dict( ERROR_CONTENT, - 'You must provide both 1 application/zip ' - 'and 1 atom+xml entry for multipart deposit', - 'You need to provide only 1 application/zip ' + 'You must provide both 1 application/zip (or ' + 'application/x-tar) and 1 atom+xml entry for multipart ' + 'deposit', + 'You need to provide only 1 application/(zip|x-tar) ' 'and 1 application/atom+xml content-disposition header ' 'in the multipart deposit') filehandler = data['application/zip'] + if not filehandler: + filehandler = data['application/x-tar'] + precondition_status_response = self._check_preconditions_on( filehandler, headers['content-md5sum']) if precondition_status_response: return precondition_status_response # actual storage of data atom_metadata = parse_xml(data['application/atom+xml']) deposit = self._deposit_put(deposit_id=deposit_id, in_progress=headers['in-progress'], external_id=external_id) deposit_request_data = { ARCHIVE_KEY: filehandler, METADATA_KEY: atom_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives) return { 'deposit_id': deposit.id, 'deposit_date': deposit.reception_date, 'archive': filehandler.name, 'status': deposit.status, } def _atom_entry(self, req, headers, collection_name, deposit_id=None, replace_metadata=False, replace_archives=False): """Atom entry deposit. Args: req (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id: deposit id associated to the deposit - deposit_date: date of the deposit - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ if not req.data: return make_error_dict( BAD_REQUEST, 'Empty body request is not supported', 'Atom entry deposit is supposed to send for metadata. ' 'If the body is empty, there is no metadata.') external_id = req.data.get( '{http://www.w3.org/2005/Atom}external_identifier', headers['slug']) deposit = self._deposit_put(deposit_id=deposit_id, in_progress=headers['in-progress'], external_id=external_id) self._deposit_request_put( deposit, {METADATA_KEY: req.data}, replace_metadata, replace_archives) return { 'deposit_id': deposit.id, 'deposit_date': deposit.reception_date, 'archive': None, 'status': deposit.status, } def _empty_post(self, req, headers, collection_name, deposit_id): """Empty post to finalize an empty deposit. Args: req (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier Returns: Dictionary of result with the deposit's id, the date it was completed and no archive. """ deposit = Deposit.objects.get(pk=deposit_id) deposit.complete_date = timezone.now() - deposit.status = DEPOSIT_STATUS_READY_FOR_CHECKS + deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return { 'deposit_id': deposit_id, 'deposit_date': deposit.complete_date, 'status': deposit.status, 'archive': None, } def _make_iris(self, req, collection_name, deposit_id): """Define the IRI endpoints Args: req (Request): The initial request collection_name (str): client/collection's name deposit_id (id): Deposit identifier Returns: Dictionary of keys with the iris' urls. """ args = [collection_name, deposit_id] return { iri: req.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI] } def additional_checks(self, req, headers, collection_name, deposit_id=None): """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. """ return {} def checks(self, req, collection_name, deposit_id=None): try: self._collection = DepositCollection.objects.get( name=collection_name) except DepositCollection.DoesNotExist: return make_error_dict( NOT_FOUND, 'Unknown collection name %s' % collection_name) username = req.user.username if username: # unauthenticated request can have the username empty try: self._client = DepositClient.objects.get(username=username) except DepositClient.DoesNotExist: return make_error_dict(NOT_FOUND, 'Unknown client name %s' % username) if self._collection.id not in self._client.collections: return make_error_dict( FORBIDDEN, 'Client %s cannot access collection %s' % ( username, collection_name)) if deposit_id: try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, 'Deposit with id %s does not exist' % deposit_id) checks = self.restrict_access(req, deposit) if checks: return checks headers = self._read_headers(req) if headers['on-behalf-of']: return make_error_dict(MEDIATION_NOT_ALLOWED, 'Mediation is not supported.') checks = self.additional_checks(req, headers, collection_name, deposit_id) if 'error' in checks: return checks return {'headers': headers} def restrict_access(self, req, deposit=None): if deposit: if (req.method != 'GET' and deposit.status != DEPOSIT_STATUS_PARTIAL): summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) description = "This deposit has status '%s'" % deposit.status return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description) def _basic_not_allowed_method(self, req, method): return make_error_response( req, METHOD_NOT_ALLOWED, '%s method is not supported on this endpoint' % method) def get(self, req, *args, **kwargs): return self._basic_not_allowed_method(req, 'GET') def post(self, req, *args, **kwargs): return self._basic_not_allowed_method(req, 'POST') def put(self, req, *args, **kwargs): return self._basic_not_allowed_method(req, 'PUT') def delete(self, req, *args, **kwargs): return self._basic_not_allowed_method(req, 'DELETE') class SWHGetDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): """Mixin for class to support GET method. """ def get(self, req, collection_name, deposit_id, format=None): """Endpoint to create/add resources to deposit. Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(req, collection_name, deposit_id) if 'error' in checks: return make_error_response_from_dict(req, checks['error']) r = self.process_get( req, collection_name, deposit_id) if isinstance(r, tuple): status, content, content_type = r return HttpResponse(content, status=status, content_type=content_type) return r @abstractmethod def process_get(self, req, collection_name, deposit_id): """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass class SWHPostDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def post(self, req, collection_name, deposit_id=None, format=None): """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(req, collection_name, deposit_id) if 'error' in checks: return make_error_response_from_dict(req, checks['error']) headers = checks['headers'] _status, _iri_key, data = self.process_post( req, headers, collection_name, deposit_id) error = data.get('error') if error: return make_error_response_from_dict(req, error) data['packagings'] = ACCEPT_PACKAGINGS iris = self._make_iris(req, collection_name, data['deposit_id']) data.update(iris) response = render(req, 'deposit/deposit_receipt.xml', context=data, content_type='application/xml', status=_status) response._headers['location'] = 'Location', data[_iri_key] return response @abstractmethod def process_post(self, req, headers, collection_name, deposit_id=None): """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_SE_IRI, etc...) - dictionary of the processing result """ pass class SWHPutDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): """Mixin for class to support PUT method. """ def put(self, req, collection_name, deposit_id, format=None): """Endpoint to update deposit resources. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(req, collection_name, deposit_id) if 'error' in checks: return make_error_response_from_dict(req, checks['error']) headers = checks['headers'] data = self.process_put(req, headers, collection_name, deposit_id) error = data.get('error') if error: return make_error_response_from_dict(req, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_put(self, req, headers, collection_name, deposit_id): """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass class SWHDeleteDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ def delete(self, req, collection_name, deposit_id): """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(req, collection_name, deposit_id) if 'error' in checks: return make_error_response_from_dict(req, checks['error']) data = self.process_delete(req, collection_name, deposit_id) error = data.get('error') if error: return make_error_response_from_dict(req, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod def process_delete(self, req, collection_name, deposit_id): """Routine to delete a resource. This is mostly not allowed except for the EM_IRI (cf. .api.deposit_update.SWHUpdateArchiveDeposit) """ pass diff --git a/swh/deposit/api/deposit.py b/swh/deposit/api/deposit.py index 77f73b9c..fb7ec49a 100644 --- a/swh/deposit/api/deposit.py +++ b/swh/deposit/api/deposit.py @@ -1,91 +1,93 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework import status -from .common import SWHPostDepositAPI +from .common import SWHPostDepositAPI, ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import EDIT_SE_IRI from ..errors import make_error_dict, BAD_REQUEST -from ..parsers import SWHFileUploadParser, SWHAtomEntryParser +from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser +from ..parsers import SWHAtomEntryParser from ..parsers import SWHMultiPartParser class SWHDeposit(SWHPostDepositAPI): """Deposit request class defining api endpoints for sword deposit. What's known as 'Col IRI' in the sword specification. HTTP verbs supported: POST """ parser_classes = (SWHMultiPartParser, - SWHFileUploadParser, + SWHFileUploadZipParser, + SWHFileUploadTarParser, SWHAtomEntryParser) def additional_checks(self, req, headers, collection_name, deposit_id=None): slug = headers['slug'] if not slug: msg = 'Missing SLUG header in request' verbose_description = 'Provide in the SLUG header one identifier, for example the url pointing to the resource you are depositing.' # noqa return make_error_dict(BAD_REQUEST, msg, verbose_description) return {} def process_post(self, req, headers, collection_name, deposit_id=None): """Create a first deposit as: - archive deposit (1 zip) - multipart (1 zip + 1 atom entry) - atom entry Args: req (Request): the request holding the information to parse and inject in db collection_name (str): the associated client Returns: An http response (HttpResponse) according to the situation. If everything is ok, a 201 response (created) with a deposit receipt. Otherwise, depending on the upload, the following errors can be returned: - archive deposit: - 400 (bad request) if the request is not providing an external identifier - 403 (forbidden) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or hash provided mismatch the reality of the archive. - 415 (unsupported media type) if a wrong media type is provided - multipart deposit: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided - Atom entry deposit: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ assert deposit_id is None - if req.content_type == 'application/zip': + if req.content_type in ACCEPT_ARCHIVE_CONTENT_TYPES: data = self._binary_upload(req, headers, collection_name) elif req.content_type.startswith('multipart/'): data = self._multipart_upload(req, headers, collection_name) else: data = self._atom_entry(req, headers, collection_name) return status.HTTP_201_CREATED, EDIT_SE_IRI, data diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py index d99563b2..7c2a210b 100644 --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -1,155 +1,158 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework import status from .common import SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI +from .common import ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI from ..errors import make_error_response, BAD_REQUEST -from ..parsers import SWHFileUploadParser, SWHAtomEntryParser +from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser +from ..parsers import SWHAtomEntryParser from ..parsers import SWHMultiPartParser class SWHUpdateArchiveDeposit(SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI): """Deposit request class defining api endpoints for sword deposit. What's known as 'EM IRI' in the sword specification. HTTP verbs supported: PUT, POST, DELETE """ - parser_classes = (SWHFileUploadParser, ) + parser_classes = (SWHFileUploadZipParser, SWHFileUploadTarParser, ) def process_put(self, req, headers, collection_name, deposit_id): """Replace existing content for the existing deposit. +header: Metadata-relevant (to extract metadata from the archive) source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html #protocoloperations_editingcontent_binary Returns: 204 No content """ - if req.content_type != 'application/zip': - return make_error_response(req, BAD_REQUEST, - 'Only application/zip is supported!') + if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: + msg = 'Packaging format supported is restricted to %s' % ( + ', '.join(ACCEPT_ARCHIVE_CONTENT_TYPES)) + return make_error_response(req, BAD_REQUEST, msg) return self._binary_upload(req, headers, collection_name, deposit_id=deposit_id, replace_archives=True) def process_post(self, req, headers, collection_name, deposit_id): """Add new content to the existing deposit. +header: Metadata-relevant (to extract metadata from the archive) source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html #protocoloperations_addingcontent_mediaresource Returns: 201 Created Headers: Location: [Cont-File-IRI] Body: [optional Deposit Receipt] """ if req.content_type != 'application/zip': return make_error_response(req, BAD_REQUEST, 'Only application/zip is supported!') return (status.HTTP_201_CREATED, CONT_FILE_IRI, self._binary_upload(req, headers, collection_name, deposit_id)) def process_delete(self, req, collection_name, deposit_id): """Delete content (archives) from existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html #protocoloperations_deletingcontent Returns: 204 Created """ return self._delete_archives(collection_name, deposit_id) class SWHUpdateMetadataDeposit(SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI): """Deposit request class defining api endpoints for sword deposit. What's known as 'Edit IRI' (and SE IRI) in the sword specification. HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) def process_put(self, req, headers, collection_name, deposit_id): """Replace existing deposit's metadata/archive with new ones. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html #protocoloperations_editingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html #protocoloperations_editingcontent_multipart Returns: 204 No content """ if req.content_type.startswith('multipart/'): return self._multipart_upload(req, headers, collection_name, deposit_id=deposit_id, replace_archives=True, replace_metadata=True) return self._atom_entry(req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True) def process_post(self, req, headers, collection_name, deposit_id): """Add new metadata/archive to existing deposit. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html #protocoloperations_addingcontent_metadata - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html #protocoloperations_addingcontent_multipart This also deals with an empty post corner case to finalize a deposit. Returns: In optimal case for a multipart and atom-entry update, a 201 Created response. The body response will hold a deposit. And the response headers will contain an entry 'Location' with the EM-IRI. For the empty post case, this returns a 200. """ if req.content_type.startswith('multipart/'): return (status.HTTP_201_CREATED, EM_IRI, self._multipart_upload(req, headers, collection_name, deposit_id=deposit_id)) # check for final empty post # source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html # #continueddeposit_complete if headers['content-length'] == 0 and headers['in-progress'] is False: data = self._empty_post(req, headers, collection_name, deposit_id) return (status.HTTP_200_OK, EDIT_SE_IRI, data) return (status.HTTP_201_CREATED, EM_IRI, self._atom_entry(req, headers, collection_name, deposit_id=deposit_id)) def process_delete(self, req, collection_name, deposit_id): """Delete the container (deposit). Source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html #protocoloperations_deleteconteiner """ return self._delete_deposit(collection_name, deposit_id) diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py index 93d853b0..a142069c 100644 --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -1,181 +1,183 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json -import zipfile +import patoolib from rest_framework import status from ..common import SWHGetDepositAPI, SWHPrivateAPIView -from ...config import DEPOSIT_STATUS_READY, DEPOSIT_STATUS_REJECTED +from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED from ...config import ARCHIVE_TYPE, METADATA_TYPE from ...models import Deposit, DepositRequest class SWHChecksDeposit(SWHGetDepositAPI, SWHPrivateAPIView): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ def _deposit_requests(self, deposit, request_type): """Given a deposit, yields its associated deposit_request Args: deposit (Deposit): Deposit to list requests for request_type (str): Archive or metadata type Yields: deposit requests of type request_type associated to the deposit """ deposit_requests = DepositRequest.objects.filter( type=self.deposit_request_types[request_type], deposit=deposit).order_by('id') for deposit_request in deposit_requests: yield deposit_request def _check_deposit_archives(self, deposit): """Given a deposit, check each deposit request of type archive. Args: The deposit to check archives for Returns True if all archives are ok, False otherwise. """ requests = list(self._deposit_requests( deposit, request_type=ARCHIVE_TYPE)) if len(requests) == 0: # no associated archive is refused return False for dr in requests: - check = self._check_archive(dr.archive) + check = self._check_archive(dr.archive.path) if not check: return False return True - def _check_archive(self, archive): + def _check_archive(self, archive_path): """Check that a given archive is actually ok for reading. Args: - archive (File): Archive to check + archive_path (str): Archive to check Returns: True if archive is successfully read, False otherwise. """ try: - zf = zipfile.ZipFile(archive.path) - zf.infolist() - except Exception as e: + patoolib.test_archive(archive_path, verbosity=-1) + except: return False else: return True def _metadata_get(self, deposit): """Given a deposit, aggregate all metadata requests. Args: The deposit to check metadata for. Returns: True if the deposit's associated metadata are ok, False otherwise. """ metadata = {} for dr in self._deposit_requests(deposit, request_type=METADATA_TYPE): metadata.update(dr.metadata) return metadata def _check_metadata(self, metadata): """Check to execute on all metadata for mandatory field presence. Args: metadata (dict): Metadata to actually check Returns: True if metadata is ok, False otherwise. """ required_fields = (('url',), ('external_identifier',), ('name', 'title'), ('author',)) result = all(any(name in field for field in metadata for name in possible_names) for possible_names in required_fields) return result def _check_url(self, client_domain, metadata): """Check compatibility between client_domain and url field in metadata Args: client_domain (str): url associated with the deposit's client metadata (dict): Metadata where to find url Returns: True if url is ok, False otherwise. """ metadata_urls = [] for field in metadata: if 'url' in field: metadata_urls.append(metadata[field]) return any(client_domain in url for url in metadata_urls) def process_get(self, req, collection_name, deposit_id): """Build a unique tarball from the multiple received and stream that content to the client. Args: req (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ deposit = Deposit.objects.get(pk=deposit_id) client_domain = deposit.client.domain metadata = self._metadata_get(deposit) problems = [] # will check each deposit's associated request (both of type # archive and metadata) for errors archives_status = self._check_deposit_archives(deposit) if not archives_status: problems.append('archive(s)') metadata_status = self._check_metadata(metadata) if not metadata_status: problems.append('metadata') url_status = self._check_url(client_domain, metadata) if not url_status: problems.append('url') deposit_status = archives_status and metadata_status and url_status # if any problems arose, the deposit is rejected if not deposit_status: deposit.status = DEPOSIT_STATUS_REJECTED + response = { + 'status': deposit.status, + 'details': 'Some %s failed the checks.' % ( + ' and '.join(problems), ), + } else: - deposit.status = DEPOSIT_STATUS_READY + deposit.status = DEPOSIT_STATUS_VERIFIED + response = { + 'status': deposit.status, + } + deposit.save() - return (status.HTTP_200_OK, - json.dumps({ - 'status': deposit.status, - 'details': 'Some %s failed the checks.' % ( - ' and '.join(problems), ), - }), - 'application/json') + return status.HTTP_200_OK, json.dumps(response), 'application/json' diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index fc5d3dbc..02fd80e0 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,235 +1,239 @@ -# Copyright (C) 2017 -2018 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os import shutil import tempfile from contextlib import contextmanager from django.http import FileResponse from rest_framework import status from swh.core import tarball from swh.model import identifiers from ...config import SWH_PERSON from ..common import SWHGetDepositAPI, SWHPrivateAPIView from ...models import Deposit, DepositRequest @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ if len(archive_paths) > 1: # need to rebuild one archive # from multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix='swh.deposit-', dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, 'aggregate') os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = tarball.compress( aggregated_tarball_rootdir + '.zip', nature='zip', dirpath_or_files=aggregated_tarball_rootdir) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) else: # only 1 archive, no need to do fancy actions (and no cleanup step) yield archive_paths[0] class SWHDepositReadArchives(SWHGetDepositAPI, SWHPrivateAPIView): """Dedicated class to read a deposit's raw archives content. Only GET is supported. """ ADDITIONAL_CONFIG = { 'extraction_dir': ('str', '/tmp/swh-deposit/archive/'), } def __init__(self): super().__init__() self.extraction_dir = self.config['extraction_dir'] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) def retrieve_archives(self, deposit_id): """Given a deposit identifier, returns its associated archives' path. Yields: path to deposited archives """ deposit = Deposit.objects.get(pk=deposit_id) deposit_requests = DepositRequest.objects.filter( deposit=deposit, type=self.deposit_request_types['archive']).order_by('id') for deposit_request in deposit_requests: yield deposit_request.archive.path def process_get(self, req, collection_name, deposit_id): """Build a unique tarball from the multiple received and stream that content to the client. Args: req (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ archive_paths = list(self.retrieve_archives(deposit_id)) with aggregate_tarballs(self.extraction_dir, archive_paths) as path: return FileResponse(open(path, 'rb'), status=status.HTTP_200_OK, content_type='application/octet-stream') class SWHDepositReadMetadata(SWHGetDepositAPI, SWHPrivateAPIView): """Class in charge of aggregating metadata on a deposit. """ ADDITIONAL_CONFIG = { 'provider': ('dict', { # 'provider_name': '', # those are not set since read from the # 'provider_url': '', # deposit's client 'provider_type': 'deposit_client', 'metadata': {} }), 'tool': ('dict', { 'name': 'swh-deposit', 'version': '0.0.1', 'configuration': { 'sword_version': '2' } }) } def __init__(self): super().__init__() self.provider = self.config['provider'] self.tool = self.config['tool'] def _aggregate_metadata(self, deposit, metadata_requests): """Retrieve and aggregates metadata information. """ metadata = {} for req in metadata_requests: metadata.update(req.metadata) return metadata def _retrieve_url(self, deposit, metadata): client_domain = deposit.client.domain for field in metadata: if 'url' in field: if client_domain in metadata[field]: return metadata[field] def aggregate(self, deposit, requests): """Aggregate multiple data on deposit into one unified data dictionary. Args: deposit (Deposit): Deposit concerned by the data aggregation. requests ([DepositRequest]): List of associated requests which need aggregation. Returns: Dictionary of data representing the deposit to inject in swh. """ data = {} # Retrieve tarballs/metadata information metadata = self._aggregate_metadata(deposit, requests) # create origin_url from metadata only after deposit_check validates it origin_url = self._retrieve_url(deposit, metadata) # Read information metadata data['origin'] = { 'type': 'deposit', 'url': origin_url } # revision fullname = deposit.client.get_full_name() author_committer = SWH_PERSON # metadata provider self.provider['provider_name'] = deposit.client.last_name self.provider['provider_url'] = deposit.client.provider_url revision_type = 'tar' revision_msg = '%s: Deposit %s in collection %s' % ( fullname, deposit.id, deposit.collection.name) complete_date = identifiers.normalize_timestamp(deposit.complete_date) data['revision'] = { 'synthetic': True, 'date': complete_date, 'committer_date': complete_date, 'author': author_committer, 'committer': author_committer, 'type': revision_type, 'message': revision_msg, 'metadata': metadata, } if deposit.parent: - parent_revision = deposit.parent.swh_id + swh_persistent_id = deposit.parent.swh_id + persistent_identifier = identifiers.parse_persistent_identifier( + swh_persistent_id) + parent_revision = persistent_identifier['object_id'] + data['revision']['parents'] = [parent_revision] data['occurrence'] = { 'branch': 'master' } data['origin_metadata'] = { 'provider': self.provider, 'tool': self.tool, 'metadata': metadata } return data def process_get(self, req, collection_name, deposit_id): deposit = Deposit.objects.get(pk=deposit_id) requests = DepositRequest.objects.filter( deposit=deposit, type=self.deposit_request_types['metadata']) data = self.aggregate(deposit, requests) d = {} if data: d = json.dumps(data) return status.HTTP_200_OK, d, 'application/json' diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index bb87d474..cbb8ccce 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,71 +1,74 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.parsers import JSONParser +from swh.model.identifiers import persistent_identifier, REVISION + from ..common import SWHPutDepositAPI, SWHPrivateAPIView from ...errors import make_error_dict, BAD_REQUEST from ...models import Deposit, DEPOSIT_STATUS_DETAIL +from ...models import DEPOSIT_STATUS_LOAD_SUCCESS class SWHUpdateStatusDeposit(SWHPutDepositAPI, SWHPrivateAPIView): """Deposit request class to update the deposit's status. HTTP verbs supported: PUT """ parser_classes = (JSONParser, ) def additional_checks(self, req, headers, collection_name, deposit_id=None): """Enrich existing checks to the default ones. New checks: - Ensure the status is provided - Ensure it exists """ data = req.data status = data.get('status') if not status: msg = 'The status key is mandatory with possible values %s' % list( DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) if status not in DEPOSIT_STATUS_DETAIL: msg = 'Possible status in %s' % list(DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) - if status == 'success': + if status == DEPOSIT_STATUS_LOAD_SUCCESS: swh_id = data.get('revision_id') if not swh_id: msg = 'Updating status to %s requires a revision_id key' % ( status, ) return make_error_dict(BAD_REQUEST, msg) return {} def restrict_access(self, req, deposit=None): """Remove restriction modification to 'partial' deposit. Update is possible regardless of the existing status. """ return None def process_put(self, req, headers, collection_name, deposit_id): """Update the deposit's status Returns: 204 No content """ deposit = Deposit.objects.get(pk=deposit_id) deposit.status = req.data['status'] # checks already done before swh_id = req.data.get('revision_id') if swh_id: - deposit.swh_id = swh_id + deposit.swh_id = persistent_identifier(REVISION, {'id': swh_id}) deposit.save() return {} diff --git a/swh/deposit/api/service_document.py b/swh/deposit/api/service_document.py index 35905247..9ecac083 100644 --- a/swh/deposit/api/service_document.py +++ b/swh/deposit/api/service_document.py @@ -1,28 +1,29 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import render -from .common import SWHBaseDeposit, ACCEPT_PACKAGINGS, ACCEPT_CONTENT_TYPES +from .common import SWHBaseDeposit, ACCEPT_PACKAGINGS +from .common import ACCEPT_ARCHIVE_CONTENT_TYPES from ..models import DepositClient, DepositCollection class SWHServiceDocument(SWHBaseDeposit): def get(self, req, *args, **kwargs): client = DepositClient.objects.get(username=req.user) collections = [] for col_id in client.collections: col = DepositCollection.objects.get(pk=col_id) collections.append(col) context = { 'max_upload_size': self.config['max_upload_size'], 'accept_packagings': ACCEPT_PACKAGINGS, - 'accept_content_types': ACCEPT_CONTENT_TYPES, + 'accept_content_types': ACCEPT_ARCHIVE_CONTENT_TYPES, 'collections': collections, } return render(req, 'deposit/service_document.xml', context, content_type='application/xml') diff --git a/swh/deposit/config.py b/swh/deposit/config.py index f2da70d6..d3a1a8ae 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,93 +1,93 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging from swh.core.config import SWHConfig # IRIs (Internationalized Resource identifier) sword 2.0 specified EDIT_SE_IRI = 'edit_se_iri' EM_IRI = 'em_iri' CONT_FILE_IRI = 'cont_file_iri' SD_IRI = 'servicedocument' COL_IRI = 'upload' STATE_IRI = 'state_iri' PRIVATE_GET_RAW_CONTENT = 'private-download' PRIVATE_CHECK_DEPOSIT = 'check-deposit' PRIVATE_PUT_DEPOSIT = 'private-update' PRIVATE_GET_DEPOSIT_METADATA = 'private-read' ARCHIVE_KEY = 'archive' METADATA_KEY = 'metadata' ARCHIVE_TYPE = 'archive' METADATA_TYPE = 'metadata' AUTHORIZED_PLATFORMS = ['development', 'production', 'testing'] DEPOSIT_STATUS_REJECTED = 'rejected' DEPOSIT_STATUS_PARTIAL = 'partial' -DEPOSIT_STATUS_READY = 'ready-for-load' -DEPOSIT_STATUS_READY_FOR_CHECKS = 'ready-for-checks' -DEPOSIT_STATUS_LOAD_SUCCESS = 'success' -DEPOSIT_STATUS_LOAD_FAILURE = 'failure' +DEPOSIT_STATUS_DEPOSITED = 'deposited' +DEPOSIT_STATUS_VERIFIED = 'verified' +DEPOSIT_STATUS_LOAD_SUCCESS = 'done' +DEPOSIT_STATUS_LOAD_FAILURE = 'failed' # Revision author for deposit SWH_PERSON = { 'name': 'Software Heritage', 'fullname': 'Software Heritage', 'email': 'robot@softwareheritage.org' } def setup_django_for(platform): """Setup function for command line tools (swh.deposit.create_user, swh.deposit.scheduler.cli) to initialize the needed db access. Note: Do not import any django related module prior to this function call. Otherwise, this will raise an django.core.exceptions.ImproperlyConfigured error message. Args: platform (str): the platform the scheduling is running Raises: ValueError in case of wrong platform inputs. """ if platform not in AUTHORIZED_PLATFORMS: raise ValueError('Platform should be one of %s' % AUTHORIZED_PLATFORMS) os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'swh.deposit.settings.%s' % platform) import django django.setup() class SWHDefaultConfig(SWHConfig): """Mixin intended to enrich views with SWH configuration. """ CONFIG_BASE_FILENAME = 'deposit/server' DEFAULT_CONFIG = { 'max_upload_size': ('int', 209715200), 'checks': ('bool', True), } ADDITIONAL_CONFIG = {} def __init__(self, **config): super().__init__() self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) self.config.update(config) self.log = logging.getLogger('swh.deposit') if self.config['checks']: from swh.scheduler.backend import SchedulerBackend self.scheduler = SchedulerBackend() diff --git a/swh/deposit/loader/loader.py b/swh/deposit/loader/loader.py index 2ff04d39..316ab5ee 100644 --- a/swh/deposit/loader/loader.py +++ b/swh/deposit/loader/loader.py @@ -1,129 +1,129 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import os import tempfile from swh.model import hashutil from swh.loader.tar import loader from swh.loader.core.loader import SWHLoader from .client import DepositClient class DepositLoader(loader.TarLoader): """Deposit loader implementation. This is a subclass of the :class:TarLoader as the main goal of this class is to first retrieve the deposit's tarball contents as one and its associated metadata. Then provide said tarball to be loaded by the TarLoader. This will: - retrieves the deposit's archive locally - provide the archive to be loaded by the tar loader - clean up the temporary location used to retrieve the archive locally - update the deposit's status accordingly """ CONFIG_BASE_FILENAME = 'loader/deposit' ADDITIONAL_CONFIG = { 'extraction_dir': ('str', '/tmp/swh.deposit.loader/'), } def __init__(self, client=None): super().__init__( logging_class='swh.deposit.loader.loader.DepositLoader') self.client = client if client else DepositClient() def load(self, *, archive_url, deposit_meta_url, deposit_update_url): SWHLoader.load( self, archive_url=archive_url, deposit_meta_url=deposit_meta_url, deposit_update_url=deposit_update_url) def prepare(self, *, archive_url, deposit_meta_url, deposit_update_url): """Prepare the loading by first retrieving the deposit's raw archive content. """ self.deposit_update_url = deposit_update_url + self.client.status_update(deposit_update_url, 'loading') + temporary_directory = tempfile.TemporaryDirectory() self.temporary_directory = temporary_directory archive_path = os.path.join(temporary_directory.name, 'archive.zip') archive = self.client.archive_get( archive_url, archive_path, log=self.log) metadata = self.client.metadata_get( deposit_meta_url, log=self.log) origin = metadata['origin'] visit_date = datetime.datetime.now(tz=datetime.timezone.utc) revision = metadata['revision'] occurrence = metadata['occurrence'] self.origin_metadata = metadata['origin_metadata'] self.prepare_metadata() - self.client.status_update(deposit_update_url, 'loading') - super().prepare(tar_path=archive, origin=origin, visit_date=visit_date, revision=revision, occurrences=[occurrence]) def store_metadata(self): """Storing the origin_metadata during the load processus. Provider_id and tool_id are resolved during the prepare() method. """ origin_id = self.origin_id visit_date = self.visit_date provider_id = self.origin_metadata['provider']['provider_id'] tool_id = self.origin_metadata['tool']['tool_id'] metadata = self.origin_metadata['metadata'] try: self.send_origin_metadata(origin_id, visit_date, provider_id, tool_id, metadata) except: self.log.exception('Problem when storing origin_metadata') raise def post_load(self, success=True): """Updating the deposit's status according to its loading status. - If not successful, we update its status to failure. - Otherwise, we update its status to 'success' and pass along - its associated revision. + If not successful, we update its status to 'failed'. + Otherwise, we update its status to 'done' and pass along its + associated revision. """ try: if not success: self.client.status_update(self.deposit_update_url, - status='failure') + status='failed') return # first retrieve the new revision [rev_id] = self.objects['revision'].keys() if rev_id: rev_id_hex = hashutil.hash_to_hex(rev_id) # then update the deposit's status to success with its # revision-id self.client.status_update(self.deposit_update_url, - status='success', + status='done', revision_id=rev_id_hex) except: self.log.exception( 'Problem when trying to update the deposit\'s status') def cleanup(self): """Clean up temporary directory where we retrieved the tarball. """ super().cleanup() self.temporary_directory.cleanup() diff --git a/swh/deposit/loader/scheduler.py b/swh/deposit/loader/scheduler.py index fad48f5b..e5f38a15 100644 --- a/swh/deposit/loader/scheduler.py +++ b/swh/deposit/loader/scheduler.py @@ -1,212 +1,212 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of sending deposit loading/checking as either celery task or scheduled one-shot tasks. """ import click import logging from abc import ABCMeta, abstractmethod from celery import group from swh.core import utils from swh.core.config import SWHConfig -from swh.deposit.config import setup_django_for, DEPOSIT_STATUS_READY -from swh.deposit.config import DEPOSIT_STATUS_READY_FOR_CHECKS +from swh.deposit.config import setup_django_for, DEPOSIT_STATUS_VERIFIED +from swh.deposit.config import DEPOSIT_STATUS_DEPOSITED from swh.scheduler.utils import get_task, create_oneshot_task_dict class SWHScheduling(SWHConfig, metaclass=ABCMeta): """Base swh scheduling class to aggregate the schedule deposit loading. """ CONFIG_BASE_FILENAME = 'deposit/server' DEFAULT_CONFIG = { 'dry_run': ('bool', False), } ADDITIONAL_CONFIG = {} def __init__(self): super().__init__() self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) self.log = logging.getLogger('swh.deposit.scheduling') @abstractmethod def schedule(self, deposits): """Schedule the new deposit loading. Args: data (dict): Deposit aggregated data Returns: None """ pass class SWHCeleryScheduling(SWHScheduling): """Deposit loading as Celery task scheduling. """ def __init__(self, config=None): super().__init__() if config: self.config.update(**config) self.dry_run = self.config['dry_run'] self.check = self.config['check'] if self.check: task_name = 'swh.deposit.loader.tasks.ChecksDepositTsk' else: task_name = 'swh.deposit.loader.tasks.LoadDepositArchiveTsk' self.task = get_task(task_name) def _convert(self, deposits): """Convert tuple to celery task signature. """ task = self.task for archive_url, meta_url, update_url, check_url in deposits: if self.check: yield task.s(deposit_check_url=check_url) else: yield task.s(archive_url=archive_url, deposit_meta_url=meta_url, deposit_update_url=update_url) def schedule(self, deposits): """Schedule the new deposit loading directly through celery. Args: depositdata (dict): Deposit aggregated information. Returns: None """ if self.dry_run: return return group(self._convert(deposits)).delay() class SWHSchedulerScheduling(SWHScheduling): """Deposit loading through SWH's task scheduling interface. """ ADDITIONAL_CONFIG = {} def __init__(self, config=None): super().__init__() from swh.scheduler.backend import SchedulerBackend if config: self.config.update(**config) self.dry_run = self.config['dry_run'] self.scheduler = SchedulerBackend(**self.config) self.check = self.config['check'] def _convert(self, deposits): """Convert tuple to one-shot scheduling tasks. """ for archive_url, meta_url, update_url, check_url in deposits: if self.check: task = create_oneshot_task_dict( 'swh-deposit-archive-checks', deposit_check_url=check_url) else: task = create_oneshot_task_dict( 'swh-deposit-archive-loading', archive_url=archive_url, deposit_meta_url=meta_url, deposit_update_url=update_url) yield task def schedule(self, deposits): """Schedule the new deposit loading through swh.scheduler's api. Args: deposits (dict): Deposit aggregated information. """ if self.dry_run: return self.scheduler.create_tasks(self._convert(deposits)) def get_deposit_by(status): """Filter deposit given a specific status. """ from swh.deposit.models import Deposit yield from Deposit.objects.filter(status=status) def prepare_task_arguments(check): """Convert deposit to argument for task to be executed. """ from swh.deposit.config import PRIVATE_GET_RAW_CONTENT from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA from swh.deposit.config import PRIVATE_PUT_DEPOSIT from swh.deposit.config import PRIVATE_CHECK_DEPOSIT from django.core.urlresolvers import reverse if check: - status = DEPOSIT_STATUS_READY_FOR_CHECKS + status = DEPOSIT_STATUS_DEPOSITED else: - status = DEPOSIT_STATUS_READY + status = DEPOSIT_STATUS_VERIFIED for deposit in get_deposit_by(status): args = [deposit.collection.name, deposit.id] archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args) meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args) update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args) check_url = reverse(PRIVATE_CHECK_DEPOSIT, args=args) yield archive_url, meta_url, update_url, check_url @click.command( help='Schedule one-shot deposit loadings') @click.option('--platform', default='development', help='development or production platform') @click.option('--scheduling-method', default='celery', help='Scheduling method') @click.option('--batch-size', default=1000, type=click.INT, help='Task batch size') @click.option('--dry-run/--no-dry-run', is_flag=True, default=False, help='Dry run') @click.option('--check', is_flag=True, default=False) def main(platform, scheduling_method, batch_size, dry_run, check): setup_django_for(platform) override_config = {} if dry_run: override_config['dry_run'] = dry_run override_config['check'] = check if scheduling_method == 'celery': scheduling = SWHCeleryScheduling(override_config) elif scheduling_method == 'swh-scheduler': scheduling = SWHSchedulerScheduling(override_config) else: raise ValueError( 'Only `celery` or `swh-scheduler` values are accepted') for deposits in utils.grouper(prepare_task_arguments(check), batch_size): scheduling.schedule(deposits) if __name__ == '__main__': main() diff --git a/swh/deposit/migrations/0011_auto_20180115_1510.py b/swh/deposit/migrations/0011_auto_20180115_1510.py new file mode 100644 index 00000000..1265ba2e --- /dev/null +++ b/swh/deposit/migrations/0011_auto_20180115_1510.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.10.7 on 2018-01-15 15:10 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('deposit', '0010_auto_20180110_0953'), + ] + + operations = [ + migrations.AlterField( + model_name='deposit', + name='status', + field=models.TextField(choices=[('partial', 'partial'), ('expired', 'expired'), ('deposited', 'deposited'), ('verified', 'verified'), ('rejected', 'rejected'), ('loading', 'loading'), ('done', 'done'), ('failed', 'failed')], default='partial'), + ), + ] diff --git a/swh/deposit/models.py b/swh/deposit/models.py index 9785757a..68fc5677 100644 --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -1,210 +1,210 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Generated from: # cd swh_deposit && \ # python3 -m manage inspectdb from django.contrib.postgres.fields import JSONField, ArrayField from django.contrib.auth.models import User, UserManager from django.db import models from django.utils.timezone import now -from .config import DEPOSIT_STATUS_READY, DEPOSIT_STATUS_READY_FOR_CHECKS +from .config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_DEPOSITED from .config import DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_LOAD_SUCCESS from .config import DEPOSIT_STATUS_LOAD_FAILURE class Dbversion(models.Model): """Db version """ version = models.IntegerField(primary_key=True) release = models.DateTimeField(default=now, null=True) description = models.TextField(blank=True, null=True) class Meta: db_table = 'dbversion' def __str__(self): return str({ 'version': self.version, 'release': self.release, 'description': self.description }) """Possible status""" DEPOSIT_STATUS = [ (DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_PARTIAL), ('expired', 'expired'), - (DEPOSIT_STATUS_READY_FOR_CHECKS, DEPOSIT_STATUS_READY_FOR_CHECKS), - (DEPOSIT_STATUS_READY, DEPOSIT_STATUS_READY), + (DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED), + (DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED), ('rejected', 'rejected'), ('loading', 'loading'), (DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS), (DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE), ] """Possible status and the detailed meaning.""" DEPOSIT_STATUS_DETAIL = { DEPOSIT_STATUS_PARTIAL: 'Deposit is partially received. To finalize it, ' 'In-Progress header should be false', 'expired': 'Deposit has been there too long and is now ' 'deemed ready to be garbage collected', - DEPOSIT_STATUS_READY_FOR_CHECKS: 'Deposit is ready for additional checks ' - '(tarball ok, metadata, etc...)', - DEPOSIT_STATUS_READY: 'Deposit is fully received, checked, and ' - 'ready for loading', + DEPOSIT_STATUS_DEPOSITED: 'Deposit is ready for additional checks ' + '(tarball ok, metadata, etc...)', + DEPOSIT_STATUS_VERIFIED: 'Deposit is fully received, checked, and ' + 'ready for loading', 'rejected': 'Deposit failed the checks', 'loading': "Loading is ongoing on swh's side", DEPOSIT_STATUS_LOAD_SUCCESS: 'The deposit has been successfully ' 'loaded into the Software Heritage archive', DEPOSIT_STATUS_LOAD_FAILURE: 'The deposit loading into the ' 'Software Heritage archive failed', } class DepositClient(User): """Deposit client """ collections = ArrayField(models.IntegerField(), null=True) objects = UserManager() provider_url = models.TextField(null=False) domain = models.TextField(null=False) class Meta: db_table = 'deposit_client' def __str__(self): return str({ 'id': self.id, 'collections': self.collections, 'username': super().username, 'domain': self.domain, 'provider_url': self.provider_url, }) class Deposit(models.Model): """Deposit reception table """ id = models.BigAutoField(primary_key=True) # First deposit reception date reception_date = models.DateTimeField(auto_now_add=True) # Date when the deposit is deemed complete and ready for loading complete_date = models.DateTimeField(null=True) # collection concerned by the deposit collection = models.ForeignKey( 'DepositCollection', models.DO_NOTHING) # Deposit's external identifier external_id = models.TextField() # Deposit client client = models.ForeignKey('DepositClient', models.DO_NOTHING) # SWH's loading result identifier swh_id = models.TextField(blank=True, null=True) # Deposit's status regarding loading status = models.TextField( choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL) # deposit can have one parent parent = models.ForeignKey('self', null=True) class Meta: db_table = 'deposit' def __str__(self): return str({ 'id': self.id, 'reception_date': self.reception_date, 'collection': self.collection.name, 'external_id': self.external_id, 'client': self.client.username, 'status': self.status }) class DepositRequestType(models.Model): """Deposit request type made by clients (either archive or metadata) """ id = models.BigAutoField(primary_key=True) name = models.TextField() class Meta: db_table = 'deposit_request_type' def __str__(self): return str({'id': self.id, 'name': self.name}) def client_directory_path(instance, filename): """Callable to upload archive in MEDIA_ROOT/user_/ Args: instance (DepositRequest): DepositRequest concerned by the upload filename (str): Filename of the uploaded file Returns: A path to be prefixed by the MEDIA_ROOT to access physically to the file uploaded. """ return 'client_{0}/{1}'.format(instance.deposit.client.id, filename) class DepositRequest(models.Model): """Deposit request associated to one deposit. """ id = models.BigAutoField(primary_key=True) # Deposit concerned by the request deposit = models.ForeignKey(Deposit, models.DO_NOTHING) date = models.DateTimeField(auto_now_add=True) # Deposit request information on the data to inject # this can be null when type is 'archive' metadata = JSONField(null=True) # this can be null when type is 'metadata' archive = models.FileField(null=True, upload_to=client_directory_path) type = models.ForeignKey( 'DepositRequestType', models.DO_NOTHING) class Meta: db_table = 'deposit_request' def __str__(self): meta = None if self.metadata: from json import dumps meta = dumps(self.metadata) archive_name = None if self.archive: archive_name = self.archive.name return str({ 'id': self.id, 'deposit': self.deposit, 'metadata': meta, 'archive': archive_name }) class DepositCollection(models.Model): id = models.BigAutoField(primary_key=True) # Human readable name for the collection type e.g HAL, arXiv, etc... name = models.TextField() class Meta: db_table = 'deposit_collection' def __str__(self): return str({'id': self.id, 'name': self.name}) diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py index f0a4c6e7..5c4198c3 100644 --- a/swh/deposit/parsers.py +++ b/swh/deposit/parsers.py @@ -1,61 +1,68 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining parsers with SWORD 2.0 supported mediatypes. """ from decimal import Decimal from rest_framework.parsers import FileUploadParser from rest_framework.parsers import MultiPartParser from rest_framework_xml.parsers import XMLParser -class SWHFileUploadParser(FileUploadParser): +class SWHFileUploadZipParser(FileUploadParser): """File upload parser limited to zip archive. """ media_type = 'application/zip' +class SWHFileUploadTarParser(FileUploadParser): + """File upload parser limited to zip archive. + + """ + media_type = 'application/x-tar' + + class SWHXMLParser(XMLParser): def _type_convert(self, value): """Override the default type converter to avoid having decimal in the resulting output. """ value = super()._type_convert(value) if isinstance(value, Decimal): value = str(value) return value class SWHAtomEntryParser(SWHXMLParser): """Atom entry parser limited to specific mediatype """ media_type = 'application/atom+xml;type=entry' class SWHMultiPartParser(MultiPartParser): """Multipart parser limited to a subset of mediatypes. """ media_type = 'multipart/*; *' def parse_xml(raw_content): """Parse xml body. Args: raw_content (bytes): The content to parse Returns: content parsed as dict. """ return SWHXMLParser().parse(raw_content) diff --git a/swh/deposit/signals.py b/swh/deposit/signals.py index a1b85c6f..83d893a0 100644 --- a/swh/deposit/signals.py +++ b/swh/deposit/signals.py @@ -1,83 +1,83 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining some uncoupled actions on deposit. Typically, checking that the archives deposited are ok are not directly testing in the request/answer to avoid too long computations. So this is done in the deposit_on_status_ready_for_check callback. """ from django.db.models.signals import post_save from django.dispatch import receiver from .models import Deposit -from .config import SWHDefaultConfig, DEPOSIT_STATUS_READY -from .config import DEPOSIT_STATUS_READY_FOR_CHECKS +from .config import SWHDefaultConfig, DEPOSIT_STATUS_VERIFIED +from .config import DEPOSIT_STATUS_DEPOSITED @receiver(post_save, sender=Deposit) def post_deposit_save(sender, instance, created, raw, using, update_fields, **kwargs): """When a deposit is saved, check for the deposit's status change and schedule actions accordingly. - When the status passes to ready-for-checks, schedule checks. + When the status passes to deposited, schedule checks. When the status pass to ready, schedule loading. Otherwise, do nothing. Args: sender (Deposit): The model class instance (Deposit): The actual instance being saved created (bool): True if a new record was created raw (bool): True if the model is saved exactly as presented (i.e. when loading a fixture). One should not query/modify other records in the database as the database might not be in a consistent state yet using: The database alias being used update_fields: The set of fields to update as passed to Model.save(), or None if update_fields wasn’t passed to save() """ default_config = SWHDefaultConfig() if not default_config.config['checks']: return - if instance.status not in {DEPOSIT_STATUS_READY_FOR_CHECKS, - DEPOSIT_STATUS_READY}: + if instance.status not in {DEPOSIT_STATUS_DEPOSITED, + DEPOSIT_STATUS_VERIFIED}: return from django.core.urlresolvers import reverse from swh.scheduler.utils import create_oneshot_task_dict args = [instance.collection.name, instance.id] - if instance.status == DEPOSIT_STATUS_READY_FOR_CHECKS: + if instance.status == DEPOSIT_STATUS_DEPOSITED: # schedule archive check from swh.deposit.config import PRIVATE_CHECK_DEPOSIT check_url = reverse(PRIVATE_CHECK_DEPOSIT, args=args) task = create_oneshot_task_dict( 'swh-deposit-archive-checks', deposit_check_url=check_url) - else: # instance.status == DEPOSIT_STATUS_READY: + else: # instance.status == DEPOSIT_STATUS_VERIFIED: # schedule loading from swh.deposit.config import PRIVATE_GET_RAW_CONTENT from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA from swh.deposit.config import PRIVATE_PUT_DEPOSIT archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args) meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args) update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args) task = create_oneshot_task_dict( 'swh-deposit-archive-loading', archive_url=archive_url, deposit_meta_url=meta_url, deposit_update_url=update_url) default_config.scheduler.create_tasks([task]) diff --git a/swh/deposit/templates/deposit/service_document.xml b/swh/deposit/templates/deposit/service_document.xml index f66dae40..aec0d822 100644 --- a/swh/deposit/templates/deposit/service_document.xml +++ b/swh/deposit/templates/deposit/service_document.xml @@ -1,24 +1,24 @@ 2.0 {{ max_upload_size }} The Software Heritage (SWH) Archive {% for collection in collections %} {{ collection.name }} Software Collection - {% for accept_content_type in accept_content_types %}{{ accept_content_type }}{% endfor %} - Collection Policy + {% for accept_content_type in accept_content_types %}{{ accept_content_type }} + {% endfor %}Collection Policy Software Heritage Archive Collect, Preserve, Share false false - {% for accept_packaging in accept_packagings %}{{ accept_packaging }}{% endfor %} - https://deposit.softwareheritage.org/1/{{ collection.name }}/ + {% for accept_packaging in accept_packagings %}{{ accept_packaging }} + {% endfor %}https://deposit.softwareheritage.org/1/{{ collection.name }}/ {% endfor %} diff --git a/swh/deposit/tests/api/test_deposit_atom.py b/swh/deposit/tests/api/test_deposit_atom.py index bf8f507f..2d68ac62 100644 --- a/swh/deposit/tests/api/test_deposit_atom.py +++ b/swh/deposit/tests/api/test_deposit_atom.py @@ -1,524 +1,524 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.core.urlresolvers import reverse from io import BytesIO from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase -from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_READY_FOR_CHECKS +from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from ..common import BasicTestCase, WithAuthTestCase class DepositAtomEntryTestCase(APITestCase, WithAuthTestCase, BasicTestCase): """Try and post atom entry deposit. """ def setUp(self): super().setUp() self.atom_entry_data0 = b""" Awesome Compiler hal urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a %s 2017-10-07T15:17:08Z some awesome author something awesome-compiler This is an awesome compiler destined to awesomely compile stuff and other stuff compiler,programming,language 2005-10-07T17:17:08Z 2005-10-07T17:17:08Z release note related link Awesome https://hoster.org/awesome-compiler GNU/Linux 0.0.1 running all """ self.atom_entry_data1 = b""" hal urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a 2017-10-07T15:17:08Z some awesome author something awesome-compiler This is an awesome compiler destined to awesomely compile stuff and other stuff compiler,programming,language 2005-10-07T17:17:08Z 2005-10-07T17:17:08Z release note related link Awesome https://hoster.org/awesome-compiler GNU/Linux 0.0.1 running all """ self.atom_entry_data2 = b""" %s """ self.atom_entry_data_empty_body = b""" """ self.atom_entry_data3 = b""" something """ self.atom_entry_data_atom_only = b""" Awesome Compiler urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 1785io25c695 2017-10-07T15:17:08Z some awesome author """ self.atom_entry_data_codemeta = b""" Awesome Compiler urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 1785io25c695 1785io25c695 origin url other identifier, DOI, ARK Domain description key-word 1 key-word 2 creation date publication date comment article name article id Collaboration/Projet project name id see also Sponsor A Sponsor B Platform/OS dependencies Version active license url spdx .Net Framework 3.0 Python2.3 author1 Inria UPMC author2 Inria UPMC http://code.com language 1 language 2 http://issuetracker.com """ # noqa self.atom_entry_data_dc_codemeta = b""" %s hal-01587361 https://hal.inria.fr/hal-01587361 https://hal.inria.fr/hal-01587361/document https://hal.inria.fr/hal-01587361/file/AffectationRO-v1.0.0.zip doi:10.5281/zenodo.438684 The assignment problem AffectationRO Gruenpeter, Morane [INFO] Computer Science [cs] [INFO.INFO-RO] Computer Science [cs]/Operations Research [cs.RO] SOFTWARE Project in OR: The assignment problemA java implementation for the assignment problem first release description fr 2015-06-01 2017-10-19 en url stable Version sur hal Version entre par lutilisateur Mots-cls Commentaire Rfrence interne Collaboration/Projet nom du projet id Voir aussi Financement Projet ANR Projet Europen Platform/OS Dpendances Etat du dveloppement license url spdx Outils de dveloppement- outil no1 Outils de dveloppement- outil no2 http://code.com language 1 language 2 """ # noqa self.atom_entry_tei = b"""HAL TEI export of hal-01587083CCSDDistributed under a Creative Commons Attribution 4.0 International License

HAL API platform

questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733MoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.com2017-09-29 11:21:322017-10-03 17:20:132017-10-03 17:20:132017-09-292017-09-29contributorMoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.comCCSDhal-01587083https://hal.inria.fr/hal-01587083gruenpeter:hal-0158708320172017questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733EnglishComputer Science [cs]SoftwareIRILLInitiative pour la Recherche et l'Innovation sur le Logiciel Libre
https://www.irill.org/
Universite Pierre et Marie Curie - Paris 6UPMC
4 place Jussieu - 75005 Paris
http://www.upmc.fr/
Institut National de Recherche en Informatique et en AutomatiqueInria
Domaine de VoluceauRocquencourt - BP 10578153 Le Chesnay Cedex
http://www.inria.fr/en/
Universite Paris Diderot - Paris 7UPD7
5 rue Thomas-Mann - 75205 Paris cedex 13
http://www.univ-paris-diderot.fr
""" # noqa self.atom_entry_data_badly_formatted = b""" """ self.atom_error_with_decimal = b""" Composing a Web of Audio Applications hal hal-01243065 hal-01243065 https://hal-test.archives-ouvertes.fr/hal-01243065 test DSP programming,Web,Composability,Faust 2017-05-03T16:08:47+02:00 The Web offers a great opportunity to share, deploy and use programs without installation difficulties. In this article we explore the idea of freely combining/composing real-time audio applications deployed on the Web using Faust audio DSP language. 1 10.4 phpstorm stable linux php python C GNU General Public License v3.0 only CeCILL Free Software License Agreement v1.1 HAL hal@ccsd.cnrs.fr Someone Nice someone@nice.fr FFJ """ # noqa @istest def post_deposit_atom_entry_serialization_error(self): """Posting an initial atom entry should return 201 with deposit receipt """ # given # when response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=self.atom_error_with_decimal, HTTP_SLUG='external-id', HTTP_IN_PROGRESS='false') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) dr = DepositRequest.objects.get(deposit=deposit) self.assertIsNotNone(dr.metadata) sw_version = dr.metadata.get( '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}softwareVersion') self.assertEquals(sw_version, '10.4') @istest def post_deposit_atom_empty_body_request(self): """Posting empty body request should return a 400 response """ response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=self.atom_entry_data_empty_body) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest def post_deposit_atom_badly_formatted_is_a_bad_request(self): """Posting a badly formatted atom should return a 400 response """ response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=self.atom_entry_data_badly_formatted) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest def post_deposit_atom_without_slug_header_is_bad_request(self): """Posting an atom entry without a slug header should return a 400 """ url = reverse(COL_IRI, args=[self.collection.name]) # when response = self.client.post( url, content_type='application/atom+xml;type=entry', data=self.atom_entry_data0, # + headers HTTP_IN_PROGRESS='false') self.assertIn(b'Missing SLUG header', response.content) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest def post_deposit_atom_unknown_collection(self): """Posting an atom entry to an unknown collection should return a 404 """ response = self.client.post( reverse(COL_IRI, args=['unknown-one']), content_type='application/atom+xml;type=entry', data=self.atom_entry_data3, HTTP_SLUG='something') self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) @istest def post_deposit_atom_entry_initial(self): """Posting an initial atom entry should return 201 with deposit receipt """ # given external_id = 'urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a' with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = self.atom_entry_data0 % external_id.encode('utf-8') # when response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=atom_entry_data, HTTP_SLUG='external-id', HTTP_IN_PROGRESS='false') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.external_id, external_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.client, self.user) # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) self.assertIsNotNone(deposit_request.metadata) self.assertFalse(bool(deposit_request.archive)) @istest def post_deposit_atom_entry_with_codemeta(self): """Posting an initial atom entry should return 201 with deposit receipt """ # given external_id = 'urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a' with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = self.atom_entry_data_dc_codemeta % ( external_id.encode('utf-8'), ) # when response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=atom_entry_data, HTTP_SLUG='external-id', HTTP_IN_PROGRESS='false') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.external_id, external_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.client, self.user) # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) self.assertIsNotNone(deposit_request.metadata) self.assertFalse(bool(deposit_request.archive)) @istest def test_post_deposit_atom_entry_tei(self): """Posting initial atom entry as TEI should return 201 with receipt """ # given external_id = 'urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a' with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = self.atom_entry_tei # when response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS='false') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.external_id, external_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.client, self.user) # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) self.assertIsNotNone(deposit_request.metadata) self.assertFalse(bool(deposit_request.archive)) @istest def post_deposit_atom_entry_multiple_steps(self): """After initial deposit, updating a deposit should return a 201 """ # given external_id = 'urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a' with self.assertRaises(Deposit.DoesNotExist): deposit = Deposit.objects.get(external_id=external_id) # when response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=self.atom_entry_data1, HTTP_IN_PROGRESS='True', HTTP_SLUG=external_id) # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.external_id, external_id) self.assertEqual(deposit.status, 'partial') self.assertEqual(deposit.client, self.user) # one associated request to a deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit) self.assertEqual(len(deposit_requests), 1) atom_entry_data = self.atom_entry_data2 % external_id.encode('utf-8') update_uri = response._headers['location'][1] # when updating the first deposit post response = self.client.post( update_uri, content_type='application/atom+xml;type=entry', data=atom_entry_data, HTTP_IN_PROGRESS='False') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.external_id, external_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.client, self.user) self.assertEqual(len(Deposit.objects.all()), 1) # now 2 associated requests to a same deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit) self.assertEqual(len(deposit_requests), 2) for deposit_request in deposit_requests: actual_metadata = deposit_request.metadata self.assertIsNotNone(actual_metadata) self.assertFalse(bool(deposit_request.archive)) diff --git a/swh/deposit/tests/api/test_deposit_binary.py b/swh/deposit/tests/api/test_deposit_binary.py index f4d7a146..89d876c8 100644 --- a/swh/deposit/tests/api/test_deposit_binary.py +++ b/swh/deposit/tests/api/test_deposit_binary.py @@ -1,660 +1,660 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.core.files.uploadedfile import InMemoryUploadedFile from django.core.urlresolvers import reverse from io import BytesIO from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.tests import TEST_CONFIG from swh.deposit.config import COL_IRI, EM_IRI -from swh.deposit.config import DEPOSIT_STATUS_READY_FOR_CHECKS +from swh.deposit.config import DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from ..common import BasicTestCase, WithAuthTestCase, create_arborescence_zip from ..common import FileSystemCreationRoutine class DepositTestCase(APITestCase, WithAuthTestCase, BasicTestCase, FileSystemCreationRoutine): """Try and upload one single deposit """ def setUp(self): super().setUp() self.atom_entry_data0 = b""" Awesome Compiler hal urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a %s 2017-10-07T15:17:08Z some awesome author something awesome-compiler This is an awesome compiler destined to awesomely compile stuff and other stuff compiler,programming,language 2005-10-07T17:17:08Z 2005-10-07T17:17:08Z release note related link Awesome https://hoster.org/awesome-compiler GNU/Linux 0.0.1 running all """ self.atom_entry_data1 = b""" hal urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a 2017-10-07T15:17:08Z some awesome author something awesome-compiler This is an awesome compiler destined to awesomely compile stuff and other stuff compiler,programming,language 2005-10-07T17:17:08Z 2005-10-07T17:17:08Z release note related link Awesome https://hoster.org/awesome-compiler GNU/Linux 0.0.1 running all """ self.atom_entry_data2 = b""" %s """ self.atom_entry_data_empty_body = b""" """ self.atom_entry_data3 = b""" something """ self.data_atom_entry_ok = b""" Title urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2005-10-07T17:17:08Z Contributor The abstract The abstract Access Rights Alternative Title Date Available Bibliographic Citation # noqa Contributor Description Has Part Has Version Identifier Is Part Of Publisher References Rights Holder Source Title Type """ @istest def post_deposit_binary_without_slug_header_is_bad_request(self): """Posting a binary deposit without slug header should return 400 """ url = reverse(COL_IRI, args=[self.collection.name]) # when response = self.client.post( url, content_type='application/zip', # as zip data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') self.assertIn(b'Missing SLUG header', response.content) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest def post_deposit_binary_upload_final_and_status_check(self): """Binary upload with correct headers should return 201 with receipt """ # given url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id-1' # when response = self.client.post( url, content_type='application/zip', # as zip data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % ( self.archive['name'], )) # then response_content = parse_xml(BytesIO(response.content)) self.assertEqual(response.status_code, status.HTTP_201_CREATED) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.external_id, external_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.client, self.user) self.assertIsNone(deposit.swh_id) deposit_request = DepositRequest.objects.get(deposit=deposit) self.assertEquals(deposit_request.deposit, deposit) self.assertRegex(deposit_request.archive.name, self.archive['name']) response_content = parse_xml(BytesIO(response.content)) self.assertEqual( response_content['{http://www.w3.org/2005/Atom}deposit_archive'], self.archive['name']) self.assertEqual( response_content['{http://www.w3.org/2005/Atom}deposit_id'], deposit.id) self.assertEqual( response_content['{http://www.w3.org/2005/Atom}deposit_status'], deposit.status) edit_se_iri = reverse('edit_se_iri', args=[self.collection.name, deposit.id]) self.assertEqual(response._headers['location'], ('Location', 'http://testserver' + edit_se_iri)) @istest - def post_deposit_binary_upload_only_supports_zip(self): - """Binary upload without content_type application/zip should return 415 + def post_deposit_binary_upload_supports_zip_or_tar(self): + """Binary upload with content-type not in [zip,x-tar] should return 415 """ # given url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id-1' # when response = self.client.post( url, content_type='application/octet-stream', data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') # then self.assertEqual(response.status_code, status.HTTP_415_UNSUPPORTED_MEDIA_TYPE) with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) @istest def post_deposit_binary_fails_if_unsupported_packaging_header( self): """Bin deposit without supported content_disposition header returns 400 """ # given url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id' # when response = self.client.post( url, content_type='application/zip', data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='something-unsupported', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') # then self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) @istest def post_deposit_binary_upload_fail_if_no_content_disposition_header( self): """Binary upload without content_disposition header should return 400 """ # given url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id' # when response = self.client.post( url, content_type='application/zip', data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false') # then self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) @istest def post_deposit_mediation_not_supported(self): """Binary upload with mediation should return a 412 response """ # given url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id-1' # when response = self.client.post( url, content_type='application/zip', data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_ON_BEHALF_OF='someone', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') # then self.assertEqual(response.status_code, status.HTTP_412_PRECONDITION_FAILED) with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) @istest def post_deposit_binary_upload_fail_if_upload_size_limit_exceeded( self): """Binary upload must not exceed the limit set up... """ # given url = reverse(COL_IRI, args=[self.collection.name]) archive = create_arborescence_zip( self.root_path, 'archive2', 'file2', b'some content in file', up_to_size=TEST_CONFIG['max_upload_size']) external_id = 'some-external-id' # when response = self.client.post( url, content_type='application/zip', data=archive['data'], # + headers CONTENT_LENGTH=archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') # then self.assertEqual(response.status_code, status.HTTP_413_REQUEST_ENTITY_TOO_LARGE) self.assertRegex(response.content, b'Upload size limit exceeded') with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) @istest def post_deposit_2_post_2_different_deposits(self): """2 posting deposits should return 2 different 201 with receipt """ url = reverse(COL_IRI, args=[self.collection.name]) # when response = self.client.post( url, content_type='application/zip', # as zip data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG='some-external-id-1', HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) deposits = Deposit.objects.all() self.assertEqual(len(deposits), 1) self.assertEqual(deposits[0], deposit) # second post response = self.client.post( url, - content_type='application/zip', # as zip + content_type='application/x-tar', # as zip data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG='another-external-id', HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename1') self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id2 = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit2 = Deposit.objects.get(pk=deposit_id2) self.assertNotEqual(deposit, deposit2) deposits = Deposit.objects.all().order_by('id') self.assertEqual(len(deposits), 2) self.assertEqual(list(deposits), [deposit, deposit2]) @istest def post_deposit_binary_and_post_to_add_another_archive(self): """Updating a deposit should return a 201 with receipt """ # given url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id-1' # when response = self.client.post( url, content_type='application/zip', # as zip data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='true', HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % ( self.archive['name'], )) # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) self.assertEqual(deposit.status, 'partial') self.assertEqual(deposit.external_id, external_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.client, self.user) self.assertIsNone(deposit.swh_id) deposit_request = DepositRequest.objects.get(deposit=deposit) self.assertEquals(deposit_request.deposit, deposit) self.assertEquals(deposit_request.type.name, 'archive') self.assertRegex(deposit_request.archive.name, self.archive['name']) # 2nd archive to upload archive2 = create_arborescence_zip( self.root_path, 'archive2', 'file2', b'some other content in file') # uri to update the content update_uri = reverse(EM_IRI, args=[self.collection.name, deposit_id]) # adding another archive for the deposit and finalizing it response = self.client.post( update_uri, content_type='application/zip', # as zip data=archive2['data'], # + headers CONTENT_LENGTH=archive2['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % ( archive2['name'])) self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit = Deposit.objects.get(pk=deposit_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.external_id, external_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.client, self.user) self.assertIsNone(deposit.swh_id) deposit_requests = list(DepositRequest.objects.filter(deposit=deposit). order_by('id')) # 2 deposit requests for the same deposit self.assertEquals(len(deposit_requests), 2) self.assertEquals(deposit_requests[0].deposit, deposit) self.assertEquals(deposit_requests[0].type.name, 'archive') self.assertRegex(deposit_requests[0].archive.name, self.archive['name']) self.assertEquals(deposit_requests[1].deposit, deposit) self.assertEquals(deposit_requests[1].type.name, 'archive') self.assertRegex(deposit_requests[1].archive.name, archive2['name']) # only 1 deposit in db deposits = Deposit.objects.all() self.assertEqual(len(deposits), 1) @istest def post_deposit_then_post_or_put_is_refused_when_status_ready(self): """Updating a deposit with status 'ready' should return a 400 """ url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id-1' # when response = self.client.post( url, content_type='application/zip', # as zip data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.external_id, external_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.client, self.user) self.assertIsNone(deposit.swh_id) deposit_request = DepositRequest.objects.get(deposit=deposit) self.assertEquals(deposit_request.deposit, deposit) self.assertRegex(deposit_request.archive.name, 'filename0') # updating/adding is forbidden # uri to update the content edit_se_iri = reverse( 'edit_se_iri', args=[self.collection.name, deposit_id]) em_iri = reverse( 'em_iri', args=[self.collection.name, deposit_id]) # Testing all update/add endpoint should fail # since the status is ready archive2 = create_arborescence_zip( self.root_path, 'archive2', 'file2', b'some content in file 2') # replacing file is no longer possible since the deposit's # status is ready r = self.client.put( em_iri, content_type='application/zip', data=archive2['data'], CONTENT_LENGTH=archive2['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') self.assertEquals(r.status_code, status.HTTP_400_BAD_REQUEST) # adding file is no longer possible since the deposit's status # is ready r = self.client.post( em_iri, content_type='application/zip', data=archive2['data'], CONTENT_LENGTH=archive2['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') self.assertEquals(r.status_code, status.HTTP_400_BAD_REQUEST) # replacing metadata is no longer possible since the deposit's # status is ready r = self.client.put( edit_se_iri, content_type='application/atom+xml;type=entry', data=self.data_atom_entry_ok, CONTENT_LENGTH=len(self.data_atom_entry_ok), HTTP_SLUG=external_id) self.assertEquals(r.status_code, status.HTTP_400_BAD_REQUEST) # adding new metadata is no longer possible since the # deposit's status is ready r = self.client.post( edit_se_iri, content_type='application/atom+xml;type=entry', data=self.data_atom_entry_ok, CONTENT_LENGTH=len(self.data_atom_entry_ok), HTTP_SLUG=external_id) self.assertEquals(r.status_code, status.HTTP_400_BAD_REQUEST) archive_content = b'some content representing archive' archive = InMemoryUploadedFile( BytesIO(archive_content), field_name='archive0', name='archive0', content_type='application/zip', size=len(archive_content), charset=None) atom_entry = InMemoryUploadedFile( BytesIO(self.data_atom_entry_ok), field_name='atom0', name='atom0', content_type='application/atom+xml; charset="utf-8"', size=len(self.data_atom_entry_ok), charset='utf-8') # replacing multipart metadata is no longer possible since the # deposit's status is ready r = self.client.put( edit_se_iri, format='multipart', data={ 'archive': archive, 'atom_entry': atom_entry, }) self.assertEquals(r.status_code, status.HTTP_400_BAD_REQUEST) # adding new metadata is no longer possible since the # deposit's status is ready r = self.client.post( edit_se_iri, format='multipart', data={ 'archive': archive, 'atom_entry': atom_entry, }) self.assertEquals(r.status_code, status.HTTP_400_BAD_REQUEST) diff --git a/swh/deposit/tests/api/test_deposit_check.py b/swh/deposit/tests/api/test_deposit_check.py index 7ae63b34..dfda6759 100644 --- a/swh/deposit/tests/api/test_deposit_check.py +++ b/swh/deposit/tests/api/test_deposit_check.py @@ -1,147 +1,147 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest from django.core.urlresolvers import reverse from nose.tools import istest from nose.plugins.attrib import attr from rest_framework import status from rest_framework.test import APITestCase from ...models import Deposit -from ...config import DEPOSIT_STATUS_READY, PRIVATE_CHECK_DEPOSIT -from ...config import DEPOSIT_STATUS_READY_FOR_CHECKS, DEPOSIT_STATUS_REJECTED +from ...config import DEPOSIT_STATUS_VERIFIED, PRIVATE_CHECK_DEPOSIT +from ...config import DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine from ..common import FileSystemCreationRoutine from ...api.private.deposit_check import SWHChecksDeposit @attr('fs') class CheckDepositTest(APITestCase, WithAuthTestCase, BasicTestCase, CommonCreationRoutine, FileSystemCreationRoutine): """Check deposit endpoints. """ def setUp(self): super().setUp() @istest def deposit_ok(self): """Proper deposit should succeed the checks (-> status ready) """ deposit_id = self.create_simple_binary_deposit(status_partial=True) deposit_id = self.update_binary_deposit(deposit_id, status_partial=False) deposit = Deposit.objects.get(pk=deposit_id) - self.assertEquals(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEquals(deposit.status, DEPOSIT_STATUS_DEPOSITED) url = reverse(PRIVATE_CHECK_DEPOSIT, args=[self.collection.name, deposit.id]) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_200_OK) data = json.loads(response.content.decode('utf-8')) - self.assertEqual(data['status'], DEPOSIT_STATUS_READY) + self.assertEqual(data['status'], DEPOSIT_STATUS_VERIFIED) deposit = Deposit.objects.get(pk=deposit.id) - self.assertEquals(deposit.status, DEPOSIT_STATUS_READY) + self.assertEquals(deposit.status, DEPOSIT_STATUS_VERIFIED) @istest def deposit_ko(self): """Invalid deposit should fail the checks (-> status rejected) """ deposit_id = self.create_invalid_deposit() deposit = Deposit.objects.get(pk=deposit_id) - self.assertEquals(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEquals(deposit.status, DEPOSIT_STATUS_DEPOSITED) url = reverse(PRIVATE_CHECK_DEPOSIT, args=[self.collection.name, deposit.id]) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_200_OK) data = json.loads(response.content.decode('utf-8')) self.assertEqual(data['status'], DEPOSIT_STATUS_REJECTED) self.assertEqual(data['details'], 'Some archive(s) and metadata and url ' + 'failed the checks.') deposit = Deposit.objects.get(pk=deposit.id) self.assertEquals(deposit.status, DEPOSIT_STATUS_REJECTED) @istest def check_deposit_metadata_ok(self): """Proper deposit should succeed the checks (-> status ready) with all **MUST** metadata using the codemeta metadata test set """ deposit_id = self.create_simple_binary_deposit(status_partial=True) deposit_id_metadata = self.add_metadata_to_deposit(deposit_id) self.assertEquals(deposit_id, deposit_id_metadata) deposit = Deposit.objects.get(pk=deposit_id) - self.assertEquals(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEquals(deposit.status, DEPOSIT_STATUS_DEPOSITED) url = reverse(PRIVATE_CHECK_DEPOSIT, args=[self.collection.name, deposit.id]) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_200_OK) data = json.loads(response.content.decode('utf-8')) - self.assertEqual(data['status'], DEPOSIT_STATUS_READY) + self.assertEqual(data['status'], DEPOSIT_STATUS_VERIFIED) deposit = Deposit.objects.get(pk=deposit.id) - self.assertEquals(deposit.status, DEPOSIT_STATUS_READY) + self.assertEquals(deposit.status, DEPOSIT_STATUS_VERIFIED) class CheckMetadata(unittest.TestCase, SWHChecksDeposit): @istest def check_metadata_ok(self): actual_check = self._check_metadata({ 'url': 'something', 'external_identifier': 'something-else', 'name': 'foo', 'author': 'someone', }) self.assertTrue(actual_check) @istest def check_metadata_ok2(self): actual_check = self._check_metadata({ 'url': 'something', 'external_identifier': 'something-else', 'title': 'bar', 'author': 'someone', }) self.assertTrue(actual_check) @istest def check_metadata_ko(self): actual_check = self._check_metadata({ 'url': 'something', 'external_identifier': 'something-else', 'author': 'someone', }) self.assertFalse(actual_check) @istest def check_metadata_ko2(self): actual_check = self._check_metadata({ 'url': 'something', 'external_identifier': 'something-else', 'title': 'foobar', }) self.assertFalse(actual_check) diff --git a/swh/deposit/tests/api/test_deposit_delete.py b/swh/deposit/tests/api/test_deposit_delete.py index 26c4ec8a..a0a10aa1 100644 --- a/swh/deposit/tests/api/test_deposit_delete.py +++ b/swh/deposit/tests/api/test_deposit_delete.py @@ -1,119 +1,119 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.core.urlresolvers import reverse from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.config import EDIT_SE_IRI, EM_IRI, ARCHIVE_KEY, METADATA_KEY -from swh.deposit.config import DEPOSIT_STATUS_READY_FOR_CHECKS +from swh.deposit.config import DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositRequest from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine class DepositDeleteTest(APITestCase, WithAuthTestCase, BasicTestCase, CommonCreationRoutine): @istest def delete_archive_on_partial_deposit_works(self): """Removing partial deposit's archive should return a 204 response """ # given deposit_id = self.create_deposit_partial() deposit = Deposit.objects.get(pk=deposit_id) deposit_requests = DepositRequest.objects.filter(deposit=deposit) self.assertEquals(len(deposit_requests), 2) for dr in deposit_requests: if dr.type.name == ARCHIVE_KEY: continue elif dr.type.name == METADATA_KEY: continue else: self.fail('only archive and metadata type should exist ' 'in this test context') # when update_uri = reverse(EM_IRI, args=[self.collection.name, deposit_id]) response = self.client.delete(update_uri) # then self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) deposit = Deposit.objects.get(pk=deposit_id) requests = list(DepositRequest.objects.filter(deposit=deposit)) self.assertEquals(len(requests), 2) self.assertEquals(requests[0].type.name, 'metadata') self.assertEquals(requests[1].type.name, 'metadata') @istest def delete_archive_on_undefined_deposit_fails(self): """Delete undefined deposit returns a 404 response """ # when update_uri = reverse(EM_IRI, args=[self.collection.name, 999]) response = self.client.delete(update_uri) # then self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) @istest def delete_archive_on_non_partial_deposit_fails(self): """Delete !partial status deposit should return a 400 response""" deposit_id = self.create_deposit_ready() deposit = Deposit.objects.get(pk=deposit_id) - self.assertEquals(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEquals(deposit.status, DEPOSIT_STATUS_DEPOSITED) # when update_uri = reverse(EM_IRI, args=[self.collection.name, deposit_id]) response = self.client.delete(update_uri) # then self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) deposit = Deposit.objects.get(pk=deposit_id) self.assertIsNotNone(deposit) @istest def delete_partial_deposit_works(self): """Delete deposit should return a 204 response """ # given deposit_id = self.create_simple_deposit_partial() deposit = Deposit.objects.get(pk=deposit_id) assert deposit.id == deposit_id # when url = reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]) response = self.client.delete(url) # then self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) deposit_requests = list(DepositRequest.objects.filter(deposit=deposit)) self.assertEquals(deposit_requests, []) deposits = list(Deposit.objects.filter(pk=deposit_id)) self.assertEquals(deposits, []) @istest def delete_on_edit_se_iri_cannot_delete_non_partial_deposit(self): """Delete !partial deposit should return a 400 response """ # given deposit_id = self.create_deposit_ready() deposit = Deposit.objects.get(pk=deposit_id) assert deposit.id == deposit_id # when url = reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]) response = self.client.delete(url) # then self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) deposit = Deposit.objects.get(pk=deposit_id) self.assertIsNotNone(deposit) diff --git a/swh/deposit/tests/api/test_deposit_multipart.py b/swh/deposit/tests/api/test_deposit_multipart.py index 1b0f5bb1..cee1c2b6 100644 --- a/swh/deposit/tests/api/test_deposit_multipart.py +++ b/swh/deposit/tests/api/test_deposit_multipart.py @@ -1,327 +1,393 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.core.files.uploadedfile import InMemoryUploadedFile from django.core.urlresolvers import reverse from io import BytesIO from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.config import COL_IRI -from swh.deposit.config import DEPOSIT_STATUS_READY_FOR_CHECKS +from swh.deposit.config import DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from ..common import BasicTestCase, WithAuthTestCase from ..common import FileSystemCreationRoutine class DepositMultipartTestCase(APITestCase, WithAuthTestCase, BasicTestCase, FileSystemCreationRoutine): """Post multipart deposit scenario """ def setUp(self): super().setUp() self.data_atom_entry_ok = b""" Title urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2005-10-07T17:17:08Z Contributor The abstract The abstract Access Rights Alternative Title Date Available Bibliographic Citation # noqa Contributor Description Has Part Has Version Identifier Is Part Of Publisher References Rights Holder Source Title Type """ self.data_atom_entry_update_in_place = """ urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa7b Title Type """ @istest def post_deposit_multipart_without_slug_header_is_bad_request(self): # given url = reverse(COL_IRI, args=[self.collection.name]) data_atom_entry = self.data_atom_entry_ok archive_content = b'some content representing archive' archive = InMemoryUploadedFile( BytesIO(archive_content), field_name='archive0', name='archive0', content_type='application/zip', size=len(archive_content), charset=None) atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry), field_name='atom0', name='atom0', content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset='utf-8') # when response = self.client.post( url, format='multipart', data={ 'archive': archive, 'atom_entry': atom_entry, }, # + headers HTTP_IN_PROGRESS='false') self.assertIn(b'Missing SLUG header', response.content) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest - def post_deposit_multipart(self): - """one multipart deposit should be accepted + def post_deposit_multipart_zip(self): + """one multipart deposit (zip+xml) should be accepted """ # given url = reverse(COL_IRI, args=[self.collection.name]) # from django.core.files import uploadedfile data_atom_entry = self.data_atom_entry_ok archive = InMemoryUploadedFile( BytesIO(self.archive['data']), field_name=self.archive['name'], name=self.archive['name'], content_type='application/zip', size=self.archive['length'], charset=None) atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry), field_name='atom0', name='atom0', content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset='utf-8') external_id = 'external-id' # when response = self.client.post( url, format='multipart', data={ 'archive': archive, 'atom_entry': atom_entry, }, # + headers HTTP_IN_PROGRESS='false', HTTP_SLUG=external_id) # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) + self.assertEqual(deposit.external_id, external_id) + self.assertEqual(deposit.collection, self.collection) + self.assertEqual(deposit.client, self.user) + self.assertIsNone(deposit.swh_id) + + deposit_requests = DepositRequest.objects.filter(deposit=deposit) + self.assertEquals(len(deposit_requests), 2) + for deposit_request in deposit_requests: + self.assertEquals(deposit_request.deposit, deposit) + if deposit_request.type.name == 'archive': + self.assertRegex(deposit_request.archive.name, + self.archive['name']) + else: + self.assertEquals( + deposit_request.metadata[ + '{http://www.w3.org/2005/Atom}id'], + 'urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a') + + @istest + def post_deposit_multipart_tar(self): + """one multipart deposit (tar+xml) should be accepted + + """ + # given + url = reverse(COL_IRI, args=[self.collection.name]) + + # from django.core.files import uploadedfile + data_atom_entry = self.data_atom_entry_ok + + archive = InMemoryUploadedFile( + BytesIO(self.archive['data']), + field_name=self.archive['name'], + name=self.archive['name'], + content_type='application/x-tar', + size=self.archive['length'], + charset=None) + + atom_entry = InMemoryUploadedFile( + BytesIO(data_atom_entry), + field_name='atom0', + name='atom0', + content_type='application/atom+xml; charset="utf-8"', + size=len(data_atom_entry), + charset='utf-8') + + external_id = 'external-id' + + # when + response = self.client.post( + url, + format='multipart', + data={ + 'archive': archive, + 'atom_entry': atom_entry, + }, + # + headers + HTTP_IN_PROGRESS='false', + HTTP_SLUG=external_id) + + # then + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + + response_content = parse_xml(BytesIO(response.content)) + deposit_id = response_content[ + '{http://www.w3.org/2005/Atom}deposit_id'] + + deposit = Deposit.objects.get(pk=deposit_id) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.external_id, external_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.client, self.user) self.assertIsNone(deposit.swh_id) deposit_requests = DepositRequest.objects.filter(deposit=deposit) self.assertEquals(len(deposit_requests), 2) for deposit_request in deposit_requests: self.assertEquals(deposit_request.deposit, deposit) if deposit_request.type.name == 'archive': self.assertRegex(deposit_request.archive.name, self.archive['name']) else: self.assertEquals( deposit_request.metadata[ '{http://www.w3.org/2005/Atom}id'], 'urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a') @istest def post_deposit_multipart_put_to_replace_metadata(self): """One multipart deposit followed by a metadata update should be accepted """ # given url = reverse(COL_IRI, args=[self.collection.name]) data_atom_entry = self.data_atom_entry_ok archive = InMemoryUploadedFile( BytesIO(self.archive['data']), field_name=self.archive['name'], name=self.archive['name'], content_type='application/zip', size=self.archive['length'], charset=None) atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry), field_name='atom0', name='atom0', content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset='utf-8') external_id = 'external-id' # when response = self.client.post( url, format='multipart', data={ 'archive': archive, 'atom_entry': atom_entry, }, # + headers HTTP_IN_PROGRESS='true', HTTP_SLUG=external_id) # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] deposit = Deposit.objects.get(pk=deposit_id) self.assertEqual(deposit.status, 'partial') self.assertEqual(deposit.external_id, external_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.client, self.user) self.assertIsNone(deposit.swh_id) deposit_requests = DepositRequest.objects.filter(deposit=deposit) self.assertEquals(len(deposit_requests), 2) for deposit_request in deposit_requests: self.assertEquals(deposit_request.deposit, deposit) if deposit_request.type.name == 'archive': self.assertRegex(deposit_request.archive.name, self.archive['name']) else: self.assertEquals( deposit_request.metadata[ '{http://www.w3.org/2005/Atom}id'], 'urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a') replace_metadata_uri = response._headers['location'][1] response = self.client.put( replace_metadata_uri, content_type='application/atom+xml;type=entry', data=self.data_atom_entry_update_in_place, HTTP_IN_PROGRESS='false') self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) # deposit_id did not change deposit = Deposit.objects.get(pk=deposit_id) - self.assertEqual(deposit.status, DEPOSIT_STATUS_READY_FOR_CHECKS) + self.assertEqual(deposit.status, DEPOSIT_STATUS_DEPOSITED) self.assertEqual(deposit.external_id, external_id) self.assertEqual(deposit.collection, self.collection) self.assertEqual(deposit.client, self.user) self.assertIsNone(deposit.swh_id) deposit_requests = DepositRequest.objects.filter(deposit=deposit) self.assertEquals(len(deposit_requests), 2) for deposit_request in deposit_requests: self.assertEquals(deposit_request.deposit, deposit) if deposit_request.type.name == 'archive': self.assertRegex(deposit_request.archive.name, self.archive['name']) else: self.assertEquals( deposit_request.metadata[ '{http://www.w3.org/2005/Atom}id'], 'urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa7b') # FAILURE scenarios @istest def post_deposit_multipart_only_archive_and_atom_entry(self): """Multipart deposit only accepts one archive and one atom+xml""" # given url = reverse(COL_IRI, args=[self.collection.name]) - # from django.core.files import uploadedfile - archive_content = b'some content representing archive' archive = InMemoryUploadedFile(BytesIO(archive_content), field_name='archive0', name='archive0', - content_type='application/zip', + content_type='application/x-tar', size=len(archive_content), charset=None) other_archive_content = b"some-other-content" other_archive = InMemoryUploadedFile(BytesIO(other_archive_content), field_name='atom0', name='atom0', - content_type='application/zip', + content_type='application/x-tar', size=len(other_archive_content), charset='utf-8') # when response = self.client.post( url, format='multipart', data={ 'archive': archive, 'atom_entry': other_archive, }, # + headers HTTP_IN_PROGRESS='false', HTTP_SLUG='external-id') # then self.assertEqual(response.status_code, status.HTTP_415_UNSUPPORTED_MEDIA_TYPE) # when archive.seek(0) response = self.client.post( url, format='multipart', data={ 'archive': archive, }, # + headers HTTP_IN_PROGRESS='false', HTTP_SLUG='external-id') # then self.assertEqual(response.status_code, status.HTTP_415_UNSUPPORTED_MEDIA_TYPE) diff --git a/swh/deposit/tests/api/test_deposit_read_metadata.py b/swh/deposit/tests/api/test_deposit_read_metadata.py index 657d61ed..e46682c4 100644 --- a/swh/deposit/tests/api/test_deposit_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_read_metadata.py @@ -1,214 +1,215 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from django.core.urlresolvers import reverse from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.models import Deposit from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.config import DEPOSIT_STATUS_PARTIAL from ...config import SWH_PERSON from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine class DepositReadMetadataTest(APITestCase, WithAuthTestCase, BasicTestCase, CommonCreationRoutine): """Deposit access to read metadata information on deposit. """ @istest def read_metadata(self): """Private metadata read api to existing deposit should return metadata """ deposit_id = self.create_deposit_partial() url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[self.collection.name, deposit_id]) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEquals(response._headers['content-type'][1], 'application/json') data = json.loads(response.content.decode('utf-8')) expected_meta = { 'origin': { 'url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id', 'type': 'deposit' }, 'origin_metadata': { 'metadata': { '{http://www.w3.org/2005/Atom}external_identifier': 'some-external-id', '{http://www.w3.org/2005/Atom}url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id' }, 'provider': { 'provider_name': '', 'provider_type': 'deposit_client', 'provider_url': 'https://hal-test.archives-ouvertes.fr/', 'metadata': {} }, 'tool': { 'tool_name': 'swh-deposit', 'tool_version': '0.0.1', 'tool_configuration': { 'sword_version': '2' } } }, 'revision': { 'synthetic': True, 'committer_date': None, 'message': ': Deposit %s in collection hal' % deposit_id, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'date': None, 'metadata': { '{http://www.w3.org/2005/Atom}external_identifier': 'some-external-id', '{http://www.w3.org/2005/Atom}url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id' }, 'type': 'tar' }, 'occurrence': { 'branch': 'master' } } self.assertEquals(data, expected_meta) @istest def read_metadata_revision_with_parent(self): """Private read metadata to a deposit (with parent) returns metadata """ swh_id = 'da78a9d4cf1d5d29873693fd496142e3a18c20fa' + swh_persistent_id = 'swh:1:rev:%s' % swh_id deposit_id1 = self.create_deposit_with_status( status=DEPOSIT_STATUS_LOAD_SUCCESS, external_id='some-external-id', - swh_id=swh_id) + swh_id=swh_persistent_id) deposit_parent = Deposit.objects.get(pk=deposit_id1) - self.assertEquals(deposit_parent.swh_id, swh_id) + self.assertEquals(deposit_parent.swh_id, swh_persistent_id) self.assertEquals(deposit_parent.external_id, 'some-external-id') self.assertEquals(deposit_parent.status, DEPOSIT_STATUS_LOAD_SUCCESS) deposit_id = self.create_deposit_partial( external_id='some-external-id') deposit = Deposit.objects.get(pk=deposit_id) self.assertEquals(deposit.external_id, 'some-external-id') self.assertEquals(deposit.swh_id, None) self.assertEquals(deposit.parent, deposit_parent) self.assertEquals(deposit.status, DEPOSIT_STATUS_PARTIAL) url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[self.collection.name, deposit_id]) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEquals(response._headers['content-type'][1], 'application/json') data = json.loads(response.content.decode('utf-8')) expected_meta = { 'origin': { 'url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id', 'type': 'deposit' }, 'origin_metadata': { 'metadata': { '{http://www.w3.org/2005/Atom}external_identifier': 'some-external-id', '{http://www.w3.org/2005/Atom}url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id' }, 'provider': { 'provider_name': '', 'provider_type': 'deposit_client', 'provider_url': 'https://hal-test.archives-ouvertes.fr/', 'metadata': {} }, 'tool': { 'tool_name': 'swh-deposit', 'tool_version': '0.0.1', 'tool_configuration': { 'sword_version': '2' } } }, 'revision': { 'synthetic': True, 'date': None, 'committer_date': None, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'type': 'tar', 'message': ': Deposit %s in collection hal' % deposit_id, 'metadata': { '{http://www.w3.org/2005/Atom}external_identifier': 'some-external-id', '{http://www.w3.org/2005/Atom}url': 'https://hal-test.archives-ouvertes.fr/' + 'some-external-id' }, 'parents': [swh_id] }, 'occurrence': { 'branch': 'master' } } self.assertEquals(data, expected_meta) @istest def access_to_nonexisting_deposit_returns_404_response(self): """Read unknown collection should return a 404 response """ unknown_id = '999' url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[self.collection.name, unknown_id]) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) self.assertIn('Deposit with id %s does not exist' % unknown_id, response.content.decode('utf-8')) @istest def access_to_nonexisting_collection_returns_404_response(self): """Read unknown deposit should return a 404 response """ collection_name = 'non-existing' deposit_id = self.create_deposit_partial() url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection_name, deposit_id]) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) self.assertIn('Unknown collection name %s' % collection_name, response.content.decode('utf-8'),) diff --git a/swh/deposit/tests/api/test_deposit_status.py b/swh/deposit/tests/api/test_deposit_status.py index 54e3c977..f3c0692d 100644 --- a/swh/deposit/tests/api/test_deposit_status.py +++ b/swh/deposit/tests/api/test_deposit_status.py @@ -1,121 +1,121 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.core.urlresolvers import reverse from io import BytesIO from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.models import Deposit, DEPOSIT_STATUS_DETAIL from swh.deposit.models import DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.parsers import parse_xml from ..common import BasicTestCase, WithAuthTestCase, FileSystemCreationRoutine from ..common import CommonCreationRoutine -from ...config import COL_IRI, STATE_IRI, DEPOSIT_STATUS_READY_FOR_CHECKS +from ...config import COL_IRI, STATE_IRI, DEPOSIT_STATUS_DEPOSITED class DepositStatusTestCase(APITestCase, WithAuthTestCase, BasicTestCase, FileSystemCreationRoutine, CommonCreationRoutine): """Status on deposit """ @istest def post_deposit_with_status_check(self): """Binary upload should be accepted """ # given url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id-1' # when response = self.client.post( url, content_type='application/zip', # as zip data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) deposit = Deposit.objects.get(external_id=external_id) status_url = reverse(STATE_IRI, args=[self.collection.name, deposit.id]) # check status status_response = self.client.get(status_url) self.assertEqual(status_response.status_code, status.HTTP_200_OK) r = parse_xml(BytesIO(status_response.content)) self.assertEqual(r['{http://www.w3.org/2005/Atom}deposit_id'], deposit.id) self.assertEqual(r['{http://www.w3.org/2005/Atom}deposit_status'], - DEPOSIT_STATUS_READY_FOR_CHECKS) + DEPOSIT_STATUS_DEPOSITED) self.assertEqual( r['{http://www.w3.org/2005/Atom}deposit_status_detail'], - DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_READY_FOR_CHECKS]) + DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_DEPOSITED]) @istest def status_with_swh_id(self): _status = DEPOSIT_STATUS_LOAD_SUCCESS _swh_id = '548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10' # given deposit_id = self.create_deposit_with_status( status=_status, swh_id=_swh_id) url = reverse(STATE_IRI, args=[self.collection.name, deposit_id]) # when status_response = self.client.get(url) # then self.assertEqual(status_response.status_code, status.HTTP_200_OK) r = parse_xml(BytesIO(status_response.content)) self.assertEqual(r['{http://www.w3.org/2005/Atom}deposit_id'], deposit_id) self.assertEqual(r['{http://www.w3.org/2005/Atom}deposit_status'], _status) self.assertEqual( r['{http://www.w3.org/2005/Atom}deposit_status_detail'], DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_LOAD_SUCCESS]) self.assertEqual(r['{http://www.w3.org/2005/Atom}deposit_swh_id'], _swh_id) @istest def status_on_unknown_deposit(self): """Asking for the status of unknown deposit returns 404 response""" status_url = reverse(STATE_IRI, args=[self.collection.name, 999]) status_response = self.client.get(status_url) self.assertEqual(status_response.status_code, status.HTTP_404_NOT_FOUND) @istest def status_with_http_accept_header_should_not_break(self): """Asking deposit status with Accept header should return 200 """ deposit_id = self.create_deposit_partial() status_url = reverse(STATE_IRI, args=[ self.collection.name, deposit_id]) response = self.client.get( status_url, HTTP_ACCEPT='text/html,application/xml;q=9,*/*,q=8') self.assertEqual(response.status_code, status.HTTP_200_OK) diff --git a/swh/deposit/tests/api/test_deposit_update_status.py b/swh/deposit/tests/api/test_deposit_update_status.py index 675f5ecb..3d6cc2da 100644 --- a/swh/deposit/tests/api/test_deposit_update_status.py +++ b/swh/deposit/tests/api/test_deposit_update_status.py @@ -1,119 +1,123 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from django.core.urlresolvers import reverse from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.models import Deposit, DEPOSIT_STATUS_DETAIL -from swh.deposit.config import PRIVATE_PUT_DEPOSIT, DEPOSIT_STATUS_READY +from swh.deposit.config import PRIVATE_PUT_DEPOSIT, DEPOSIT_STATUS_VERIFIED +from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from ..common import BasicTestCase class UpdateDepositStatusTest(APITestCase, BasicTestCase): """Update the deposit's status scenario """ def setUp(self): super().setUp() - deposit = Deposit(status=DEPOSIT_STATUS_READY, + deposit = Deposit(status=DEPOSIT_STATUS_VERIFIED, collection=self.collection, client=self.user) deposit.save() self.deposit = Deposit.objects.get(pk=deposit.id) - assert self.deposit.status == DEPOSIT_STATUS_READY + assert self.deposit.status == DEPOSIT_STATUS_VERIFIED @istest def update_deposit_status(self): """Existing status for update should return a 204 response """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) - possible_status = set(DEPOSIT_STATUS_DETAIL.keys()) - set(['success']) + possible_status = set(DEPOSIT_STATUS_DETAIL.keys()) - set( + [DEPOSIT_STATUS_LOAD_SUCCESS]) for _status in possible_status: response = self.client.put( url, content_type='application/json', data=json.dumps({'status': _status})) self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) deposit = Deposit.objects.get(pk=self.deposit.id) self.assertEquals(deposit.status, _status) @istest def update_deposit_with_success_loading_and_swh_id(self): """Existing status for update should return a 204 response """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) - expected_status = 'success' - expected_id = revision_id = '47dc6b4636c7f6cba0df83e3d5490bf4334d987e' + expected_status = DEPOSIT_STATUS_LOAD_SUCCESS + revision_id = '47dc6b4636c7f6cba0df83e3d5490bf4334d987e' + expected_id = 'swh:1:rev:%s' % revision_id + response = self.client.put( url, content_type='application/json', data=json.dumps({ 'status': expected_status, 'revision_id': revision_id, })) self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) deposit = Deposit.objects.get(pk=self.deposit.id) self.assertEquals(deposit.status, expected_status) self.assertEquals(deposit.swh_id, expected_id) @istest def update_deposit_status_will_fail_with_unknown_status(self): """Unknown status for update should return a 400 response """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) response = self.client.put( url, content_type='application/json', data=json.dumps({'status': 'unknown'})) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest def update_deposit_status_will_fail_with_no_status_key(self): """No status provided for update should return a 400 response """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) response = self.client.put( url, content_type='application/json', data=json.dumps({'something': 'something'})) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest def update_deposit_status_success_without_swh_id_fail(self): - """Providing 'success' status without swh_id should return a 400 + """Providing successful status without swh_id should return a 400 """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) response = self.client.put( url, content_type='application/json', - data=json.dumps({'status': 'success'})) + data=json.dumps({'status': DEPOSIT_STATUS_LOAD_SUCCESS})) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) diff --git a/swh/deposit/tests/api/test_service_document.py b/swh/deposit/tests/api/test_service_document.py index 6db695b4..453d16a1 100644 --- a/swh/deposit/tests/api/test_service_document.py +++ b/swh/deposit/tests/api/test_service_document.py @@ -1,101 +1,102 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.core.urlresolvers import reverse from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.tests import TEST_CONFIG from swh.deposit.config import SD_IRI from ..common import BasicTestCase, WithAuthTestCase class ServiceDocumentNoAuthCase(APITestCase, BasicTestCase): """Service document endpoints are protected with basic authentication. """ @istest def service_document_no_authentication_fails(self): """Without authentication, service document endpoint should return 401 """ url = reverse(SD_IRI) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) @istest def service_document_with_http_accept_should_not_break(self): """Without auth, sd endpoint through browser should return 401 """ url = reverse(SD_IRI) # when response = self.client.get( url, HTTP_ACCEPT='text/html,application/xml;q=9,*/*,q=8') self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) class ServiceDocumentCase(APITestCase, WithAuthTestCase, BasicTestCase): def assertResponseOk(self, response): self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEquals(response.content.decode('utf-8'), ''' 2.0 %s The Software Heritage (SWH) Archive %s Software Collection application/zip + application/x-tar Collection Policy Software Heritage Archive Collect, Preserve, Share false false http://purl.org/net/sword/package/SimpleZip https://deposit.softwareheritage.org/1/%s/ ''' % (TEST_CONFIG['max_upload_size'], self.username, self.username, self.username)) # noqa @istest def service_document(self): """With authentication, service document list user's collection """ url = reverse(SD_IRI) # when response = self.client.get(url) # then self.assertResponseOk(response) @istest def service_document_with_http_accept_header(self): """With authentication, with browser, sd list user's collection """ url = reverse(SD_IRI) # when response = self.client.get( url, HTTP_ACCEPT='text/html,application/xml;q=9,*/*,q=8') self.assertResponseOk(response) diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py index c87b19e2..31ac3181 100644 --- a/swh/deposit/tests/common.py +++ b/swh/deposit/tests/common.py @@ -1,466 +1,466 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import hashlib import os import shutil import tempfile from django.core.urlresolvers import reverse from django.test import TestCase from io import BytesIO from nose.plugins.attrib import attr from rest_framework import status from swh.deposit.config import COL_IRI, EM_IRI, EDIT_SE_IRI from swh.deposit.models import DepositClient, DepositCollection, Deposit from swh.deposit.models import DepositRequest from swh.deposit.models import DepositRequestType from swh.deposit.parsers import parse_xml from swh.deposit.settings.testing import MEDIA_ROOT from swh.core import tarball def create_arborescence_zip(root_path, archive_name, filename, content, up_to_size=None): """Build an archive named archive_name in the root_path. This archive contains one file named filename with the content content. Returns: dict with the keys: - dir: the directory of that archive - path: full path to the archive - sha1sum: archive's sha1sum - length: archive's length """ os.makedirs(root_path, exist_ok=True) archive_path_dir = tempfile.mkdtemp(dir=root_path) dir_path = os.path.join(archive_path_dir, archive_name) os.mkdir(dir_path) filepath = os.path.join(dir_path, filename) l = len(content) count = 0 batch_size = 128 with open(filepath, 'wb') as f: f.write(content) if up_to_size: # fill with blank content up to a given size count += l while count < up_to_size: f.write(b'0'*batch_size) count += batch_size zip_path = dir_path + '.zip' zip_path = tarball.compress(zip_path, 'zip', dir_path) with open(zip_path, 'rb') as f: length = 0 sha1sum = hashlib.sha1() md5sum = hashlib.md5() data = b'' for chunk in f: sha1sum.update(chunk) md5sum.update(chunk) length += len(chunk) data += chunk return { 'dir': archive_path_dir, 'name': archive_name, 'data': data, 'path': zip_path, 'sha1sum': sha1sum.hexdigest(), 'md5sum': md5sum.hexdigest(), 'length': length, } @attr('fs') class FileSystemCreationRoutine(TestCase): """Mixin intended for tests needed to tamper with archives. """ def setUp(self): """Define the test client and other test variables.""" super().setUp() self.root_path = '/tmp/swh-deposit/test/build-zip/' os.makedirs(self.root_path, exist_ok=True) self.archive = create_arborescence_zip( self.root_path, 'archive1', 'file1', b'some content in file') self.atom_entry = b""" Awesome Compiler urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 1785io25c695 2017-10-07T15:17:08Z some awesome author https://hal-test.archives-ouvertes.fr """ def tearDown(self): super().tearDown() shutil.rmtree(self.root_path) def create_simple_binary_deposit(self, status_partial=True): response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/zip', data=self.archive['data'], CONTENT_LENGTH=self.archive['length'], HTTP_MD5SUM=self.archive['md5sum'], HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial, HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % ( self.archive['name'], )) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] return deposit_id def create_complex_binary_deposit(self, status_partial=False): deposit_id = self.create_simple_binary_deposit( status_partial=True) # Add a second archive to the deposit - # update its status to DEPOSIT_STATUS_READY + # update its status to DEPOSIT_STATUS_VERIFIED response = self.client.post( reverse(EM_IRI, args=[self.collection.name, deposit_id]), content_type='application/zip', data=self.archive2['data'], CONTENT_LENGTH=self.archive2['length'], HTTP_MD5SUM=self.archive2['md5sum'], HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial, HTTP_CONTENT_DISPOSITION='attachment; filename=filename1.zip') # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] return deposit_id def update_binary_deposit(self, deposit_id, status_partial=False): # update existing deposit with atom entry metadata response = self.client.post( reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]), content_type='application/atom+xml;type=entry', data=self.codemeta_entry_data1, HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial) # then # assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] return deposit_id @attr('fs') class BasicTestCase(TestCase): """Mixin intended for data setup purposes (user, collection, etc...) """ def setUp(self): """Define the test client and other test variables.""" super().setUp() # expanding diffs in tests self.maxDiff = None # basic minimum test data deposit_request_types = {} # Add deposit request types for deposit_request_type in ['archive', 'metadata']: drt = DepositRequestType(name=deposit_request_type) drt.save() deposit_request_types[deposit_request_type] = drt _name = 'hal' _provider_url = 'https://hal-test.archives-ouvertes.fr/' _domain = 'archives-ouvertes.fr/' # set collection up _collection = DepositCollection(name=_name) _collection.save() # set user/client up _client = DepositClient.objects.create_user(username=_name, password=_name, provider_url=_provider_url, domain=_domain) _client.collections = [_collection.id] _client.save() self.collection = _collection self.user = _client self.username = _name self.userpass = _name self.deposit_request_types = deposit_request_types def tearDown(self): super().tearDown() # Clean up uploaded files in temporary directory (tests have # their own media root folder) if os.path.exists(MEDIA_ROOT): for d in os.listdir(MEDIA_ROOT): shutil.rmtree(os.path.join(MEDIA_ROOT, d)) class WithAuthTestCase(TestCase): """Mixin intended for testing the api with basic authentication. """ def setUp(self): super().setUp() _token = '%s:%s' % (self.username, self.userpass) token = base64.b64encode(_token.encode('utf-8')) authorization = 'Basic %s' % token.decode('utf-8') self.client.credentials(HTTP_AUTHORIZATION=authorization) def tearDown(self): super().tearDown() self.client.credentials() class CommonCreationRoutine(TestCase): """Mixin class to share initialization routine. cf: `class`:test_deposit_update.DepositReplaceExistingDataTest `class`:test_deposit_update.DepositUpdateDepositWithNewDataTest `class`:test_deposit_update.DepositUpdateFailuresTest `class`:test_deposit_delete.DepositDeleteTest """ def setUp(self): super().setUp() self.atom_entry_data0 = b""" some-external-id https://hal-test.archives-ouvertes.fr/some-external-id """ self.atom_entry_data1 = b""" anotherthing https://hal-test.archives-ouvertes.fr/anotherthing """ self.atom_entry_data2 = b""" Awesome Compiler urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 1785io25c695 2017-10-07T15:17:08Z some awesome author https://hal-test.archives-ouvertes.fr/id """ self.codemeta_entry_data0 = b""" Awesome Compiler https://hal-test.archives-ouvertes.fr/1785io25c695 urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 1785io25c695 2017-10-07T15:17:08Z some awesome author description key-word 1 """ self.codemeta_entry_data1 = b""" Composing a Web of Audio Applications hal hal-01243065 hal-01243065 https://hal-test.archives-ouvertes.fr/hal-01243065 test DSP programming,Web 2017-05-03T16:08:47+02:00 this is the description 1 phpstorm stable php python C GNU General Public License v3.0 only CeCILL Free Software License Agreement v1.1 HAL hal@ccsd.cnrs.fr Morane Gruenpeter """ def create_invalid_deposit(self, external_id='some-external-id-1'): url = reverse(COL_IRI, args=[self.collection.name]) data = b'some data which is clearly not a zip file' md5sum = hashlib.md5(data).hexdigest() # when response = self.client.post( url, content_type='application/zip', # as zip data=data, # + headers CONTENT_LENGTH=len(data), # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=md5sum, HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] return deposit_id def create_deposit_with_status( self, status, external_id='some-external-id-1', swh_id=None): deposit_id = self.create_invalid_deposit(external_id) # We cannot create some form of deposit with a given status in # test context ('rejected' for example). As flipped off the # checks in the configuration so all deposits have the status - # ready-for-checks). Update in place the deposit with such + # deposited). Update in place the deposit with such # status deposit = Deposit.objects.get(pk=deposit_id) deposit.status = status if swh_id: deposit.swh_id = swh_id deposit.save() return deposit_id def create_simple_deposit_partial(self, external_id='some-external-id'): """Create a simple deposit (1 request) in `partial` state and returns its new identifier. Returns: deposit id """ response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=self.atom_entry_data0, HTTP_SLUG=external_id, HTTP_IN_PROGRESS='true') assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] return deposit_id def create_deposit_partial_with_data_in_args(self, data): """Create a simple deposit (1 request) in `partial` state with the data or metadata as an argument and returns its new identifier. Args: data: atom entry Returns: deposit id """ response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=data, HTTP_SLUG='external-id', HTTP_IN_PROGRESS='true') assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content[ '{http://www.w3.org/2005/Atom}deposit_id'] return deposit_id def _update_deposit_with_status(self, deposit_id, status_partial=False): """Add to a given deposit another archive and update its current - status to `ready-for-checks` (by default). + status to `deposited` (by default). Returns: deposit id """ # when response = self.client.post( reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]), content_type='application/atom+xml;type=entry', data=self.atom_entry_data1, HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial) # then assert response.status_code == status.HTTP_201_CREATED return deposit_id def create_deposit_ready(self, external_id='some-external-id'): - """Create a complex deposit (2 requests) in status `ready-for-checks`. + """Create a complex deposit (2 requests) in status `deposited`. """ deposit_id = self.create_simple_deposit_partial( external_id=external_id) deposit_id = self._update_deposit_with_status(deposit_id) return deposit_id def create_deposit_partial(self, external_id='some-external-id'): """Create a complex deposit (2 requests) in status `partial`. """ deposit_id = self.create_simple_deposit_partial( external_id=external_id) deposit_id = self._update_deposit_with_status( deposit_id, status_partial=True) return deposit_id def add_metadata_to_deposit(self, deposit_id, status_partial=False): """Add metadata to deposit. """ # when response = self.client.post( reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]), content_type='application/atom+xml;type=entry', data=self.codemeta_entry_data1, HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial) assert response.status_code == status.HTTP_201_CREATED # then deposit = Deposit.objects.get(pk=deposit_id) assert deposit is not None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert deposit_requests is not [] for dr in deposit_requests: if dr.type.name == 'metadata': assert deposit_requests[0].metadata is not {} return deposit_id diff --git a/swh/deposit/tests/loader/test_checker.py b/swh/deposit/tests/loader/test_checker.py index 55d8cd9a..6747b269 100644 --- a/swh/deposit/tests/loader/test_checker.py +++ b/swh/deposit/tests/loader/test_checker.py @@ -1,72 +1,72 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from nose.tools import istest from rest_framework.test import APITestCase from swh.deposit.models import Deposit -from swh.deposit.config import PRIVATE_CHECK_DEPOSIT, DEPOSIT_STATUS_READY +from swh.deposit.config import PRIVATE_CHECK_DEPOSIT, DEPOSIT_STATUS_VERIFIED from swh.deposit.config import DEPOSIT_STATUS_REJECTED from swh.deposit.loader.checker import DepositChecker from django.core.urlresolvers import reverse from .common import SWHDepositTestClient, CLIENT_TEST_CONFIG from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine from ..common import FileSystemCreationRoutine class DepositCheckerScenarioTest(APITestCase, WithAuthTestCase, BasicTestCase, CommonCreationRoutine, FileSystemCreationRoutine): def setUp(self): super().setUp() # 2. Sets a basic client which accesses the test data checker_client = SWHDepositTestClient(client=self.client, config=CLIENT_TEST_CONFIG) # 3. setup loader with no persistence and that client self.checker = DepositChecker(client=checker_client) @istest def check_deposit_ready(self): - """Check a valid deposit ready-for-checks should result in ready state + """Check a valid deposit deposited should result in ready state """ # 1. create a deposit with archive and metadata deposit_id = self.create_simple_binary_deposit() deposit_id = self.update_binary_deposit(deposit_id, status_partial=False) args = [self.collection.name, deposit_id] deposit_check_url = reverse(PRIVATE_CHECK_DEPOSIT, args=args) # when actual_status = self.checker.check(deposit_check_url=deposit_check_url) # then deposit = Deposit.objects.get(pk=deposit_id) - self.assertEquals(deposit.status, DEPOSIT_STATUS_READY) - self.assertEquals(actual_status, DEPOSIT_STATUS_READY) + self.assertEquals(deposit.status, DEPOSIT_STATUS_VERIFIED) + self.assertEquals(actual_status, DEPOSIT_STATUS_VERIFIED) @istest def check_deposit_rejected(self): - """Check an invalid deposit ready-for-checks should result in rejected + """Check an invalid deposit deposited should result in rejected """ # 1. create a deposit with archive and metadata deposit_id = self.create_invalid_deposit() args = [self.collection.name, deposit_id] deposit_check_url = reverse(PRIVATE_CHECK_DEPOSIT, args=args) # when actual_status = self.checker.check(deposit_check_url=deposit_check_url) # then deposit = Deposit.objects.get(pk=deposit_id) self.assertEquals(deposit.status, DEPOSIT_STATUS_REJECTED) self.assertEquals(actual_status, DEPOSIT_STATUS_REJECTED) diff --git a/swh/deposit/tests/loader/test_client.py b/swh/deposit/tests/loader/test_client.py index d24ead3a..c4ec4963 100644 --- a/swh/deposit/tests/loader/test_client.py +++ b/swh/deposit/tests/loader/test_client.py @@ -1,265 +1,268 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import tempfile import unittest from nose.plugins.attrib import attr from nose.tools import istest from swh.deposit.loader.client import DepositClient - +from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS +from swh.deposit.config import DEPOSIT_STATUS_LOAD_FAILURE from .common import CLIENT_TEST_CONFIG class StreamedResponse: """Streamed response facsimile """ def __init__(self, ok, stream): self.ok = ok self.stream = stream def iter_content(self): yield from self.stream class FakeRequestClientGet: """Fake request client dedicated to get method calls. """ def __init__(self, response): self.response = response def get(self, *args, **kwargs): self.args = args self.kwargs = kwargs return self.response @attr('fs') class DepositClientReadArchiveTest(unittest.TestCase): def setUp(self): super().setUp() self.temporary_directory = tempfile.mkdtemp(dir='/tmp') def tearDown(self): super().setUp() shutil.rmtree(self.temporary_directory) @istest def archive_get(self): """Reading archive should write data in temporary directory """ stream_content = [b"some", b"streamed", b"response"] response = StreamedResponse( ok=True, stream=(s for s in stream_content)) _client = FakeRequestClientGet(response) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) archive_path = os.path.join(self.temporary_directory, 'test.archive') archive_path = deposit_client.archive_get('/some/url', archive_path) self.assertTrue(os.path.exists(archive_path)) with open(archive_path, 'rb') as f: actual_content = f.read() self.assertEquals(actual_content, b''.join(stream_content)) self.assertEquals(_client.args, ('http://nowhere:9000/some/url', )) self.assertEquals(_client.kwargs, { 'stream': True }) @istest def archive_get_with_authentication(self): """Reading archive should write data in temporary directory """ stream_content = [b"some", b"streamed", b"response", b"for", b"auth"] response = StreamedResponse( ok=True, stream=(s for s in stream_content)) _client = FakeRequestClientGet(response) _config = CLIENT_TEST_CONFIG.copy() _config['auth'] = { # add authentication setup 'username': 'user', 'password': 'pass' } deposit_client = DepositClient(_config, _client=_client) archive_path = os.path.join(self.temporary_directory, 'test.archive') archive_path = deposit_client.archive_get('/some/url', archive_path) self.assertTrue(os.path.exists(archive_path)) with open(archive_path, 'rb') as f: actual_content = f.read() self.assertEquals(actual_content, b''.join(stream_content)) self.assertEquals(_client.args, ('http://nowhere:9000/some/url', )) self.assertEquals(_client.kwargs, { 'stream': True, 'auth': ('user', 'pass') }) @istest def archive_get_can_fail(self): """Reading archive can fail for some reasons """ response = StreamedResponse(ok=False, stream=None) _client = FakeRequestClientGet(response) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when retrieving deposit archive'): deposit_client.archive_get('/some/url', 'some/path') class JsonResponse: """Json response facsimile """ def __init__(self, ok, response): self.ok = ok self.response = response def json(self): return self.response class DepositClientReadMetadataTest(unittest.TestCase): @istest def metadata_get(self): """Reading archive should write data in temporary directory """ expected_response = {"some": "dict"} response = JsonResponse( ok=True, response=expected_response) _client = FakeRequestClientGet(response) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) actual_metadata = deposit_client.metadata_get('/metadata') self.assertEquals(actual_metadata, expected_response) @istest def metadata_get_can_fail(self): """Reading metadata can fail for some reasons """ _client = FakeRequestClientGet(JsonResponse(ok=False, response=None)) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when retrieving metadata at'): deposit_client.metadata_get('/some/metadata/url') class FakeRequestClientPut: """Fake Request client dedicated to put request method calls. """ args = None kwargs = None def put(self, *args, **kwargs): self.args = args self.kwargs = kwargs class DepositClientStatusUpdateTest(unittest.TestCase): @istest def status_update(self): """Update status """ _client = FakeRequestClientPut() deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) deposit_client.status_update('/update/status', - 'success', revision_id='some-revision-id') + DEPOSIT_STATUS_LOAD_SUCCESS, + revision_id='some-revision-id') self.assertEquals(_client.args, ('http://nowhere:9000/update/status', )) self.assertEquals(_client.kwargs, { 'json': { - 'status': 'success', + 'status': DEPOSIT_STATUS_LOAD_SUCCESS, 'revision_id': 'some-revision-id', } }) @istest def status_update_with_no_revision_id(self): """Reading metadata can fail for some reasons """ _client = FakeRequestClientPut() deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) - deposit_client.status_update('/update/status/fail', 'failure') + deposit_client.status_update('/update/status/fail', + DEPOSIT_STATUS_LOAD_FAILURE) self.assertEquals(_client.args, ('http://nowhere:9000/update/status/fail', )) self.assertEquals(_client.kwargs, { 'json': { - 'status': 'failure', + 'status': DEPOSIT_STATUS_LOAD_FAILURE, } }) class DepositClientCheckTest(unittest.TestCase): @istest def check(self): """When check ok, this should return the deposit's status """ _client = FakeRequestClientGet( JsonResponse(ok=True, response={'status': 'something'})) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) r = deposit_client.check('/check') self.assertEquals(_client.args, ('http://nowhere:9000/check', )) self.assertEquals(_client.kwargs, {}) self.assertEquals(r, 'something') @istest def check_fails(self): """Checking deposit can fail for some reason """ _client = FakeRequestClientGet( JsonResponse(ok=False, response=None)) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when checking deposit'): deposit_client.check('/check/fails') self.assertEquals(_client.args, ('http://nowhere:9000/check/fails', )) self.assertEquals(_client.kwargs, {}) diff --git a/version.txt b/version.txt index bbce33e9..a7ff6e57 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.42-0-g709c28a \ No newline at end of file +v0.0.43-0-gcb97e84 \ No newline at end of file