diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..10e97abb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+*.pyc
+*.sw?
+*~
+/.coverage
+/.coverage.*
+.eggs/
+__pycache__
+*.egg-info/
+version.txt
+build/
+dist/
+/analysis.org
+/swh/deposit/fixtures/private_data.yaml
+/swh/deposit.json
+/test.json
+/swh/test
+db.sqlite3
+/.noseids
+*.tgz
+*.zip
+*.tar.gz
+*.tar.bz2
+*.tar.lzma
+.tox/
+.mypy_cache/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..ad76d740
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,49 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v2.4.0
+ hooks:
+ - id: trailing-whitespace
+ - id: check-json
+ - id: check-yaml
+
+- repo: https://gitlab.com/pycqa/flake8
+ rev: 3.8.3
+ hooks:
+ - id: flake8
+
+- repo: https://github.com/codespell-project/codespell
+ rev: v1.16.0
+ hooks:
+ - id: codespell
+
+- repo: local
+ hooks:
+ - id: mypy
+ name: mypy
+ entry: env DJANGO_SETTINGS_MODULE=swh.deposit.settings.testing mypy
+ args: [swh]
+ pass_filenames: false
+ language: system
+ types: [python]
+
+- repo: https://github.com/PyCQA/isort
+ rev: 5.5.2
+ hooks:
+ - id: isort
+
+- repo: https://github.com/python/black
+ rev: 19.10b0
+ hooks:
+ - id: black
+
+# unfortunately, we are far from being able to enable this...
+# - repo: https://github.com/PyCQA/pydocstyle.git
+# rev: 4.0.0
+# hooks:
+# - id: pydocstyle
+# name: pydocstyle
+# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions.
+# entry: pydocstyle --convention=google
+# language: python
+# types: [python]
+
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 00000000..7a5c79d9
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,3 @@
+Copyright (C) 2015-2016 The Software Heritage developers
+
+See http://www.softwareheritage.org/ for more information.
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..0ad22b51
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,78 @@
+# Software Heritage Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as Software
+Heritage contributors and maintainers pledge to making participation in our
+project and our community a harassment-free experience for everyone, regardless
+of age, body size, disability, ethnicity, sex characteristics, gender identity
+and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity and
+orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at `conduct@softwareheritage.org`. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an
+incident. Further details of specific enforcement policies may be posted
+separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 00000000..7c3f9625
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1 @@
+Ishan Bhanuka
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..94a9ed02
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc.
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
index 00000000..882a723c
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1,34 @@
+FLAKEFLAGS='--exclude=swh/deposit/manage.py,swh/deposit/settings.py,swh/deposit/migrations/'
+
+MANAGE=python3 -m swh.deposit.manage
+
+db-drop:
+ dropdb swh-deposit-dev || return 0
+
+db-create: db-drop
+ createdb swh-deposit-dev
+
+db-prepare:
+ $(MANAGE) makemigrations
+
+db-migrate:
+ $(MANAGE) migrate
+
+db-load-data:
+ $(MANAGE) loaddata deposit_data
+
+db-load-private-data: db-load-data
+ $(MANAGE) loaddata ../private_data.yaml
+
+run-dev:
+ $(MANAGE) runserver
+
+run:
+ gunicorn3 -b 127.0.0.1:5006 swh.deposit.wsgi
+
+# Override default rule to make sure DJANGO env var is properly set. It
+# *should* work without any override thanks to the mypy django-stubs plugin,
+# but it currently doesn't; see
+# https://github.com/typeddjango/django-stubs/issues/166
+check-mypy:
+ DJANGO_SETTINGS_MODULE=swh.deposit.settings.testing $(MYPY) $(MYPYFLAGS) swh
diff --git a/PKG-INFO b/PKG-INFO
index 0e524373..5b467b4c 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,37 +1,37 @@
Metadata-Version: 2.1
Name: swh.deposit
-Version: 0.0.90
+Version: 0.1.0
Summary: Software Heritage Deposit Server
Home-page: https://forge.softwareheritage.org/source/swh-deposit/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-deposit
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-deposit/
Description: # swh-deposit
This is [Software Heritage](https://www.softwareheritage.org)'s
[SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server
implementation, as well as a simple client to upload deposits on the server.
**S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository
**D**eposit) is an interoperability standard for digital file deposit.
This implementation will permit interaction between a client (a
repository) and a server (SWH repository) to permit deposits of
software source code archives and associated metadata.
The documentation is at ./docs/README-specification.md
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
Provides-Extra: server
diff --git a/bin/Makefile b/bin/Makefile
new file mode 100644
index 00000000..7a8898ff
--- /dev/null
+++ b/bin/Makefile
@@ -0,0 +1,45 @@
+DEPOSIT_ID=1
+ARCHIVE=../../swh-deposit.zip
+ARCHIVE2=../../swh-model.zip
+STATUS=--no-partial
+PARTIAL_STATUS=--partial
+UPDATE_STATUS='done'
+ATOM_ENTRY=../../atom-entry.xml
+EXTERNAL_ID='external-id'
+
+create-archives:
+ 7z a $(ARCHIVE) $(FOLDER)
+ 7z a $(ARCHIVE2) $(FOLDER2)
+
+new:
+ ./create_deposit.sh $(ARCHIVE) $(STATUS)
+
+new-complete:
+ ./create_deposit_with_metadata.sh $(ARCHIVE) $(ATOM_ENTRY) $(STATUS) $(EXTERNAL_ID)
+
+new-partial:
+ make new STATUS=$(PARTIAL_STATUS) ARCHIVE=$(ARCHIVE)
+
+update:
+ ./update-deposit-with-another-archive.sh $(DEPOSIT_ID) $(ARCHIVE2) $(STATUS)
+
+update-partial:
+ make update DEPOSIT_ID=$(DEPOSIT_ID) ARCHIVE2=$(ARCHIVE2) STATUS=$(PARTIAL_STATUS)
+
+replace:
+ ./replace-deposit-archive.sh $(ARCHIVE2) $(DEPOSIT_ID)
+
+download:
+ ./download-deposit-archive.sh $(DEPOSIT_ID)
+
+status:
+ ./status.sh $(DEPOSIT_ID)
+
+service-document:
+ ./service-document.sh
+
+home:
+ ./home.sh
+
+update-status:
+ ./update-status.sh $(DEPOSIT_ID) $(UPDATE_STATUS)
diff --git a/bin/content.sh b/bin/content.sh
new file mode 100755
index 00000000..08012f94
--- /dev/null
+++ b/bin/content.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+DEPOSIT_ID=${1-1}
+
+curl -i -u "${CREDS}" ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/content/
diff --git a/bin/create_deposit.sh b/bin/create_deposit.sh
new file mode 100755
index 00000000..41e4010b
--- /dev/null
+++ b/bin/create_deposit.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+ARCHIVE=${1-'../../deposit.zip'}
+
+STATUS=${2-'--no-partial'}
+
+./swh-deposit \
+ --username ${USER} \
+ --password ${PASSWORD} \
+ --collection ${COLLECTION} \
+ --archive-deposit \
+ --archive ${ARCHIVE} \
+ ${STATUS} \
+ --url ${SERVER}/1
diff --git a/bin/create_deposit_atom.sh b/bin/create_deposit_atom.sh
new file mode 100755
index 00000000..db0c92d5
--- /dev/null
+++ b/bin/create_deposit_atom.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+ATOM=${1-'../../atom.xml'}
+PROGRESS=${2-'false'}
+
+curl -i -u "$CREDS" \
+ --data-binary @${ATOM} \
+ -X POST \
+ -H "In-Progress: ${PROGRESS}" \
+ -H 'Content-Type: application/atom+xml;type=entry' \
+ -H 'Slug: external-id' \
+ -H 'Packaging: http://purl.org/net/sword/package/SimpleZip' \
+ ${SERVER}/1/${COLLECTION}/
diff --git a/bin/create_deposit_with_metadata.sh b/bin/create_deposit_with_metadata.sh
new file mode 100755
index 00000000..d93a85bc
--- /dev/null
+++ b/bin/create_deposit_with_metadata.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+ARCHIVE=${1-'../../swh-deposit.zip'}
+ATOM_ENTRY=${2-'../../atom-entry.xml'}
+
+STATUS=${3-'--no-partial'}
+EXTERNAL_ID=${4-'external-id'}
+
+./swh-deposit \
+ --username ${USER} \
+ --password ${PASSWORD} \
+ --collection ${COLLECTION} \
+ --archive-deposit \
+ --archive ${ARCHIVE} \
+ --metadata-deposit \
+ --metadata ${ATOM_ENTRY} \
+ --slug ${EXTERNAL_ID} \
+ ${STATUS} \
+ --url ${SERVER}/1
diff --git a/bin/default-setup b/bin/default-setup
new file mode 100644
index 00000000..c27054f9
--- /dev/null
+++ b/bin/default-setup
@@ -0,0 +1,5 @@
+SERVER=http://127.0.0.1:5006
+USER='hal'
+PASSWORD='hal'
+COLLECTION=hal
+CREDS="$USER:$PASSWORD"
diff --git a/bin/download-deposit-archive.sh b/bin/download-deposit-archive.sh
new file mode 100755
index 00000000..2b875e31
--- /dev/null
+++ b/bin/download-deposit-archive.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+DEPOSIT_ID=${1-1}
+
+curl ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/raw/
diff --git a/bin/home.sh b/bin/home.sh
new file mode 100755
index 00000000..d3cf2df4
--- /dev/null
+++ b/bin/home.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+curl ${SERVER}
+echo
diff --git a/bin/replace-deposit-archive.sh b/bin/replace-deposit-archive.sh
new file mode 100755
index 00000000..5b2b50d2
--- /dev/null
+++ b/bin/replace-deposit-archive.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+ARCHIVE=${1-'../../swh-model.zip'}
+NAME=$(basename ${ARCHIVE})
+
+MD5=$(md5sum ${ARCHIVE} | cut -f 1 -d' ')
+
+DEPOSIT_ID=${2-1}
+
+curl -i -u "$CREDS" \
+ -X PUT \
+ --data-binary @${ARCHIVE} \
+ -H "In-Progress: false" \
+ -H "Content-MD5: ${MD5}" \
+ -H "Content-Disposition: attachment; filename=${NAME}" \
+ -H 'Slug: external-id' \
+ -H 'Packaging: http://purl.org/net/sword/package/SimpleZip' \
+ -H 'Content-type: application/zip' \
+ ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/media/
diff --git a/bin/service-document.sh b/bin/service-document.sh
new file mode 100755
index 00000000..f7818a11
--- /dev/null
+++ b/bin/service-document.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+curl -i -u "${CREDS}" ${SERVER}/1/servicedocument/
diff --git a/bin/status.sh b/bin/status.sh
new file mode 100755
index 00000000..6d3662d1
--- /dev/null
+++ b/bin/status.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+DEPOSIT_ID=${1-1}
+
+./swh-deposit \
+ --username ${USER} \
+ --password ${PASSWORD} \
+ --collection ${COLLECTION} \
+ --status \
+ --deposit-id ${DEPOSIT_ID} \
+ --url ${SERVER}/1
diff --git a/bin/update-deposit-with-another-archive.sh b/bin/update-deposit-with-another-archive.sh
new file mode 100755
index 00000000..644e7b80
--- /dev/null
+++ b/bin/update-deposit-with-another-archive.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+DEPOSIT_ID=${1-1}
+ARCHIVE=${2-'../../swh-core.zip'}
+
+NAME=$(basename ${ARCHIVE})
+MD5=$(md5sum ${ARCHIVE} | cut -f 1 -d' ')
+PROGRESS=${3-'false'}
+
+curl -i -u "${CREDS}" \
+ -X POST \
+ --data-binary @${ARCHIVE} \
+ -H "In-Progress: ${PROGRESS}" \
+ -H "Content-MD5: ${MD5}" \
+ -H "Content-Disposition: attachment; filename=${NAME}" \
+ -H 'Slug: external-id-2' \
+ -H 'Packaging: http://purl.org/net/sword/package/SimpleZip' \
+ -H 'Content-type: application/zip' \
+ ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/media/
diff --git a/bin/update-status.sh b/bin/update-status.sh
new file mode 100755
index 00000000..c5925a41
--- /dev/null
+++ b/bin/update-status.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+. ./default-setup
+
+DEPOSIT_ID=${1-1}
+UPDATE_STATUS=${2-'done'}
+
+curl -i \
+ -X PUT \
+ -H 'Content-Type: application/json' \
+ -d "{\"status\": \"${UPDATE_STATUS}\"}" \
+ ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/update/
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 00000000..16d82778
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,15 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.storage.pytest_plugin"]
+
+
+@pytest.fixture(scope="session")
+def swh_scheduler_celery_includes(swh_scheduler_celery_includes):
+ return swh_scheduler_celery_includes + [
+ "swh.deposit.loader.tasks",
+ ]
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 00000000..e379dea1
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,4 @@
+_build/
+apidoc/
+*-stamp
+
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 00000000..42355755
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,3 @@
+include ../../swh-docs/Makefile.sphinx
+
+APIDOC_EXCLUDES += ../swh/*/settings/*
diff --git a/swh/deposit/api/__init__.py b/docs/_static/.placeholder
similarity index 100%
copy from swh/deposit/api/__init__.py
copy to docs/_static/.placeholder
diff --git a/swh/deposit/api/__init__.py b/docs/_templates/.placeholder
similarity index 100%
copy from swh/deposit/api/__init__.py
copy to docs/_templates/.placeholder
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 00000000..5a0b8f31
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,8 @@
+import os
+
+import django
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "swh.deposit.settings.development")
+django.setup()
+
+from swh.docs.sphinx.conf import * # NoQA
diff --git a/docs/dev-info.rst b/docs/dev-info.rst
new file mode 100644
index 00000000..1d613d0c
--- /dev/null
+++ b/docs/dev-info.rst
@@ -0,0 +1,176 @@
+Hacking on swh-deposit
+======================
+
+There are multiple modes to run and test the server locally:
+
+* development-like (automatic reloading when code changes)
+* production-like (no reloading)
+* integration tests (no side effects)
+
+Except for the tests which are mostly side effects free (except for the
+database access), the other modes will need some configuration files (up to 2)
+to run properly.
+
+Database
+--------
+
+swh-deposit uses a database to store the state of a deposit. The default
+db is expected to be called swh-deposit-dev.
+
+To simplify the use, the following makefile targets can be used:
+
+schema
+~~~~~~
+
+.. code:: shell
+
+ make db-create db-prepare db-migrate
+
+data
+~~~~
+
+Once the db is created, you need some data to be injected (request
+types, client, collection, etc...):
+
+.. code:: shell
+
+ make db-load-data db-load-private-data
+
+The private data are about having a user (``hal``) with a password
+(``hal``) who can access a collection (``hal``).
+
+Add the following to ``../private_data.yaml``:
+
+.. code:: yaml
+
+ - model: deposit.depositclient
+ fields:
+ user_ptr_id: 1
+ collections:
+ - 1
+ - model: auth.User
+ pk: 1
+ fields:
+ first_name: hal
+ last_name: hal
+ username: hal
+ password: "pbkdf2_sha256$30000$8lxjoGc9PiBm$DO22vPUJCTM17zYogBgBg5zr/97lH4pw10Mqwh85yUM="
+ - model: deposit.depositclient
+ fields:
+ user_ptr_id: 1
+ collections:
+ - 1
+ url: https://hal.inria.fr
+
+drop
+~~~~
+
+For information, you can drop the db:
+
+.. code:: shell
+
+ make db-drop
+
+Development-like environment
+----------------------------
+
+Development-like environment needs one configuration file to work
+properly.
+
+Configuration
+~~~~~~~~~~~~~
+
+**``{/etc/softwareheritage | ~/.config/swh | ~/.swh}``/deposit/server.yml**:
+
+.. code:: yaml
+
+ # dev option for running the server locally
+ host: 127.0.0.1
+ port: 5006
+
+ # production
+ authentication:
+ activated: true
+ white-list:
+ GET:
+ - /
+
+ # 20 Mib max size
+ max_upload_size: 20971520
+
+Run
+~~~
+
+Run the local server, using the default configuration file:
+
+.. code:: shell
+
+ make run-dev
+
+Production-like environment
+---------------------------
+
+Production-like environment needs additional section in the
+configuration file to work properly.
+
+This is closer to what's actually running in production.
+
+Configuration
+~~~~~~~~~~~~~
+
+This expects the same file described in the previous chapter. Plus, an
+additional private section file containing private information that is
+not in the source code repository.
+
+**``{/etc/softwareheritage | ~/.config/swh | ~/.swh}``/deposit/private.yml**:
+
+.. code:: yaml
+
+ private:
+ secret_key: production-local
+ db:
+ name: swh-deposit-dev
+
+A production configuration file would look like:
+
+.. code:: yaml
+
+ private:
+ secret_key: production-secret-key
+ db:
+ name: swh-deposit-dev
+ host: db
+ port: 5467
+ user: user
+ password: user-password
+
+Run
+~~~
+
+.. code:: shell
+
+ make run
+
+Note: This expects gunicorn3 package installed on the system
+
+Tests
+-----
+
+To run the tests:
+
+.. code:: shell
+
+ make test
+
+As explained, those tests are mostly side-effect free. The db part is
+dealt with by django. The remaining part which patches those side-effect
+behavior is dealt with in the ``swh/deposit/tests/__init__.py`` module.
+
+Sum up
+------
+
+Prepare everything for your user to run:
+
+.. code:: shell
+
+ make db-drop db-create db-prepare db-migrate db-load-private-data run-dev
diff --git a/docs/endpoints/collection.rst b/docs/endpoints/collection.rst
new file mode 100644
index 00000000..53219258
--- /dev/null
+++ b/docs/endpoints/collection.rst
@@ -0,0 +1,73 @@
+Create deposit
+^^^^^^^^^^^^^^^
+
+.. http:post:: /1//
+
+ Create deposit in a collection.
+
+ The client sends a deposit request to a specific collection with:
+
+ * an archive holding the software source code (binary upload)
+  * an envelope with metadata describing information regarding a deposit (atom
+ entry deposit)
+
+ Also known as: COL-IRI
+
+ :param text : the client's credentials
+ :param text Content-Type: accepted mimetype
+ :param int Content-Length: tarball size
+ :param text Content-MD5: md5 checksum hex encoded of the tarball
+ :param text Content-Disposition: attachment; filename=[filename]; the filename
+ parameter must be text (ascii)
+ :param text Content-Disposition: for the metadata file set name parameter
+ to 'atom'.
+ :param bool In-progress: true if not final; false when final request.
+ :statuscode 201: success for deposit on POST
+ :statuscode 401: Unauthorized
+ :statuscode 404: access to an unknown collection
+ :statuscode 415: unsupported media type
+
+Sample request
+~~~~~~~~~~~~~~~
+.. code:: shell
+
+ curl -i -u hal: \
+ -F "file=@../deposit.json;type=application/zip;filename=payload" \
+ -F "atom=@../atom-entry.xml;type=application/atom+xml;charset=UTF-8" \
+ -H 'In-Progress: false' \
+ -H 'Slug: some-external-id' \
+ -XPOST https://deposit.softwareheritage.org/1/hal/
+
+Sample response
+~~~~~~~~~~~~~~~
+
+.. code:: shell
+
+ HTTP/1.0 201 Created
+ Date: Tue, 26 Sep 2017 10:32:35 GMT
+ Server: WSGIServer/0.2 CPython/3.5.3
+ Vary: Accept, Cookie
+ Allow: GET, POST, PUT, DELETE, HEAD, OPTIONS
+ Location: /1/hal/10/metadata/
+ X-Frame-Options: SAMEORIGIN
+ Content-Type: application/xml
+
+
+ 10
+ Sept. 26, 2017, 10:32 a.m.
+ None
+ deposited
+
+
+
+
+
+
+
+
+
+
+ http://purl.org/net/sword/package/SimpleZip
+
diff --git a/docs/endpoints/content.rst b/docs/endpoints/content.rst
new file mode 100644
index 00000000..ef89d1e9
--- /dev/null
+++ b/docs/endpoints/content.rst
@@ -0,0 +1,14 @@
+Display content
+^^^^^^^^^^^^^^^^
+
+.. http:get:: /1///content/
+
+ Display information on the content's representation in the sword
+ server.
+
+
+ Also known as: CONT-FILE-IRI
+
+ :param text : the client's credentials
+ :statuscode 200: no error
+ :statuscode 401: Unauthorized
diff --git a/docs/endpoints/service-document.rst b/docs/endpoints/service-document.rst
new file mode 100644
index 00000000..97a7af19
--- /dev/null
+++ b/docs/endpoints/service-document.rst
@@ -0,0 +1,48 @@
+Service document
+^^^^^^^^^^^^^^^^^
+
+.. http:get:: /1/servicedocument/
+
+ This is the starting endpoint for the client to discover its initial
+   collection. The answer to this query will describe:
+
+ * the server's abilities
+ * connected client's collection information
+
+ Also known as: SD-IRI - The Service Document IRI
+
+ :param text : the client's credentials
+ :statuscode 200: no error
+ :statuscode 401: Unauthorized
+
+
+
+Sample response
+~~~~~~~~~~~~~~~
+ .. code:: xml
+
+
+
+
+ 2.0
+ 20971520
+
+
+ The Software Heritage (SWH) archive
+
+ SWH Software Archive
+ application/zip
+ application/x-tar
+ Collection Policy
+ Software Heritage Archive
+ false
+ false
+ Collect, Preserve, Share
+ http://purl.org/net/sword/package/SimpleZip
+ https://deposit.softwareheritage.org/1/hal/
+
+
+
diff --git a/docs/endpoints/status.rst b/docs/endpoints/status.rst
new file mode 100644
index 00000000..ca773b0b
--- /dev/null
+++ b/docs/endpoints/status.rst
@@ -0,0 +1,74 @@
+Retrieve status
+^^^^^^^^^^^^^^^^
+
+.. http:get:: /1///
+
+ Returns deposit's status.
+
+ The different statuses:
+
+ - **partial**: multipart deposit is still ongoing
+ - **deposited**: deposit completed, ready for checks
+ - **rejected**: deposit failed the checks
+ - **verified**: content and metadata verified, ready for loading
+ - **loading**: loading in-progress
+ - **done**: loading completed successfully
+ - **failed**: the deposit loading has failed
+
+ Also known as STATE-IRI
+
+ :param text : the client's credentials
+ :statuscode 201: with the deposit's status
+ :statuscode 401: Unauthorized
+ :statuscode 404: access to an unknown deposit
+
+
+Rejected deposit
+~~~~~~~~~~~~~~~~
+
+It can happen that a deposit is rejected. In that case, the
+`deposit_status_detail` entry will explain failed checks.
+
+Many reasons are possible; here are some:
+
+- Deposit without software archive (main goal of the deposit is to
+ deposit software source code)
+
+- Deposit with malformed software archive (i.e. archive within archive)
+
+- Deposit with invalid software archive (corrupted archive, although,
+ this one should happen during upload and not during checks)
+
+- Deposit with unsupported archive format
+
+- Deposit with missing metadata
+
+
+Sample response
+~~~~~~~~~~~~~~~
+
+ Successful deposit:
+
+ .. code:: xml
+
+
+ 160
+ done
+ The deposit has been successfully loaded into the Software Heritage archive
+ swh:1:dir:d83b7dda887dc790f7207608474650d4344b8df9
+ swh:1:dir:d83b7dda887dc790f7207608474650d4344b8df9;origin=https://forge.softwareheritage.org/source/jesuisgpl/;visit=swh:1:snp:68c0d26104d47e278dd6be07ed61fafb561d0d20;anchor=swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb;path=/
+
+
+ Rejected deposit:
+
+ .. code:: xml
+
+
+ 148
+ rejected
+ - At least one url field must be compatible with the client's domain name (codemeta:url)
+
diff --git a/docs/endpoints/update-media.rst b/docs/endpoints/update-media.rst
new file mode 100644
index 00000000..de32634c
--- /dev/null
+++ b/docs/endpoints/update-media.rst
@@ -0,0 +1,27 @@
+Update content
+^^^^^^^^^^^^^^^
+
+.. http:post:: /1///media/
+
+ Add archive(s) to a deposit. Only possible if the deposit's status
+ is partial.
+
+.. http:put:: /1///media/
+
+ Replace all content by submitting a new archive. Only possible if
+ the deposit's status is partial.
+
+
+ Also known as: *update iri* (EM-IRI)
+
+ :param text : the client's credentials
+ :param text Content-Type: accepted mimetype
+ :param int Content-Length: tarball size
+ :param text Content-MD5: md5 checksum hex encoded of the tarball
+ :param text Content-Disposition: attachment; filename=[filename] ; the filename
+ parameter must be text (ascii)
+ :param bool In-progress: true if not final; false when final request.
+ :statuscode 204: success without payload on PUT
+ :statuscode 201: success for deposit on POST
+ :statuscode 401: Unauthorized
+ :statuscode 415: unsupported media type
diff --git a/docs/endpoints/update-metadata.rst b/docs/endpoints/update-metadata.rst
new file mode 100644
index 00000000..661d7516
--- /dev/null
+++ b/docs/endpoints/update-metadata.rst
@@ -0,0 +1,24 @@
+Update metadata
+^^^^^^^^^^^^^^^^
+
+.. http:post:: /1///metadata/
+
+ Add metadata to a deposit. Only possible if the deposit's status
+ is partial.
+
+.. http:put:: /1///metadata/
+
+ Replace all metadata by submitting a new metadata file. Only possible if
+ the deposit's status is partial.
+
+
+ Also known as: *update iri* (SE-IRI)
+
+ :param text : the client's credentials
+ :param text Content-Disposition: attachment; filename=[filename] ; the filename
+ parameter must be text (ascii), with a name parameter set to 'atom'.
+ :param bool In-progress: true if not final; false when final request.
+ :statuscode 204: success without payload on PUT
+ :statuscode 201: success for deposit on POST
+ :statuscode 401: Unauthorized
+ :statuscode 415: unsupported media type
diff --git a/docs/getting-started.rst b/docs/getting-started.rst
new file mode 100644
index 00000000..6915e309
--- /dev/null
+++ b/docs/getting-started.rst
@@ -0,0 +1,284 @@
+Getting Started
+===============
+
+This is a guide for how to prepare and push a software deposit with
+the `swh deposit` commands.
+
+The API is rooted at https://deposit.softwareheritage.org/1.
+
+For more details, see the `main documentation <./index.html>`__.
+
+Requirements
+------------
+
+You need to be referenced on SWH's client list to have:
+
+* credentials (needed for the basic authentication step)
+
+  - in this document we reference ``<name>`` as the client's name and
+    ``<pass>`` as its associated authentication password.
+
+* an associated collection_.
+
+
+.. _collection: https://bitworking.org/projects/atom/rfc5023#rfc.section.8.3.3
+
+
+`Contact us for more information.
+<https://www.softwareheritage.org/contact/>`__
+
+Prepare a deposit
+-----------------
+* compress the files in a supported archive format:
+
+ - zip: common zip archive (no multi-disk zip files).
+ - tar: tar archive without compression or optionally any of the
+ following compression algorithm gzip (`.tar.gz`, `.tgz`), bzip2
+ (`.tar.bz2`) , or lzma (`.tar.lzma`)
+
+* (Optional) prepare a metadata file (more details :ref:`deposit-metadata`):
+
+
+Push deposit
+------------
+You can push a deposit with:
+
+* a single deposit (archive + metadata):
+
+ The user posts in one query a software
+ source code archive and associated metadata.
+ The deposit is directly marked with status ``deposited``.
+
+* a multisteps deposit:
+
+ 1. Create an incomplete deposit (marked with status ``partial``)
+ 2. Add data to a deposit (in multiple requests if needed)
+ 3. Finalize deposit (the status becomes ``deposited``)
+
+
+Single deposit
+^^^^^^^^^^^^^^
+
+
+Once the files are ready for deposit, we want to do the actual deposit
+in one shot, sending exactly one POST query:
+
+* 1 archive (content-type ``application/zip`` or ``application/x-tar``)
+* 1 metadata file in atom xml format (``content-type: application/atom+xml;type=entry``)
+
+For this, we need to provide the:
+
+* arguments: ``--username 'name' --password 'pass'`` as credentials
+* archive's path (example: ``--archive path/to/archive-name.tgz``)
+* software's name (optional if a metadata filepath is specified and the
+ artifact's name is included in the metadata file).
+* author's name (optional if a metadata filepath is specified and the authors
+ are included in the metadata file). This can be specified multiple times in
+ case of multiple authors.
+* (optionally) metadata file's path ``--metadata
+ path/to/file.metadata.xml``.
+* (optionally) ``--slug 'your-id'`` argument, a reference to a unique identifier
+ the client uses for the software object. If not provided, A UUID will be
+ generated by SWH.
+
+You can do this with the following command:
+
+minimal deposit
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --author "Jane Doe" \
+ --author "John Doe" \
+ --name 'je-suis-gpl' \
+ --archive je-suis-gpl.tgz
+
+with client's external identifier (``slug``)
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --author "Jane Doe" \
+ --name 'je-suis-gpl' \
+ --archive je-suis-gpl.tgz \
+ --slug je-suis-gpl
+
+to a specific client's collection
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --author "Jane Doe" \
+ --name 'je-suis-gpl' \
+ --archive je-suis-gpl.tgz \
+ --collection 'second-collection'
+
+
+You just posted a deposit to your collection on Software Heritage
+
+
+If everything went well, the successful response will contain the
+elements below:
+
+.. code:: shell
+
+ {
+ 'deposit_status': 'deposited',
+ 'deposit_id': '7',
+ 'deposit_date': 'Jan. 29, 2018, 12:29 p.m.'
+ }
+
+Note: As the deposit is in ``deposited`` status, you can no longer
+update the deposit after this query. It will be answered with a 403
+forbidden answer.
+
+If something went wrong, an equivalent response will be given with the
+`error` and `detail` keys explaining the issue, e.g.:
+
+.. code:: shell
+
+ {
+ 'error': 'Unknown collection name xyz',
+ 'detail': None,
+ 'deposit_status': None,
+ 'deposit_status_detail': None,
+ 'deposit_swh_id': None,
+ 'status': 404
+ }
+
+
+
+multisteps deposit
+^^^^^^^^^^^^^^^^^^^^^^^^^
+The steps to create a multisteps deposit:
+
+1. Create an incomplete deposit
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+First use the ``--partial`` argument to declare there is more to come
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --archive foo.tar.gz \
+ --partial
+
+
+2. Add content or metadata to the deposit
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Continue the deposit by using the ``--deposit-id`` argument given as a response
+for the first step. You can continue adding content or metadata while you use
+the ``--partial`` argument.
+
+To only add one new archive to the deposit:
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --archive add-foo.tar.gz \
+ --deposit-id 42 \
+ --partial
+
+To only add metadata to the deposit:
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --metadata add-foo.tar.gz.metadata.xml \
+ --deposit-id 42 \
+ --partial
+
+or:
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --name 'add-foo' --author 'someone' \
+ --deposit-id 42 \
+ --partial
+
+
+3. Finalize deposit
+~~~~~~~~~~~~~~~~~~~
+
+On your last addition (same command as before), by not declaring it
+``--partial``, the deposit will be considered completed. Its status will be
+changed to ``deposited``
+
+
+Update deposit
+----------------
+* replace deposit:
+
+ - only possible if the deposit status is ``partial`` and
+    ``--deposit-id <id>`` is provided
+
+ - by using the ``--replace`` flag
+
+ - ``--metadata-deposit`` replaces associated existing metadata
+ - ``--archive-deposit`` replaces associated archive(s)
+ - by default, with no flag or both, you'll replace associated
+ metadata and archive(s):
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --deposit-id 11 \
+ --archive updated-je-suis-gpl.tgz \
+ --replace
+
+* update a loaded deposit with a new version:
+
+ - by using the external-id with the ``--slug`` argument, you will
+ link the new deposit with its parent deposit:
+
+.. code:: shell
+
+ $ swh deposit upload --username name --password secret \
+ --archive je-suis-gpl-v2.tgz \
+ --slug 'je-suis-gpl' \
+
+
+
+Check the deposit's status
+--------------------------
+
+You can check the status of the deposit by using the ``--deposit-id`` argument:
+
+.. code:: shell
+
+ $ swh deposit status --username name --password secret \
+ --deposit-id 11
+
+.. code:: json
+
+ {
+ 'deposit_id': '11',
+ 'deposit_status': 'deposited',
+ 'deposit_swh_id': None,
+ 'deposit_status_detail': 'Deposit is ready for additional checks \
+ (tarball ok, metadata, etc...)'
+ }
+
+The different statuses:
+
+- **partial**: multipart deposit is still ongoing
+- **deposited**: deposit completed
+- **rejected**: deposit failed the checks
+- **verified**: content and metadata verified
+- **loading**: loading in-progress
+- **done**: loading completed successfully
+- **failed**: the deposit loading has failed
+
+When the deposit has been loaded into the archive, the status will be
+marked ``done``. The response will then also contain the
+``deposit_swh_id`` and ``deposit_swh_id_context`` fields. For example:
+
+.. code:: json
+
+ {
+ 'deposit_id': '11',
+ 'deposit_status': 'done',
+ 'deposit_swh_id': 'swh:1:dir:d83b7dda887dc790f7207608474650d4344b8df9',
+ 'deposit_swh_id_context': 'swh:1:dir:d83b7dda887dc790f7207608474650d4344b8df9;origin=https://forge.softwareheritage.org/source/jesuisgpl/;visit=swh:1:snp:68c0d26104d47e278dd6be07ed61fafb561d0d20;anchor=swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb;path=/',
+ 'deposit_status_detail': 'The deposit has been successfully \
+ loaded into the Software Heritage archive'
+ }
diff --git a/docs/images/deposit-create-chart.png b/docs/images/deposit-create-chart.png
new file mode 100644
index 00000000..97c4eb45
Binary files /dev/null and b/docs/images/deposit-create-chart.png differ
diff --git a/docs/images/deposit-delete-chart.png b/docs/images/deposit-delete-chart.png
new file mode 100644
index 00000000..d9ba8ec4
Binary files /dev/null and b/docs/images/deposit-delete-chart.png differ
diff --git a/docs/images/deposit-update-chart.png b/docs/images/deposit-update-chart.png
new file mode 100644
index 00000000..d84eb52f
Binary files /dev/null and b/docs/images/deposit-update-chart.png differ
diff --git a/docs/images/status.png b/docs/images/status.png
new file mode 100644
index 00000000..00137fff
Binary files /dev/null and b/docs/images/status.png differ
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 00000000..486a1e41
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,28 @@
+.. _swh-deposit:
+
+Software Heritage - Deposit
+===========================
+
+Push-based deposit of software source code artifacts to the archive.
+
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ getting-started
+ spec-api
+ metadata
+ dev-info
+ sys-info
+ specs/specs
+ tests/tests_HAL.rst
+
+
+Reference Documentation
+-----------------------
+
+.. toctree::
+ :maxdepth: 2
+
+ /apidoc/swh.deposit
diff --git a/docs/metadata.rst b/docs/metadata.rst
new file mode 100644
index 00000000..f8e1cda2
--- /dev/null
+++ b/docs/metadata.rst
@@ -0,0 +1,185 @@
+.. _deposit-metadata:
+
+Deposit metadata
+================
+
+When making a software deposit into the SWH archive, one can add
+information describing the software artifact and the software project.
+
+
+Metadata requirements
+---------------------
+
+- **the schema/vocabulary** used *MUST* be specified with a persistent url
+ (DublinCore, DOAP, CodeMeta, etc.)
+
+  .. code:: xml
+
+      <entry xmlns="http://www.w3.org/2005/Atom">
+  or
+      <entry xmlns:dcterms="http://purl.org/dc/terms/">
+  or
+      <entry xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0">
+
+- **the name** of the software deposit *MUST* be provided [atom:title,
+ codemeta:name, dcterms:title]
+
+- **the authors** of the software deposit *MUST* be provided
+
+- **the url** representing the location of the source *MAY* be provided under
+ the url tag. The url will be used for creating an origin object in the
+ archive.
+
+  .. code:: xml
+
+      <url>www.url-example.com</url>
+
+- **the external\_identifier** *MAY* be provided as an identifier
+
+- **the external\_identifier** *SHOULD* match the Slug external-identifier in
+ the header
+
+- **the description** of the software deposit *SHOULD* be provided
+ [codemeta:description]: short or long description of the software
+
+- **the license/s** of the software
+ deposit *SHOULD* be provided [codemeta:license]
+
+- other metadata *MAY* be added with terms defined by the schema in use.
+
+Examples
+--------
+
+Using only Atom
+~~~~~~~~~~~~~~~
+
+.. code:: xml
+
+    <?xml version="1.0"?>
+    <entry xmlns="http://www.w3.org/2005/Atom">
+        <title>Awesome Compiler</title>
+        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+        <external_identifier>1785io25c695</external_identifier>
+        <updated>2017-10-07T15:17:08Z</updated>
+        <author>some awesome author</author>
+    </entry>
+
+Using Atom with CodeMeta
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: xml
+
+    <?xml version="1.0"?>
+    <entry xmlns="http://www.w3.org/2005/Atom" xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0">
+        <title>Awesome Compiler</title>
+        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+        <external_identifier>1785io25c695</external_identifier>
+        <codemeta:id>1785io25c695</codemeta:id>
+        <codemeta:url>origin url</codemeta:url>
+        <codemeta:identifier>other identifier, DOI, ARK</codemeta:identifier>
+        <codemeta:applicationCategory>Domain</codemeta:applicationCategory>
+
+        <codemeta:description>description</codemeta:description>
+        <codemeta:keywords>key-word 1</codemeta:keywords>
+        <codemeta:keywords>key-word 2</codemeta:keywords>
+        <codemeta:dateCreated>creation date</codemeta:dateCreated>
+        <codemeta:datePublished>publication date</codemeta:datePublished>
+        <codemeta:releaseNotes>comment</codemeta:releaseNotes>
+        <codemeta:referencePublication>
+            <codemeta:name>article name</codemeta:name>
+            <codemeta:identifier>article id</codemeta:identifier>
+        </codemeta:referencePublication>
+        <codemeta:isPartOf>
+            <codemeta:type>Collaboration/Projet</codemeta:type>
+            <codemeta:name>project name</codemeta:name>
+            <codemeta:identifier>id</codemeta:identifier>
+        </codemeta:isPartOf>
+        <codemeta:relatedLink>see also</codemeta:relatedLink>
+        <codemeta:funding>Sponsor A</codemeta:funding>
+        <codemeta:funding>Sponsor B</codemeta:funding>
+        <codemeta:operatingSystem>Platform/OS</codemeta:operatingSystem>
+        <codemeta:softwareRequirements>dependencies</codemeta:softwareRequirements>
+        <codemeta:softwareVersion>Version</codemeta:softwareVersion>
+        <codemeta:developmentStatus>active</codemeta:developmentStatus>
+        <codemeta:license>
+            <codemeta:name>license</codemeta:name>
+            <codemeta:url>url spdx</codemeta:url>
+        </codemeta:license>
+        <codemeta:runtimePlatform>.Net Framework 3.0</codemeta:runtimePlatform>
+        <codemeta:runtimePlatform>Python2.3</codemeta:runtimePlatform>
+        <codemeta:author>
+            <codemeta:name>author1</codemeta:name>
+            <codemeta:affiliation>Inria</codemeta:affiliation>
+            <codemeta:affiliation>UPMC</codemeta:affiliation>
+        </codemeta:author>
+        <codemeta:author>
+            <codemeta:name>author2</codemeta:name>
+            <codemeta:affiliation>Inria</codemeta:affiliation>
+            <codemeta:affiliation>UPMC</codemeta:affiliation>
+        </codemeta:author>
+        <codemeta:codeRepository>http://code.com</codemeta:codeRepository>
+        <codemeta:programmingLanguage>language 1</codemeta:programmingLanguage>
+        <codemeta:programmingLanguage>language 2</codemeta:programmingLanguage>
+        <codemeta:issueTracker>http://issuetracker.com</codemeta:issueTracker>
+    </entry>
+
+Using Atom with DublinCore and CodeMeta (multi-schema entry)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: xml
+
+    <?xml version="1.0"?>
+    <entry xmlns="http://www.w3.org/2005/Atom" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0">
+        <title>Awesome Compiler</title>
+        <client>hal</client>
+        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+        <external_identifier>%s</external_identifier>
+        <dcterms:identifier>hal-01587361</dcterms:identifier>
+        <dcterms:identifier>doi:10.5281/zenodo.438684</dcterms:identifier>
+        <dcterms:title xml:lang="en">The assignment problem</dcterms:title>
+        <dcterms:title xml:lang="fr">AffectationRO</dcterms:title>
+        <dcterms:creator>author</dcterms:creator>
+        <dcterms:subject>[INFO] Computer Science [cs]</dcterms:subject>
+        <dcterms:subject>[INFO.INFO-RO] Computer Science [cs]/Operations Research [cs.RO]</dcterms:subject>
+        <dcterms:type>SOFTWARE</dcterms:type>
+        <dcterms:abstract xml:lang="en">Project in OR: The assignment problemA java implementation for the assignment problem first release</dcterms:abstract>
+        <dcterms:abstract xml:lang="fr">description fr</dcterms:abstract>
+        <dcterms:created>2015-06-01</dcterms:created>
+        <dcterms:available>2017-10-19</dcterms:available>
+        <dcterms:language>en</dcterms:language>
+
+
+        <codemeta:url>origin url</codemeta:url>
+
+        <codemeta:softwareVersion>1.0.0</codemeta:softwareVersion>
+        <codemeta:keywords>key word</codemeta:keywords>
+        <codemeta:releaseNotes>Comment</codemeta:releaseNotes>
+        <codemeta:referencePublication>Référence interne</codemeta:referencePublication>
+
+        <codemeta:relatedLink>link</codemeta:relatedLink>
+        <codemeta:funding>Sponsor</codemeta:funding>
+
+        <codemeta:operatingSystem>Platform/OS</codemeta:operatingSystem>
+        <codemeta:softwareRequirements>dependencies</codemeta:softwareRequirements>
+        <codemeta:developmentStatus>Ended</codemeta:developmentStatus>
+        <codemeta:license>
+            <codemeta:name>license</codemeta:name>
+            <codemeta:url>url spdx</codemeta:url>
+        </codemeta:license>
+
+        <codemeta:codeRepository>http://code.com</codemeta:codeRepository>
+        <codemeta:programmingLanguage>language 1</codemeta:programmingLanguage>
+        <codemeta:programmingLanguage>language 2</codemeta:programmingLanguage>
+    </entry>
+
+Note
+----
+We aim at harmonizing the metadata from different origins and thus
+metadata will be translated to the `CodeMeta
+v.2 <https://doi.org/10.5063/schema/codemeta-2.0>`__ vocabulary if
+possible.
diff --git a/docs/spec-api.rst b/docs/spec-api.rst
new file mode 100644
index 00000000..4a6b3cc2
--- /dev/null
+++ b/docs/spec-api.rst
@@ -0,0 +1,112 @@
+API Specification
+=================
+
+This is `Software Heritage <https://www.softwareheritage.org>`__'s
+`SWORD
+2.0 <http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html>`__
+Server implementation.
+
+**S.W.O.R.D** (**S**\ imple **W**\ eb-Service **O**\ ffering
+**R**\ epository **D**\ eposit) is an interoperability standard for
+digital file deposit.
+
+This implementation will permit interaction between a client (a repository) and
+a server (SWH repository) to push deposits of software source code archives
+with associated metadata.
+
+*Note:*
+
+* In the following document, we will use the ``archive`` or ``software source
+ code archive`` interchangeably.
+* The supported archive formats are:
+
+ * zip: common zip archive (no multi-disk zip files).
+ * tar: tar archive without compression or optionally any of the following
+ compression algorithm gzip (.tar.gz, .tgz), bzip2 (.tar.bz2) , or lzma
+ (.tar.lzma)
+
+Collection
+----------
+
+SWORD defines a ``collection`` concept. In SWH's case, this collection
+refers to a group of deposits. A ``deposit`` is some form of software
+source code archive(s) associated with metadata.
+By default the client's collection will have the client's name.
+
+Limitations
+-----------
+* upload limitation of 100MiB
+* no mediation
+
+API overview
+------------
+
+API access is over HTTPS.
+
+The API is protected through basic authentication.
+
+
+Endpoints
+---------
+
+The API endpoints are rooted at https://deposit.softwareheritage.org/1/.
+
+Data is sent and received as XML (as specified in the SWORD 2.0
+specification).
+
+.. include:: endpoints/service-document.rst
+
+.. include:: endpoints/collection.rst
+
+.. include:: endpoints/update-media.rst
+
+.. include:: endpoints/update-metadata.rst
+
+.. include:: endpoints/status.rst
+
+.. include:: endpoints/content.rst
+
+
+Possible errors:
+----------------
+
+* common errors:
+
+  * 401 (unauthenticated) if a client does not provide credentials or provides
+    wrong ones
+  * 403 (forbidden) if a client tries to access a collection it does not own
+  * 404 (not found) if a client tries to access an unknown collection
+  * 404 (not found) if a client tries to access an unknown deposit
+  * 415 (unsupported media type) if a wrong media type is provided to the
+    endpoint
+
+* archive/binary deposit:
+
+ * 403 (forbidden) if the length of the archive exceeds the max size
+ configured
+ * 412 (precondition failed) if the length or hash provided mismatch the
+ reality of the archive.
+ * 415 (unsupported media type) if a wrong media type is provided
+
+* multipart deposit:
+
+ * 412 (precondition failed) if the md5 hash provided mismatch the reality of
+ the archive
+ * 415 (unsupported media type) if a wrong media type is provided
+
+* Atom entry deposit:
+
+ * 400 (bad request) if the request's body is empty (for creation only)
+
+
+
+
+Sources
+-------
+
+* `SWORD v2 specification
+  <http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html>`__
+* `arxiv documentation <https://arxiv.org/help/submit_sword>`__
+* `Dataverse example <http://guides.dataverse.org/en/4.3/api/sword.html>`__
+* `SWORD used on HAL <https://api.archives-ouvertes.fr/docs/sword>`__
+* `xml examples for CCSD <https://github.com/CCSDForge/HAL/tree/master/Sword>`__
diff --git a/docs/specs/blueprint.rst b/docs/specs/blueprint.rst
new file mode 100644
index 00000000..fb762efd
--- /dev/null
+++ b/docs/specs/blueprint.rst
@@ -0,0 +1,114 @@
+Use cases
+---------
+
+
+Deposit creation
+~~~~~~~~~~~~~~~~
+
+From client's deposit repository server to SWH's repository server:
+
+1. The client requests for the server's abilities and its associated collection
+ (GET query to the *SD/service document uri*)
+
+2. The server answers the client with the service document which gives the
+ *collection uri* (also known as *COL/collection IRI*).
+
+3. The client sends a deposit (optionally a zip archive, some metadata or both)
+ through the *collection uri*.
+
+ This can be done in:
+
+ * one POST request (metadata + archive).
+ * one POST request (metadata or archive) + other PUT or POST request to the
+ *update uris* (*edit-media iri* or *edit iri*)
+
+ a. Server validates the client's input or returns detailed error if any
+
+ b. Server stores information received (metadata or software archive source
+ code or both)
+
+4. The server notifies the client it acknowledged the client's request. An
+ ``http 201 Created`` response with a deposit receipt in the body response is
+ sent back. That deposit receipt will hold the necessary information to
+ eventually complete the deposit later on if it was incomplete (also known as
+ status ``partial``).
+
+Schema representation
+^^^^^^^^^^^^^^^^^^^^^
+
+.. raw:: html
+
+
+
+.. figure:: ../images/deposit-create-chart.png
+ :alt:
+
+
+Updating an existing deposit
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+5. Client updates existing deposit through the *update uris* (one or more POST
+ or PUT requests to either the *edit-media iri* or *edit iri*).
+
+ 1. Server validates the client's input or returns detailed error if any
+
+ 2. Server stores information received (metadata or software archive source
+ code or both)
+
+ This would be the case for example if the client initially posted a
+ ``partial`` deposit (e.g. only metadata with no archive, or an archive
+ without metadata, or a split archive because the initial one exceeded
+ the limit size imposed by swh repository deposit)
+
+Schema representation
+^^^^^^^^^^^^^^^^^^^^^
+
+.. raw:: html
+
+
+
+.. figure:: ../images/deposit-update-chart.png
+ :alt:
+
+Deleting deposit (or associated archive, or associated metadata)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+6. Deposit deletion is possible as long as the deposit is still in ``partial``
+ state.
+
+ 1. Server validates the client's input or returns detailed error if any
+ 2. Server actually delete information according to request
+
+Schema representation
+^^^^^^^^^^^^^^^^^^^^^
+
+.. raw:: html
+
+
+
+.. figure:: ../images/deposit-delete-chart.png
+ :alt:
+
+Client asks for operation status
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+7. Operation status can be read through a GET query to the *state iri*.
+
+Server: Triggering deposit checks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Once the status ``deposited`` is reached for a deposit, checks for the
+associated archive(s) and metadata will be triggered. If those checks
+fail, the status is changed to ``rejected`` and nothing more happens
+there. Otherwise, the status is changed to ``verified``.
+
+Server: Triggering deposit load
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Once the status ``verified`` is reached for a deposit, loading the
+deposit with its associated metadata will be triggered.
+
+The loading will result on status update, either ``done`` or ``failed``
+(depending on the loading's status).
+
+This is described in the `loading document <./spec-loading.html>`__.
diff --git a/docs/specs/metadata_example.xml b/docs/specs/metadata_example.xml
new file mode 100644
index 00000000..e30cf4b3
--- /dev/null
+++ b/docs/specs/metadata_example.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom" xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit">
+    <author>
+        <name>HAL</name>
+        <email>hal@ccsd.cnrs.fr</email>
+    </author>
+    <client>hal</client>
+    <external_identifier>hal-01243573</external_identifier>
+    <codemeta:name>The assignment problem</codemeta:name>
+    <codemeta:url>https://hal.archives-ouvertes.fr/hal-01243573</codemeta:url>
+    <codemeta:identifier>other identifier, DOI, ARK</codemeta:identifier>
+    <codemeta:applicationCategory>Domain</codemeta:applicationCategory>
+    <codemeta:description>description</codemeta:description>
+    <codemeta:author>
+        <codemeta:name>author1</codemeta:name>
+        <codemeta:affiliation>Inria</codemeta:affiliation>
+        <codemeta:affiliation>UPMC</codemeta:affiliation>
+    </codemeta:author>
+    <codemeta:author>
+        <codemeta:name>author2</codemeta:name>
+        <codemeta:affiliation>Inria</codemeta:affiliation>
+        <codemeta:affiliation>UPMC</codemeta:affiliation>
+    </codemeta:author>
+    <swh:deposit>
+        <swh:reference>
+            <swh:origin url="https://hal.archives-ouvertes.fr/hal-01243573"/>
+        </swh:reference>
+    </swh:deposit>
+</entry>
diff --git a/docs/specs/spec-loading.rst b/docs/specs/spec-loading.rst
new file mode 100644
index 00000000..f8aaab0a
--- /dev/null
+++ b/docs/specs/spec-loading.rst
@@ -0,0 +1,450 @@
+Loading specification
+=====================
+
+An important part of the deposit specifications is the loading procedure where
+a deposit is ingested into the Software Heritage (archive), using
+the tarball loader and the complete process of software artifacts creation
+in the archive.
+
+Tarball Loading
+---------------
+
+The ``swh-loader-tar`` module is already able to inject tarballs in swh
+with very limited metadata (mainly the origin).
+
+The loading of the deposit will use the deposit's associated data:
+
+* the metadata
+* the archive(s)
+
+
+Artifacts creation
+------------------
+
+Deposit to artifacts mapping
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is a global view of the deposit ingestion
+
++------------------------------------+-----------------------------------------+
+| swh artifact | representation in deposit |
++====================================+=========================================+
+| origin | https://hal.inria.fr/hal-id |
++------------------------------------+-----------------------------------------+
+| origin_metadata | aggregated metadata |
++------------------------------------+-----------------------------------------+
+| snapshot | reception of all occurrences (branches) |
++------------------------------------+-----------------------------------------+
+| branches | master & |
+| | branch (optional): tag to release |
++------------------------------------+-----------------------------------------+
+| release | (optional) synthetic release created |
+| | from metadata |
++------------------------------------+-----------------------------------------+
+| revision | synthetic revision pointing to |
+| | the expanded submitted tarball |
++------------------------------------+-----------------------------------------+
+| directory | root directory of the expanded submitted|
+| | tarball |
++------------------------------------+-----------------------------------------+
+
+
+Origin artifact
+~~~~~~~~~~~~~~~
+
+We create an origin URL by concatenating the client URI and the value of the
+Slug header of the initial POST request of the deposit.
+
+.. code-block:: json
+
+ {
+ "origin": {
+ "id": 89283768,
+ "origin_visits_url": "/api/1/origin/89283768/visits/",
+ "type": "deposit",
+ "url": "https://hal.archives-ouvertes.fr/hal-02140606"
+ }
+ }
+
+Visits
+~~~~~~
+
+We identify with a visit each deposit push of the same external_id.
+Here in the example below, two snapshots are identified by two different visits.
+
+.. code-block:: json
+
+ {
+ "visits": [
+ {
+ "date": "2019-06-03T09:28:10.223007+00:00",
+ "origin": 89283768,
+ "origin_visit_url": "/api/1/origin/89283768/visit/2/",
+ "snapshot": "a3773941561cc557853898773a19c07cfe2efc5a",
+ "snapshot_url": "/api/1/snapshot/a3773941561cc557853898773a19c07cfe2efc5a/",
+ "status": "full",
+ "type": "deposit",
+ "visit": 2
+ },
+ {
+ "date": "2019-05-27T12:23:31.037273+00:00",
+ "origin": 89283768,
+ "origin_visit_url": "/api/1/origin/89283768/visit/1/",
+ "snapshot": "43fdb8291f1bf6962211c370e394f6abb1cbe01d",
+ "snapshot_url": "/api/1/snapshot/43fdb8291f1bf6962211c370e394f6abb1cbe01d/",
+ "status": "full",
+ "type": "deposit",
+ "visit": 1
+ }
+ ]
+ }
+
+Snapshot artifact
+~~~~~~~~~~~~~~~~~
+
+The snapshot represents one deposit push. The ``HEAD`` branch points to a
+synthetic revision.
+
+ .. code-block:: json
+
+ {
+ "snapshot": {
+ "branches": {
+ "HEAD": {
+ "target": "396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52",
+ "target_type": "revision",
+ "target_url": "/api/1/revision/396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52/"
+ }
+ },
+ "id": "a3773941561cc557853898773a19c07cfe2efc5a",
+ "next_branch": null
+ }
+ }
+
+Note that previous versions of the deposit-loader named the branch ``master``
+instead, and created release branches under certain conditions.
+
+Release artifact
+~~~~~~~~~~~~~~~~
+
+.. warning::
+
+ This part of the specification is not implemented yet, only releases are
+ currently being created.
+
+The content is deposited with a set of descriptive metadata in the CodeMeta
+vocabulary. The following CodeMeta terms imply that the
+artifact is a release:
+
+- `releaseNotes`
+- `softwareVersion`
+
+If present, a release artifact will be created with the mapping below:
+
++-------------------+-----------------------------------+-----------------+----------------+
+| SWH release field | Description | CodeMeta term | Fallback value |
++===================+===================================+=================+================+
+| target | revision containing all metadata | X |X |
++-------------------+-----------------------------------+-----------------+----------------+
+| target_type | revision | X |X |
++-------------------+-----------------------------------+-----------------+----------------+
+| name | release or tag name (mandatory) | softwareVersion | X |
++-------------------+-----------------------------------+-----------------+----------------+
+| message | message associated with release | releaseNotes | X |
++-------------------+-----------------------------------+-----------------+----------------+
+| date | release date = publication date | datePublished | deposit_date |
++-------------------+-----------------------------------+-----------------+----------------+
+| author | deposit client | author | client |
++-------------------+-----------------------------------+-----------------+----------------+
+
+
+.. code-block:: json
+
+ {
+ "release": {
+ "author": {
+ "email": "hal@ccsd.cnrs.fr",
+ "fullname": "HAL ",
+ "name": "HAL"
+ },
+ "author_url": "/api/1/person/x/",
+ "date": "2019-05-27T16:28:33+02:00",
+ "id": "a9f3396f372ed4a51d75e15ca16c1c2df1fc5c97",
+ "message": "AffectationRO Version 1.1 - added new feature\n",
+ "name": "1.1",
+ "synthetic": true,
+ "target": "396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52",
+ "target_type": "revision",
+ "target_url": "/api/1/revision/396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52/"
+ }
+ }
+
+
+Revision artifact
+~~~~~~~~~~~~~~~~~
+
+The metadata sent with the deposit is stored outside the revision,
+and does not affect the hash computation.
+It contains the same fields as any revision object; in particular:
+
++-------------------+-----------------------------------------+
+| SWH revision field| Description |
++===================+=========================================+
+| message | synthetic message, containing the name |
+| | of the deposit client and an internal |
+| | identifier of the deposit. For example: |
+| | ``hal: Deposit 817 in collection hal`` |
++-------------------+-----------------------------------------+
+| author | synthetic author (SWH itself, for now) |
++-------------------+-----------------------------------------+
+| committer | same as the author (for now) |
++-------------------+-----------------------------------------+
+| date | see below |
++-------------------+-----------------------------------------+
+| committer_date | see below |
++-------------------+-----------------------------------------+
+
+The date mapping
+^^^^^^^^^^^^^^^^
+
+A deposit may contain 4 different dates concerning the software artifacts.
+
+The deposit's revision will reflect the most accurate point in time available.
+Here are all dates that can be available in a deposit:
+
++----------------+---------------------------------+------------------------------------------------+
+| dates | location | Description |
++================+=================================+================================================+
+| reception_date | On SWORD reception (automatic) | the deposit was received at this ts |
++----------------+---------------------------------+------------------------------------------------+
+| complete_date | On SWH ingestion (automatic) | the ingestion was completed by SWH at this ts |
++----------------+---------------------------------+------------------------------------------------+
+| dateCreated | metadata in codeMeta (optional) | the software artifact was created at this ts |
++----------------+---------------------------------+------------------------------------------------+
+| datePublished | metadata in codeMeta (optional) | the software was published (contributed in HAL)|
++----------------+---------------------------------+------------------------------------------------+
+
+A visit targeting a snapshot contains one date:
+
++-------------------+----------------------------------------------+----------------+
+| SWH visit field | Description | value |
++===================+==============================================+================+
+| date | the origin pushed the deposit at this date | reception_date |
++-------------------+----------------------------------------------+----------------+
+
+A revision contains two dates:
+
++-------------------+-----------------------------------------+----------------+----------------+
+| SWH revision field| Description | CodeMeta term | Fallback value |
++===================+=========================================+================+================+
+| date | date of software artifact modification | dateCreated | reception_date |
++-------------------+-----------------------------------------+----------------+----------------+
+| committer_date | date of the commit in VCS | datePublished | reception_date |
++-------------------+-----------------------------------------+----------------+----------------+
+
+
+A release contains one date:
+
++-------------------+----------------------------------+----------------+-----------------+
+| SWH release field |Description | CodeMeta term | Fallback value |
++===================+==================================+================+=================+
+| date |release date = publication date | datePublished | reception_date |
++-------------------+----------------------------------+----------------+-----------------+
+
+
+.. code-block:: json
+
+ {
+ "revision": {
+ "author": {
+ "email": "robot@softwareheritage.org",
+ "fullname": "Software Heritage",
+ "id": 18233048,
+ "name": "Software Heritage"
+ },
+ "author_url": "/api/1/person/18233048/",
+ "committer": {
+ "email": "robot@softwareheritage.org",
+ "fullname": "Software Heritage",
+ "id": 18233048,
+ "name": "Software Heritage"
+ },
+ "committer_date": "2019-05-27T16:28:33+02:00",
+ "committer_url": "/api/1/person/18233048/",
+ "date": "2012-01-01T00:00:00+00:00",
+ "directory": "fb13b51abbcfd13de85d9ba8d070a23679576cd7",
+ "directory_url": "/api/1/directory/fb13b51abbcfd13de85d9ba8d070a23679576cd7/",
+ "history_url": "/api/1/revision/396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52/log/",
+ "id": "396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52",
+ "merge": false,
+ "message": "hal: Deposit 282 in collection hal",
+ "metadata": {
+ "@xmlns": "http://www.w3.org/2005/Atom",
+ "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+ "author": {
+ "email": "hal@ccsd.cnrs.fr",
+ "name": "HAL"
+ },
+ "client": "hal",
+ "codemeta:applicationCategory": "info",
+ "codemeta:author": {
+ "codemeta:name": "Morane Gruenpeter"
+ },
+ "codemeta:codeRepository": "www.code-repository.com",
+ "codemeta:contributor": "Morane Gruenpeter",
+ "codemeta:dateCreated": "2012",
+ "codemeta:datePublished": "2019-05-27T16:28:33+02:00",
+ "codemeta:description": "description\\_en test v2",
+ "codemeta:developmentStatus": "Inactif",
+ "codemeta:keywords": "mot_cle_en,mot_cle_2_en,mot_cle_fr",
+ "codemeta:license": [
+ {
+ "codemeta:name": "MIT License"
+ },
+ {
+ "codemeta:name": "CeCILL Free Software License Agreement v1.1"
+ }
+ ],
+ "codemeta:name": "Test\\_20190527\\_01",
+ "codemeta:operatingSystem": "OS",
+ "codemeta:programmingLanguage": "Java",
+ "codemeta:referencePublication": null,
+ "codemeta:relatedLink": null,
+ "codemeta:releaseNotes": "releaseNote",
+ "codemeta:runtimePlatform": "outil",
+ "codemeta:softwareVersion": "1.0.1",
+ "codemeta:url": "https://hal.archives-ouvertes.fr/hal-02140606",
+ "codemeta:version": "2",
+ "external_identifier": "hal-02140606",
+ "id": "hal-02140606",
+ "original_artifact": [
+ {
+ "archive_type": "zip",
+ "blake2s256": "96be3ddedfcee9669ad9c42b0bb3a706daf23824d04311c63505a4d8db02df00",
+ "length": 193072,
+ "name": "archive.zip",
+ "sha1": "5b6ecc9d5bb113ff69fc275dcc9b0d993a8194f1",
+ "sha1_git": "bd10e4d3ede17162692d7e211e08e87e67994488",
+ "sha256": "3e2ce93384251ce6d6da7b8f2a061a8ebdaf8a28b8d8513223ca79ded8a10948"
+ }
+ ]
+ },
+ "parents": [
+ {
+ "id": "a9fdc3937d2b704b915852a64de2ab1b4b481003",
+ "url": "/api/1/revision/a9fdc3937d2b704b915852a64de2ab1b4b481003/"
+ }
+ ],
+ "synthetic": true,
+ "type": "tar",
+ "url": "/api/1/revision/396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52/"
+ }
+ }
+
+Directory artifact
+~~~~~~~~~~~~~~~~~~
+
+The directory artifact is the archive(s)' raw content deposited.
+
+.. code-block:: json
+
+ {
+ "directory": [
+ {
+ "dir_id": "fb13b51abbcfd13de85d9ba8d070a23679576cd7",
+ "length": null,
+ "name": "AffectationRO",
+ "perms": 16384,
+ "target": "fbc418f9ac2c39e8566b04da5dc24b14e65b23b1",
+ "target_url": "/api/1/directory/fbc418f9ac2c39e8566b04da5dc24b14e65b23b1/",
+ "type": "dir"
+ }
+ ]
+ }
+
+
+Questions raised concerning loading
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- A deposit has one origin, yet an origin can have multiple deposits?
+
+No, an origin can have multiple requests for the same deposit. Which
+should end up in one single deposit (when the client pushes its final
+request saying deposit 'done' through the header In-Progress).
+
+Only update of existing 'partial' deposit is permitted. Other than that,
+there is no deposit 'update' operation.
+
+To create a new version of a software (already deposited), the client
+must prior to this create a new deposit.
+
+Illustration First deposit loading:
+
+HAL's deposit 01535619 = SWH's deposit **01535619-1**
+
+::
+
+ + 1 origin with url:https://hal.inria.fr/medihal-01535619
+
+ + 1 synthetic revision
+
+ + 1 directory
+
+HAL's update on deposit 01535619 = SWH's deposit **01535619-2**
+
+(\*with HAL updates can only be on the metadata and a new version is
+required if the content changes)
+
+::
+
+ + 1 origin with url:https://hal.inria.fr/medihal-01535619
+
+ + new synthetic revision (with new metadata)
+
+ + same directory
+
+HAL's deposit 01535619-v2 = SWH's deposit **01535619-v2-1**
+
+::
+
+ + same origin
+
+ + new revision
+
+ + new directory
+
+
+Scheduling loading
+~~~~~~~~~~~~~~~~~~
+
+All ``archive`` and ``metadata`` deposit requests should be aggregated before
+loading.
+
+The loading should be scheduled via the scheduler's api.
+
+Only ``deposited`` deposit are concerned by the loading.
+
+When the loading is done and successful, the deposit entry is updated:
+
+ - ``status`` is updated to ``done``
+  - ``swh-id`` is populated with the resulting :ref:`SWHID
+    <persistent-identifiers>`
+ - ``complete_date`` is updated to the loading's finished time
+
+When the loading has failed, the deposit entry is updated:
+
+  - ``status`` is updated to ``failed``
+  - ``swh-id`` and ``complete_date`` remain as is
+
+*Note:* As a further improvement, we may prefer having a retry policy with
+graceful delays for further scheduling.
+
+Metadata loading
+~~~~~~~~~~~~~~~~
+
+- the metadata received with the deposit are kept in a dedicated table
+ ``raw_extrinsic_metadata``, distinct from the ``revision`` and ``origin``
+ tables.
+
+- ``authority`` is computed from the deposit client information, and ``fetcher``
+ is the deposit loader.
+
diff --git a/docs/specs/spec-meta-deposit.rst b/docs/specs/spec-meta-deposit.rst
new file mode 100644
index 00000000..6ef3c30a
--- /dev/null
+++ b/docs/specs/spec-meta-deposit.rst
@@ -0,0 +1,118 @@
+The metadata-deposit
+====================
+
+Goal
+----
+A client wishes to deposit only metadata about an origin or object in the
+Software Heritage archive.
+
+The metadata-deposit is a special deposit where no content is
+provided and the data transferred to Software Heritage is only
+the metadata about an object in the archive.
+
+Requirements
+------------
+The scope of the metadata-deposit is different from that of the
+sparse-deposit. While a sparse-deposit creates a revision with referenced
+directories and content files, the metadata-deposit references any of the
+following:
+
+- origin
+- snapshot
+- release
+- revision
+- directory
+- content
+
+
+A complete metadata example
+---------------------------
+The reference element is included in the metadata xml atomEntry under the
+swh namespace:
+
+TODO: publish schema at https://www.softwareheritage.org/schema/2018/deposit
+
+.. code:: xml
+
+
+
+
+ HAL
+ hal@ccsd.cnrs.fr
+
+ hal
+ hal-01243573
+ The assignment problem
+ https://hal.archives-ouvertes.fr/hal-01243573
+ other identifier, DOI, ARK
+ Domain
+ description
+
+ author1
+ Inria
+ UPMC
+
+
+ author2
+ Inria
+ UPMC
+
+
+
+
+
+
+
+
+References
+^^^^^^^^^^
+
+Origins
+=======
+
+The metadata may be on an origin, identified by the origin's URL:
+
+.. code:: xml
+
+
+
+
+
+
+
+Graph objects
+=============
+
+It may also reference an object in the SWH graph: contents,
+directories, revisions, releases, and snapshots:
+
+.. code:: xml
+
+
+
+
+
+
+
+The value of the ``swhid`` attribute must be a SWHID,
+with any context qualifiers in this list:
+
+* ``origin``
+* ``visit``
+* ``anchor``
+* ``path``
+
+and they should be provided whenever relevant, especially ``origin``.
+
+Other qualifiers are not allowed (for example, ``line`` isn't because SWH
+cannot store metadata at a finer level than entire contents).
+
+
+Loading procedure
+------------------
+
+In this case, the metadata-deposit will be injected as a metadata entry of
+the relevant object, with the information about the contributor of the deposit.
+Contrary to the complete and sparse deposit, there will be no object creation.
diff --git a/docs/specs/spec-sparse-deposit.rst b/docs/specs/spec-sparse-deposit.rst
new file mode 100644
index 00000000..461694fa
--- /dev/null
+++ b/docs/specs/spec-sparse-deposit.rst
@@ -0,0 +1,102 @@
+The sparse-deposit
+==================
+
+Goal
+----
+A client wishes to transfer a tarball for which part of the content is
+already in the SWH archive.
+
+Requirements
+------------
+To do so, a list of paths with targets must be provided in the metadata and
+the paths to the missing directories/content should not be included
+in the tarball. The list will be referred to
+as the manifest list using the entry name 'bindings' in the metadata.
+
++----------------------+-------------------------------------+
+| path | swh-id |
++======================+=====================================+
+| path/to/file.txt | swh:1:cnt:aaaaaaaaaaaaaaaaaaaaa... |
++----------------------+-------------------------------------+
+| path/to/dir/ | swh:1:dir:aaaaaaaaaaaaaaaaaaaaa... |
++----------------------+-------------------------------------+
+
+Note: the *name* of the file or the directory is given by the path and is not
+part of the identified object.
+
+TODO: see if a trailing "/" is mandatory for implementation.
+
+A concrete example
+------------------
+The manifest list is included in the metadata xml atomEntry under the
+swh namespace:
+
+TODO: publish schema at https://www.softwareheritage.org/schema/2018/deposit
+
+.. code:: xml
+
+
+
+
+        HAL
+ hal@ccsd.cnrs.fr
+
+ hal
+ hal-01243573
+ The assignment problem
+ https://hal.archives-ouvertes.fr/hal-01243573
+ other identifier, DOI, ARK
+ Domain
+ description
+
+ author1
+ Inria
+ UPMC
+
+
+ author2
+ Inria
+ UPMC
+
+
+
+
+
+
+
+
+
+
+
+
+Deposit verification
+--------------------
+
+After checking the integrity of the deposit content and
+metadata, the following checks should be added:
+
+1. validate the manifest list structure with a correct swh-id for each path (syntax check on the swh-id format)
+2. verify that the path name corresponds to the object type
+3. locate the identifiers in the SWH archive
+
+Each failing check should return a different error with the deposit
+and result in a 'rejected' deposit.
+
+Loading procedure
+------------------
+The injection procedure should include:
+
+- load the tarball new data
+- create new objects using the path name and create links from the path to the
+ SWH object using the identifier
+- calculate identifier of the new objects at each level
+- return final swh-id of the new revision
+
+Invariant: the same content should yield the same swh-id,
+which is why a complete deposit with all the content and
+a sparse-deposit with the correct links will result
+in the same root directory swh-id.
+The same is expected with the revision swh-id if the metadata provided is
+identical.
diff --git a/docs/specs/spec-technical.rst b/docs/specs/spec-technical.rst
new file mode 100644
index 00000000..d1111b8a
--- /dev/null
+++ b/docs/specs/spec-technical.rst
@@ -0,0 +1,100 @@
+Technical specifications
+========================
+
+Requirements
+~~~~~~~~~~~~
+
+* one dedicated database to store the deposit's state - swh-deposit
+* one dedicated temporary storage to store archives before loading
+* one client to test the communication with SWORD protocol
+
+Deposit reception schema
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+* SWORD imposes the use of basic authentication, so we need a way to
+ authenticate client. Also, a client can access collections:
+
+ **deposit\_client** table:
+
+ - id (bigint): Client's identifier
+ - username (str): Client's username
+ - password (pass): Client's encrypted password
+ - collections ([id]): List of collections the client can access
+
+* Collections group deposits together:
+
+ **deposit\_collection** table:
+
+ - id (bigint): Collection's identifier
+ - name (str): Collection's human readable name
+
+* A deposit is the main object the repository is all about:
+
+ **deposit** table:
+
+ - id (bigint): deposit's identifier
+ - reception\_date (date): First deposit's reception date
+ - complete\_data (date): Date when the deposit is deemed complete and ready
+ for loading
+ - collection (id): The collection the deposit belongs to
+  - external id (text): client's internal identifier (e.g. hal's id, etc...).
+ - client\_id (id) : Client which did the deposit
+ - swh\_id (str) : swh identifier result once the loading is complete
+ - status (enum): The deposit's current status
+
+- As mentioned, a deposit can have a status, whose possible values are:
+
+ .. code:: text
+
+ 'partial', -- the deposit is new or partially received since it
+ -- can be done in multiple requests
+ 'expired', -- deposit has been there too long and is now deemed
+ -- ready to be garbage collected
+        'deposited', -- deposit complete, it is ready to be checked to ensure data consistency
+ 'verified', -- deposit is fully received, checked, and ready for loading
+ 'loading', -- loading is ongoing on swh's side
+ 'done', -- loading is successful
+ 'failed' -- loading is a failure
+
+* A deposit is stateful and can be made in multiple requests:
+
+ **deposit\_request** table:
+
+ - id (bigint): identifier
+ - type (id): deposit request's type (possible values: 'archive', 'metadata')
+  - deposit\_id (id): deposit to which the request belongs
+  - metadata: metadata associated with the request
+  - date (date): date of the request
+
+ Information sent along a request are stored in a ``deposit_request`` row.
+
+ They can be either of type ``metadata`` (atom entry, multipart's atom entry
+ part) or of type ``archive`` (binary upload, multipart's binary upload part).
+
+ When the deposit is complete (status ``deposited``), those ``metadata`` and
+ ``archive`` deposit requests will be read and aggregated. They will then be
+ sent as parameters to the loading routine.
+
+ During loading, some of those metadata are kept in the ``origin_metadata``
+ table and some other are stored in the ``revision`` table (see `metadata
+ loading <#metadata-loading>`__).
+
+ The only update actions occurring on the deposit table are in regards of:
+
+ - status changes (see figure below):
+
+ - ``partial`` -> {``expired``/``deposited``},
+ - ``deposited`` -> {``rejected``/``verified``},
+ - ``verified`` -> ``loading``
+ - ``loading`` -> {``done``/``failed``}
+
+ - ``complete_date`` when the deposit is
+ finalized (when the status is changed to ``deposited``)
+ - ``swh-id`` is populated once we have the loading result
+
+.. raw:: html
+
+
+
+.. figure:: ../images/status.png
+ :alt:
diff --git a/docs/specs/specs.rst b/docs/specs/specs.rst
new file mode 100644
index 00000000..8abdb491
--- /dev/null
+++ b/docs/specs/specs.rst
@@ -0,0 +1,14 @@
+.. _swh-deposit-specs:
+
+Blueprint Specifications
+=========================
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Contents:
+
+ blueprint.rst
+ spec-loading.rst
+ spec-technical.rst
+ spec-sparse-deposit.rst
+ spec-meta-deposit.rst
diff --git a/docs/specs/swh.xsd b/docs/specs/swh.xsd
new file mode 100644
index 00000000..a082f4d5
--- /dev/null
+++ b/docs/specs/swh.xsd
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/sys-info.rst b/docs/sys-info.rst
new file mode 100644
index 00000000..12374a6e
--- /dev/null
+++ b/docs/sys-info.rst
@@ -0,0 +1,95 @@
+Deployment of the swh-deposit
+=============================
+
+As usual, the debian package is created and uploaded to the swh debian
+repository. Once the package is installed, we need to do a few things in
+regards to the database.
+
+Prepare the database setup (existence, connection, etc...).
+-----------------------------------------------------------
+
+This is defined through the packaged ``swh.deposit.settings.production``
+module and the expected **/etc/softwareheritage/deposit/server.yml**.
+
+As usual, the expected configuration files are deployed through our
+puppet manifest (cf. puppet-environment/swh-site,
+puppet-environment/swh-role, puppet-environment/swh-profile)
+
+Environment (production)
+------------------------
+
+`SWH_CONFIG_FILENAME` must be defined and target the deposit's server
+configuration file. So either prefix the following commands or export the
+environment variable in your shell session.
+
+.. code:: shell
+
+ export SWH_CONFIG_FILENAME=/etc/softwareheritage/deposit/server.yml
+
+Migrate/bootstrap the db schema
+-------------------------------
+
+.. code:: shell
+
+ sudo django-admin migrate --settings=swh.deposit.settings.production
+
+Load minimum defaults data
+--------------------------
+
+.. code:: shell
+
+ sudo django-admin loaddata \
+ --settings=swh.deposit.settings.production deposit_data
+
+This adds the minimal 'hal' collection.
+
+Note: swh.deposit.fixtures.deposit\_data is packaged
+
+Add client and collection
+-------------------------
+
+.. code:: shell
+
+ swh deposit admin \
+ --config-file /etc/softwareheritage/deposit/server.yml \
+ --platform production \
+ user create \
+ --collection \
+ --username \
+ --password
+
+This adds a user ```` which can access the collection
+````. The password will be used for the authentication
+access to the deposit api.
+
+Note:
+ - If the collection does not exist, it is created alongside
+ - The password is plain text but stored encrypted (so yes, for now
+ we know the user's password)
+ - For production platform, you must either set an `SWH_CONFIG_FILENAME`
+ environment variable or pass alongside the `--config-file` parameter
+
+Reschedule a deposit
+---------------------
+
+.. code:: shell
+
+ swh deposit admin \
+ --config-file /etc/softwareheritage/deposit/server.yml \
+ --platform production \
+ deposit reschedule \
+ --deposit-id
+
+This will:
+
+- check that the deposit's status is something reasonable (failed or done). That
+ means that the checks have passed alright but something went wrong during the
+ loading (failed: loading failed, done: loading ok, still for some reasons as
+ in bugs, we need to reschedule it)
+- reset the deposit's status to 'verified' (prior to any loading but after the
+ checks which are fine) and removes the different archives' identifiers
+ (swh-id, ...)
+- trigger back the loading task through the scheduler
+
+
+
diff --git a/docs/tests/tests_HAL.rst b/docs/tests/tests_HAL.rst
new file mode 100644
index 00000000..0a1eeb4c
--- /dev/null
+++ b/docs/tests/tests_HAL.rst
@@ -0,0 +1,67 @@
+Tests scenarios for client
+==========================
+
+Scenarios for HAL- on HAL's platform
+------------------------------------
+
+The same procedure is used for all tests:
+
+Software Author:
+
+#. prepare content
+#. fill out form
+#. submit
+
+HAL moderator:
+
+#. review content submitted
+#. check metadata fields on HAL
+#. validate submission
+
+SWH side:
+
+1. check content in SWH:
+
+ - directory was created
+ - revision was created
+   - release was created when releaseNotes and softwareVersion were included (new feature!)
+ - origin corresponds to HAL url
+
+2. check metadata fields on SWH (in revision)
+3. check directory
+4. check swh-id on HAL
+5. check browsability when entering SWH artifact from HAL
+6. check vault artifact recreation
+7. access deposit's origin from SWH
+
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| scenario | test case | data | result | exceptions or specific checks |
++=============+===========================================+==========+=========+=========================================+
+| submit code | content: .tar.gz | .zip | success | |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| submit code | content: .zip | .tar.gz | success | |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| submit code | content: no content | empty | fail | blocked on HAL |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| submit code | content: double compression (.zip in .zip)| .zip x 2 | fail | status `failed` on SWH |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| submit code | all metadata-single entry | metadata | success | check that all metadata is transmitted |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| submit code | multiple entries | metadata | success | languages / authors / descriptions |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| new version | new content- same metadata | content | success | check new swh-id in SWH and HAL |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| new version | same content- new metadata                | metadata | ?       | blind spot- doesn't arrive to SWH       |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| new version | new content-new metadata | C & M | success | check artifacts history in revisions |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+| submit code | deposit on another hal platform | C & M | success | |
++-------------+-------------------------------------------+----------+---------+-----------------------------------------+
+
+Past known bugs:
+
+- v2 problem, where swh-id from first version is kept in the second version
+ instead of the new swh-id.
+- when deposit workers are down- error 500 is returned on HAL without real
+ explanation (because there is no error on SWH- deposit status
+ stays `deposited`).
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 00000000..b450c720
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,43 @@
+[mypy]
+namespace_packages = True
+warn_unused_ignores = True
+
+
+# support for django magic: https://github.com/typeddjango/django-stubs
+plugins = mypy_django_plugin.main
+
+[mypy.plugins.django-stubs]
+django_settings_module = swh.deposit.settings.testing
+
+
+# 3rd party libraries without stubs (yet)
+
+[mypy-celery.*]
+ignore_missing_imports = True
+
+[mypy-iso8601.*]
+ignore_missing_imports = True
+
+[mypy-pkg_resources.*]
+ignore_missing_imports = True
+
+[mypy-psycopg2.*]
+ignore_missing_imports = True
+
+[mypy-pytest.*]
+ignore_missing_imports = True
+
+[mypy-tenacity.*]
+ignore_missing_imports = True
+
+[mypy-rest_framework.*]
+ignore_missing_imports = True
+
+[mypy-xmltodict.*]
+ignore_missing_imports = True
+
+[mypy-swh.loader.tar.*]
+ignore_missing_imports = True
+
+[mypy-swh.storage.*]
+ignore_missing_imports = True
diff --git a/pyproject.toml b/pyproject.toml
index b5413f6c..69b8f4dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,11 @@
[tool.black]
target-version = ['py37']
+
+[tool.isort]
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+use_parentheses = true
+ensure_newline_before_comments = true
+line_length = 88
+force_sort_within_sections = true
diff --git a/pytest.ini b/pytest.ini
index bfe57267..020ea949 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,8 +1,9 @@
[pytest]
-addopts = -p no:flask
-norecursedirs = docs
+# Remove the pytest_swh_* entries when they stop getting imported automatically
+addopts = -p no:flask -p no:pytest_swh_scheduler -p no:pytest_swh_storage
+norecursedirs = docs .*
DJANGO_SETTINGS_MODULE = swh.deposit.settings.testing
markers =
db: execute tests using a postgresql database
fs: execute tests using the filesystem
diff --git a/requirements-server.txt b/requirements-server.txt
index d2631e2c..5a906819 100644
--- a/requirements-server.txt
+++ b/requirements-server.txt
@@ -1,2 +1,3 @@
Django < 3
djangorestframework
+setuptools
diff --git a/requirements-swh-server.txt b/requirements-swh-server.txt
index 86a85993..5e81fabe 100644
--- a/requirements-swh-server.txt
+++ b/requirements-swh-server.txt
@@ -1,4 +1,4 @@
swh.core[http]
swh.loader.core >= 0.0.71
swh.scheduler >= 0.0.39
-swh.model >= 0.1.0
+swh.model >= 0.3.8
diff --git a/requirements-swh.txt b/requirements-swh.txt
index c1af7e51..9bc67248 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1 +1 @@
-swh.core >= 0.0.75
+swh.core[http] >= 0.3
diff --git a/resources/deposit/server.yml b/resources/deposit/server.yml
new file mode 100644
index 00000000..b7dbba1f
--- /dev/null
+++ b/resources/deposit/server.yml
@@ -0,0 +1,2 @@
+# 200 Mib max size
+max_upload_size: 209715200
diff --git a/setup.py b/setup.py
index 7926e240..566feef9 100755
--- a/setup.py
+++ b/setup.py
@@ -1,79 +1,79 @@
#!/usr/bin/env python3
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from setuptools import setup, find_packages
-
-from os import path
from io import open
+from os import path
+
+from setuptools import find_packages, setup
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file
with open(path.join(here, "README.md"), encoding="utf-8") as f:
long_description = f.read()
def parse_requirements(*names):
requirements = []
for name in names:
if name:
reqf = "requirements-%s.txt" % name
else:
reqf = "requirements.txt"
if not path.exists(reqf):
return requirements
with open(reqf) as f:
for line in f.readlines():
line = line.strip()
if not line or line.startswith("#"):
continue
requirements.append(line)
return requirements
setup(
name="swh.deposit",
description="Software Heritage Deposit Server",
long_description=long_description,
long_description_content_type="text/markdown",
python_requires=">=3.7",
author="Software Heritage developers",
author_email="swh-devel@inria.fr",
url="https://forge.softwareheritage.org/source/swh-deposit/",
packages=find_packages(),
install_requires=parse_requirements(None, "swh"),
tests_require=parse_requirements("test"),
- setup_requires=["vcversioner"],
+ setup_requires=["setuptools-scm"],
+ use_scm_version=True,
extras_require={
"testing": parse_requirements("test", "server", "swh-server"),
"server": parse_requirements("server", "swh-server"),
},
- vcversioner={},
include_package_data=True,
entry_points="""
[console_scripts]
swh-deposit=swh.deposit.cli:main
[swh.cli.subcommands]
- deposit=swh.deposit.cli:deposit
+ deposit=swh.deposit.cli
[swh.workers]
deposit.worker=swh.deposit.loader:register
""",
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 5 - Production/Stable",
],
project_urls={
"Bug Reports": "https://forge.softwareheritage.org/maniphest",
"Funding": "https://www.softwareheritage.org/donate",
"Source": "https://forge.softwareheritage.org/source/swh-deposit",
"Documentation": "https://docs.softwareheritage.org/devel/swh-deposit/",
},
)
diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO
index 0e524373..5b467b4c 100644
--- a/swh.deposit.egg-info/PKG-INFO
+++ b/swh.deposit.egg-info/PKG-INFO
@@ -1,37 +1,37 @@
Metadata-Version: 2.1
Name: swh.deposit
-Version: 0.0.90
+Version: 0.1.0
Summary: Software Heritage Deposit Server
Home-page: https://forge.softwareheritage.org/source/swh-deposit/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-deposit
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-deposit/
Description: # swh-deposit
This is [Software Heritage](https://www.softwareheritage.org)'s
[SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server
implementation, as well as a simple client to upload deposits on the server.
**S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository
**D**eposit) is an interoperability standard for digital file deposit.
This implementation will permit interaction between a client (a
repository) and a server (SWH repository) to permit deposits of
software source code archives and associated metadata.
The documentation is at ./docs/README-specification.md
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
Provides-Extra: server
diff --git a/swh.deposit.egg-info/SOURCES.txt b/swh.deposit.egg-info/SOURCES.txt
index 6d6b7ec9..063a2c97 100644
--- a/swh.deposit.egg-info/SOURCES.txt
+++ b/swh.deposit.egg-info/SOURCES.txt
@@ -1,177 +1,215 @@
+.gitignore
+.pre-commit-config.yaml
+AUTHORS
+CODE_OF_CONDUCT.md
+CONTRIBUTORS
+LICENSE
MANIFEST.in
Makefile
+Makefile.local
README.md
+conftest.py
+mypy.ini
pyproject.toml
pytest.ini
requirements-server.txt
requirements-swh-server.txt
requirements-swh.txt
requirements-test.txt
requirements.txt
setup.cfg
setup.py
tox.ini
-version.txt
+bin/Makefile
+bin/content.sh
+bin/create_deposit.sh
+bin/create_deposit_atom.sh
+bin/create_deposit_with_metadata.sh
+bin/default-setup
+bin/download-deposit-archive.sh
+bin/home.sh
+bin/replace-deposit-archive.sh
+bin/service-document.sh
+bin/status.sh
+bin/update-deposit-with-another-archive.sh
+bin/update-status.sh
+docs/.gitignore
+docs/Makefile
+docs/conf.py
+docs/dev-info.rst
+docs/getting-started.rst
+docs/index.rst
+docs/metadata.rst
+docs/spec-api.rst
+docs/sys-info.rst
+docs/_static/.placeholder
+docs/_templates/.placeholder
+docs/endpoints/collection.rst
+docs/endpoints/content.rst
+docs/endpoints/service-document.rst
+docs/endpoints/status.rst
+docs/endpoints/update-media.rst
+docs/endpoints/update-metadata.rst
+docs/images/deposit-create-chart.png
+docs/images/deposit-delete-chart.png
+docs/images/deposit-update-chart.png
+docs/images/status.png
+docs/specs/blueprint.rst
+docs/specs/metadata_example.xml
+docs/specs/spec-loading.rst
+docs/specs/spec-meta-deposit.rst
+docs/specs/spec-sparse-deposit.rst
+docs/specs/spec-technical.rst
+docs/specs/specs.rst
+docs/specs/swh.xsd
+docs/tests/tests_HAL.rst
+resources/deposit/server.yml
swh/__init__.py
swh.deposit.egg-info/PKG-INFO
swh.deposit.egg-info/SOURCES.txt
swh.deposit.egg-info/dependency_links.txt
swh.deposit.egg-info/entry_points.txt
swh.deposit.egg-info/requires.txt
swh.deposit.egg-info/top_level.txt
swh/deposit/__init__.py
swh/deposit/apps.py
swh/deposit/auth.py
swh/deposit/client.py
swh/deposit/config.py
swh/deposit/errors.py
swh/deposit/exception.py
swh/deposit/gunicorn_config.py
swh/deposit/manage.py
swh/deposit/models.py
swh/deposit/parsers.py
swh/deposit/py.typed
swh/deposit/urls.py
swh/deposit/utils.py
swh/deposit/api/__init__.py
swh/deposit/api/common.py
swh/deposit/api/converters.py
swh/deposit/api/deposit.py
swh/deposit/api/deposit_content.py
swh/deposit/api/deposit_status.py
swh/deposit/api/deposit_update.py
swh/deposit/api/service_document.py
swh/deposit/api/urls.py
swh/deposit/api/private/__init__.py
swh/deposit/api/private/deposit_check.py
swh/deposit/api/private/deposit_list.py
swh/deposit/api/private/deposit_read.py
swh/deposit/api/private/deposit_update_status.py
swh/deposit/api/private/urls.py
swh/deposit/cli/__init__.py
swh/deposit/cli/admin.py
swh/deposit/cli/client.py
swh/deposit/fixtures/__init__.py
swh/deposit/fixtures/deposit_data.yaml
swh/deposit/loader/__init__.py
swh/deposit/loader/checker.py
swh/deposit/loader/tasks.py
swh/deposit/migrations/0001_initial.py
swh/deposit/migrations/0002_depositrequest_archive.py
swh/deposit/migrations/0003_temporaryarchive.py
swh/deposit/migrations/0004_delete_temporaryarchive.py
swh/deposit/migrations/0005_auto_20171019_1436.py
swh/deposit/migrations/0006_depositclient_url.py
swh/deposit/migrations/0007_auto_20171129_1609.py
swh/deposit/migrations/0008_auto_20171130_1513.py
swh/deposit/migrations/0009_deposit_parent.py
swh/deposit/migrations/0010_auto_20180110_0953.py
swh/deposit/migrations/0011_auto_20180115_1510.py
swh/deposit/migrations/0012_deposit_status_detail.py
swh/deposit/migrations/0013_depositrequest_raw_metadata.py
swh/deposit/migrations/0014_auto_20180720_1221.py
swh/deposit/migrations/0015_depositrequest_typemigration.py
swh/deposit/migrations/0016_auto_20190507_1408.py
swh/deposit/migrations/0017_auto_20190925_0906.py
swh/deposit/migrations/0018_migrate_swhids.py
swh/deposit/migrations/0019_auto_20200519_1035.py
swh/deposit/migrations/__init__.py
swh/deposit/settings/__init__.py
swh/deposit/settings/common.py
swh/deposit/settings/development.py
swh/deposit/settings/production.py
swh/deposit/settings/testing.py
swh/deposit/static/robots.txt
swh/deposit/static/css/bootstrap-responsive.min.css
swh/deposit/static/css/style.css
swh/deposit/static/img/arrow-up-small.png
swh/deposit/static/img/swh-logo-deposit.png
swh/deposit/static/img/swh-logo-deposit.svg
swh/deposit/static/img/icons/swh-logo-32x32.png
swh/deposit/static/img/icons/swh-logo-deposit-180x180.png
swh/deposit/static/img/icons/swh-logo-deposit-192x192.png
swh/deposit/static/img/icons/swh-logo-deposit-270x270.png
swh/deposit/templates/__init__.py
swh/deposit/templates/api.html
swh/deposit/templates/homepage.html
swh/deposit/templates/layout.html
swh/deposit/templates/deposit/__init__.py
swh/deposit/templates/deposit/content.xml
swh/deposit/templates/deposit/deposit_receipt.xml
swh/deposit/templates/deposit/error.xml
swh/deposit/templates/deposit/service_document.xml
swh/deposit/templates/deposit/status.xml
swh/deposit/templates/rest_framework/api.html
swh/deposit/tests/__init__.py
swh/deposit/tests/common.py
swh/deposit/tests/conftest.py
swh/deposit/tests/test_common.py
swh/deposit/tests/test_gunicorn_config.py
+swh/deposit/tests/test_init.py
swh/deposit/tests/test_utils.py
swh/deposit/tests/api/__init__.py
swh/deposit/tests/api/conftest.py
swh/deposit/tests/api/test_converters.py
swh/deposit/tests/api/test_deposit.py
swh/deposit/tests/api/test_deposit_atom.py
swh/deposit/tests/api/test_deposit_binary.py
swh/deposit/tests/api/test_deposit_delete.py
swh/deposit/tests/api/test_deposit_list.py
swh/deposit/tests/api/test_deposit_multipart.py
swh/deposit/tests/api/test_deposit_private_check.py
swh/deposit/tests/api/test_deposit_private_read_archive.py
swh/deposit/tests/api/test_deposit_private_read_metadata.py
swh/deposit/tests/api/test_deposit_private_update_status.py
swh/deposit/tests/api/test_deposit_schedule.py
swh/deposit/tests/api/test_deposit_status.py
swh/deposit/tests/api/test_deposit_update.py
swh/deposit/tests/api/test_exception.py
swh/deposit/tests/api/test_parser.py
swh/deposit/tests/api/test_service_document.py
-swh/deposit/tests/api/data/atom/codemeta-sample.xml
-swh/deposit/tests/api/data/atom/entry-data-badly-formatted.xml
-swh/deposit/tests/api/data/atom/entry-data-deposit-binary.xml
-swh/deposit/tests/api/data/atom/entry-data-empty-body.xml
-swh/deposit/tests/api/data/atom/entry-data-ko.xml
-swh/deposit/tests/api/data/atom/entry-data-minimal.xml
-swh/deposit/tests/api/data/atom/entry-data-parsing-error-prone.xml
-swh/deposit/tests/api/data/atom/entry-data0.xml
-swh/deposit/tests/api/data/atom/entry-data1.xml
-swh/deposit/tests/api/data/atom/entry-data2.xml
-swh/deposit/tests/api/data/atom/entry-data3.xml
-swh/deposit/tests/api/data/atom/entry-update-in-place.xml
-swh/deposit/tests/api/data/atom/error-with-decimal.xml
-swh/deposit/tests/api/data/atom/metadata.xml
-swh/deposit/tests/api/data/atom/tei-sample.xml
swh/deposit/tests/cli/__init__.py
swh/deposit/tests/cli/test_client.py
-swh/deposit/tests/cli/data/atom/codemeta-sample.xml
-swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml
-swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml
-swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml
-swh/deposit/tests/cli/data/atom/entry-data-ko.xml
-swh/deposit/tests/cli/data/atom/entry-data-minimal.xml
-swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml
-swh/deposit/tests/cli/data/atom/entry-data0.xml
-swh/deposit/tests/cli/data/atom/entry-data1.xml
-swh/deposit/tests/cli/data/atom/entry-data2.xml
-swh/deposit/tests/cli/data/atom/entry-data3.xml
-swh/deposit/tests/cli/data/atom/entry-update-in-place.xml
-swh/deposit/tests/cli/data/atom/error-with-decimal.xml
-swh/deposit/tests/cli/data/atom/metadata.xml
-swh/deposit/tests/cli/data/atom/tei-sample.xml
+swh/deposit/tests/data/atom/codemeta-sample.xml
+swh/deposit/tests/data/atom/entry-data-badly-formatted.xml
+swh/deposit/tests/data/atom/entry-data-deposit-binary.xml
+swh/deposit/tests/data/atom/entry-data-empty-body.xml
+swh/deposit/tests/data/atom/entry-data-ko.xml
+swh/deposit/tests/data/atom/entry-data-minimal.xml
+swh/deposit/tests/data/atom/entry-data-parsing-error-prone.xml
+swh/deposit/tests/data/atom/entry-data0.xml
+swh/deposit/tests/data/atom/entry-data1.xml
+swh/deposit/tests/data/atom/entry-data2.xml
+swh/deposit/tests/data/atom/entry-data3.xml
+swh/deposit/tests/data/atom/entry-update-in-place.xml
+swh/deposit/tests/data/atom/error-with-decimal.xml
+swh/deposit/tests/data/atom/metadata.xml
+swh/deposit/tests/data/atom/tei-sample.xml
swh/deposit/tests/loader/__init__.py
swh/deposit/tests/loader/common.py
swh/deposit/tests/loader/conftest.py
swh/deposit/tests/loader/test_checker.py
swh/deposit/tests/loader/test_client.py
swh/deposit/tests/loader/test_tasks.py
swh/deposit/tests/loader/data/http_example.org/hello.json
swh/deposit/tests/loader/data/http_example.org/hello_you
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_1_check
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_2_check
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_meta
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_raw
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_update
swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_check
swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_metadata
swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_raw
\ No newline at end of file
diff --git a/swh.deposit.egg-info/entry_points.txt b/swh.deposit.egg-info/entry_points.txt
index dbdecaea..d627b0ce 100644
--- a/swh.deposit.egg-info/entry_points.txt
+++ b/swh.deposit.egg-info/entry_points.txt
@@ -1,8 +1,8 @@
[console_scripts]
swh-deposit=swh.deposit.cli:main
[swh.cli.subcommands]
- deposit=swh.deposit.cli:deposit
+ deposit=swh.deposit.cli
[swh.workers]
deposit.worker=swh.deposit.loader:register
\ No newline at end of file
diff --git a/swh.deposit.egg-info/requires.txt b/swh.deposit.egg-info/requires.txt
index afc37349..88063b40 100644
--- a/swh.deposit.egg-info/requires.txt
+++ b/swh.deposit.egg-info/requires.txt
@@ -1,30 +1,32 @@
vcversioner
click
xmltodict
iso8601
requests
-swh.core>=0.0.75
+swh.core[http]>=0.3
[server]
Django<3
djangorestframework
+setuptools
swh.core[http]
swh.loader.core>=0.0.71
swh.scheduler>=0.0.39
-swh.model>=0.1.0
+swh.model>=0.3.8
[testing]
pytest
pytest-django
pytest-mock
swh.scheduler[testing]
swh.loader.core[testing]
pytest-postgresql>=2.1.0
requests_mock
django-stubs
Django<3
djangorestframework
+setuptools
swh.core[http]
swh.loader.core>=0.0.71
swh.scheduler>=0.0.39
-swh.model>=0.1.0
+swh.model>=0.3.8
diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py
index e69de29b..65c78f3d 100644
--- a/swh/deposit/api/__init__.py
+++ b/swh/deposit/api/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pkg_resources
+
+try:
+ __version__ = pkg_resources.get_distribution("swh.deposit").version
+except pkg_resources.DistributionNotFound:
+ __version__ = "devel"
diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
index c8e5248a..6bed49c5 100644
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -1,962 +1,1033 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from abc import ABCMeta, abstractmethod
+import datetime
import hashlib
+import json
+from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
-from typing import Sequence, Type
-
-from abc import ABCMeta, abstractmethod
-from django.urls import reverse
-from django.http import HttpResponse
+from django.http import FileResponse, HttpResponse
from django.shortcuts import render
+from django.urls import reverse
from django.utils import timezone
from rest_framework import status
from rest_framework.authentication import BaseAuthentication, BasicAuthentication
from rest_framework.permissions import BasePermission, IsAuthenticated
+from rest_framework.request import Request
from rest_framework.views import APIView
from swh.model import hashutil
from swh.scheduler.utils import create_oneshot_task_dict
from ..config import (
- SWHDefaultConfig,
+ ARCHIVE_KEY,
+ ARCHIVE_TYPE,
+ CONT_FILE_IRI,
+ DEPOSIT_STATUS_DEPOSITED,
+ DEPOSIT_STATUS_LOAD_SUCCESS,
+ DEPOSIT_STATUS_PARTIAL,
EDIT_SE_IRI,
EM_IRI,
- CONT_FILE_IRI,
- ARCHIVE_KEY,
METADATA_KEY,
+ METADATA_TYPE,
RAW_METADATA_KEY,
STATE_IRI,
- DEPOSIT_STATUS_DEPOSITED,
- DEPOSIT_STATUS_PARTIAL,
- DEPOSIT_STATUS_LOAD_SUCCESS,
- ARCHIVE_TYPE,
- METADATA_TYPE,
+ APIConfig,
)
from ..errors import (
- MAX_UPLOAD_SIZE_EXCEEDED,
BAD_REQUEST,
- ERROR_CONTENT,
CHECKSUM_MISMATCH,
- make_error_dict,
- MEDIATION_NOT_ALLOWED,
- make_error_response_from_dict,
+ ERROR_CONTENT,
FORBIDDEN,
- NOT_FOUND,
- make_error_response,
+ MAX_UPLOAD_SIZE_EXCEEDED,
+ MEDIATION_NOT_ALLOWED,
METHOD_NOT_ALLOWED,
- ParserError,
+ NOT_FOUND,
PARSING_ERROR,
+ ParserError,
+ make_error_dict,
+ make_error_response,
+ make_error_response_from_dict,
)
-from ..models import Deposit, DepositRequest, DepositCollection, DepositClient
+from ..models import Deposit, DepositClient, DepositCollection, DepositRequest
from ..parsers import parse_xml
-
ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"]
ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"]
-class SWHAPIView(APIView):
+class AuthenticatedAPIView(APIView):
"""Mixin intended as a based API view to enforce the basic
authentication check
"""
authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,)
permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,)
-class SWHBaseDeposit(SWHDefaultConfig, SWHAPIView, metaclass=ABCMeta):
+class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta):
"""Base deposit request class sharing multiple common behaviors.
"""
- def _read_headers(self, request):
+ def _read_headers(self, request: Request) -> Dict[str, Any]:
"""Read and unify the necessary headers from the request (those are
not stored in the same location or not properly formatted).
Args:
request (Request): Input request
Returns:
Dictionary with the following keys (some associated values may be
None):
- content-type
- content-length
- in-progress
- content-disposition
- packaging
- slug
- on-behalf-of
"""
meta = request._request.META
content_type = request.content_type
content_length = meta.get("CONTENT_LENGTH")
if content_length and isinstance(content_length, str):
content_length = int(content_length)
# final deposit if not provided
in_progress = meta.get("HTTP_IN_PROGRESS", False)
content_disposition = meta.get("HTTP_CONTENT_DISPOSITION")
if isinstance(in_progress, str):
in_progress = in_progress.lower() == "true"
content_md5sum = meta.get("HTTP_CONTENT_MD5")
if content_md5sum:
content_md5sum = bytes.fromhex(content_md5sum)
packaging = meta.get("HTTP_PACKAGING")
slug = meta.get("HTTP_SLUG")
on_behalf_of = meta.get("HTTP_ON_BEHALF_OF")
metadata_relevant = meta.get("HTTP_METADATA_RELEVANT")
return {
"content-type": content_type,
"content-length": content_length,
"in-progress": in_progress,
"content-disposition": content_disposition,
"content-md5sum": content_md5sum,
"packaging": packaging,
"slug": slug,
"on-behalf-of": on_behalf_of,
"metadata-relevant": metadata_relevant,
}
- def _compute_md5(self, filehandler):
+ def _compute_md5(self, filehandler) -> bytes:
"""Compute uploaded file's md5 sum.
Args:
filehandler (InMemoryUploadedFile): the file to compute the md5
hash
Returns:
the md5 checksum (str)
"""
h = hashlib.md5()
for chunk in filehandler:
h.update(chunk)
return h.digest()
def _deposit_put(
- self, request, deposit_id=None, in_progress=False, external_id=None
- ):
+ self,
+ request: Request,
+ deposit_id: Optional[int] = None,
+ in_progress: bool = False,
+ external_id: Optional[str] = None,
+ ) -> Deposit:
"""Save/Update a deposit in db.
Args:
- deposit_id (int): deposit identifier
- in_progress (dict): The deposit's status
- external_id (str): The external identifier to associate to
- the deposit
+ request: request data
+ deposit_id: deposit identifier
+ in_progress: deposit status
+ external_id: external identifier to associate to the deposit
Returns:
The Deposit instance saved or updated.
"""
+ complete_date: Optional[datetime.datetime] = None
+ deposit_parent: Optional[Deposit] = None
+
if in_progress is False:
complete_date = timezone.now()
status_type = DEPOSIT_STATUS_DEPOSITED
else:
- complete_date = None
status_type = DEPOSIT_STATUS_PARTIAL
if not deposit_id:
try:
- # find a deposit parent (same external id, status load
- # to success)
+ # find a deposit parent (same external id, status load to success)
deposit_parent = (
Deposit.objects.filter(
external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS
)
.order_by("-id")[0:1]
.get()
) # noqa
except Deposit.DoesNotExist:
- deposit_parent = None
+ # then no parent for that deposit, deposit_parent already None
+ pass
+ assert external_id is not None
deposit = Deposit(
collection=self._collection,
external_id=external_id,
complete_date=complete_date,
status=status_type,
client=self._client,
parent=deposit_parent,
)
else:
deposit = Deposit.objects.get(pk=deposit_id)
# update metadata
deposit.complete_date = complete_date
deposit.status = status_type
if self.config["checks"]:
deposit.save() # needed to have a deposit id
scheduler = self.scheduler
if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id:
task = create_oneshot_task_dict(
"check-deposit",
collection=deposit.collection.name,
deposit_id=deposit.id,
)
check_task_id = scheduler.create_tasks([task])[0]["id"]
deposit.check_task_id = check_task_id
deposit.save()
return deposit
def _deposit_request_put(
self,
- deposit,
- deposit_request_data,
- replace_metadata=False,
- replace_archives=False,
- ):
+ deposit: Deposit,
+ deposit_request_data: Dict[str, Any],
+ replace_metadata: bool = False,
+ replace_archives: bool = False,
+ ) -> None:
"""Save a deposit request with metadata attached to a deposit.
Args:
- deposit (Deposit): The deposit concerned by the request
- deposit_request_data (dict): The dictionary with at most 2 deposit
- request types (archive, metadata) to associate to the deposit
- replace_metadata (bool): Flag defining if we add or update
+ deposit: The deposit concerned by the request
+ deposit_request_data: The dictionary with at most 2 deposit
+ request types (archive, metadata) to associate to the deposit
+ replace_metadata: Flag defining if we add or update
existing metadata to the deposit
- replace_archives (bool): Flag defining if we add or update
+ replace_archives: Flag defining if we add or update
archives to existing deposit
Returns:
None
"""
if replace_metadata:
DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete()
if replace_archives:
DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()
deposit_request = None
archive_file = deposit_request_data.get(ARCHIVE_KEY)
if archive_file:
deposit_request = DepositRequest(
type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file
)
deposit_request.save()
metadata = deposit_request_data.get(METADATA_KEY)
if metadata:
- raw_metadata = deposit_request_data.get(RAW_METADATA_KEY)
+ raw_metadata = deposit_request_data[RAW_METADATA_KEY]
deposit_request = DepositRequest(
type=METADATA_TYPE,
deposit=deposit,
metadata=metadata,
raw_metadata=raw_metadata.decode("utf-8"),
)
deposit_request.save()
assert deposit_request is not None
- def _delete_archives(self, collection_name, deposit_id):
- """Delete archives reference from the deposit id.
+ def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict:
+ """Delete archive references from the deposit id.
"""
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
- NOT_FOUND, "The deposit %s does not exist" % deposit_id
+ NOT_FOUND, f"The deposit {deposit_id} does not exist"
)
DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()
return {}
- def _delete_deposit(self, collection_name, deposit_id):
+ def _delete_deposit(self, collection_name: str, deposit_id: int) -> Dict:
"""Delete deposit reference.
Args:
- collection_name (str): Client's name
- deposit_id (id): The deposit to delete
+ collection_name: Client's collection
+ deposit_id: The deposit to delete
Returns
Empty dict when ok.
Dict with error key to describe the failure.
"""
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
- NOT_FOUND, "The deposit %s does not exist" % deposit_id
+ NOT_FOUND, f"The deposit {deposit_id} does not exist"
)
if deposit.collection.name != collection_name:
summary = "Cannot delete a deposit from another collection"
description = "Deposit %s does not belong to the collection %s" % (
deposit_id,
collection_name,
)
return make_error_dict(
BAD_REQUEST, summary=summary, verbose_description=description
)
DepositRequest.objects.filter(deposit=deposit).delete()
deposit.delete()
return {}
- def _check_preconditions_on(self, filehandler, md5sum, content_length=None):
+ def _check_preconditions_on(
+ self, filehandler, md5sum: str, content_length: Optional[int] = None
+ ) -> Optional[Dict]:
"""Check preconditions on provided file are respected. That is the
length and/or the md5sum hash match the file's content.
Args:
filehandler (InMemoryUploadedFile): The file to check
- md5sum (hex str): md5 hash expected from the file's content
- content_length (int): the expected length if provided.
+ md5sum: md5 hash expected from the file's content
+ content_length: the expected length if provided.
Returns:
Either none if no error or a dictionary with a key error
detailing the problem.
"""
+ max_upload_size = self.config["max_upload_size"]
if content_length:
- if content_length > self.config["max_upload_size"]:
+ if content_length > max_upload_size:
return make_error_dict(
MAX_UPLOAD_SIZE_EXCEEDED,
- "Upload size limit exceeded (max %s bytes)."
- % self.config["max_upload_size"],
- "Please consider sending the archive in " "multiple steps.",
+                    f"Upload size limit exceeded (max {max_upload_size} bytes).",
+                    "Please consider sending the archive in multiple steps.",
)
length = filehandler.size
if length != content_length:
return make_error_dict(
status.HTTP_412_PRECONDITION_FAILED, "Wrong length"
)
if md5sum:
_md5sum = self._compute_md5(filehandler)
if _md5sum != md5sum:
return make_error_dict(
CHECKSUM_MISMATCH,
"Wrong md5 hash",
- "The checksum sent %s and the actual checksum "
- "%s does not match."
- % (hashutil.hash_to_hex(md5sum), hashutil.hash_to_hex(_md5sum)),
+ f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual "
+ f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.",
)
return None
def _binary_upload(
self,
- request,
- headers,
- collection_name,
- deposit_id=None,
- replace_metadata=False,
- replace_archives=False,
- ):
+ request: Request,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ replace_metadata: bool = False,
+ replace_archives: bool = False,
+ ) -> Dict[str, Any]:
"""Binary upload routine.
Other than such a request, a 415 response is returned.
Args:
request (Request): the request holding information to parse
and inject in db
headers (dict): request headers formatted
collection_name (str): the associated client
deposit_id (id): deposit identifier if provided
replace_metadata (bool): 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
replace_archives (bool): 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
ones.
Returns:
In the optimal case a dict with the following keys:
- deposit_id (int): Deposit identifier
- deposit_date (date): Deposit date
- archive: None (no archive is provided here)
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 413 (request entity too large) if the length of the
archive exceeds the max size configured
- 412 (precondition failed) if the length or md5 hash provided
mismatch the reality of the archive
- 415 (unsupported media type) if a wrong media type is provided
"""
content_length = headers["content-length"]
if not content_length:
return make_error_dict(
BAD_REQUEST,
"CONTENT_LENGTH header is mandatory",
- "For archive deposit, the " "CONTENT_LENGTH header must be sent.",
+ "For archive deposit, the CONTENT_LENGTH header must be sent.",
)
content_disposition = headers["content-disposition"]
if not content_disposition:
return make_error_dict(
BAD_REQUEST,
"CONTENT_DISPOSITION header is mandatory",
- "For archive deposit, the " "CONTENT_DISPOSITION header must be sent.",
+ "For archive deposit, the CONTENT_DISPOSITION header must be sent.",
)
packaging = headers["packaging"]
if packaging and packaging not in ACCEPT_PACKAGINGS:
return make_error_dict(
BAD_REQUEST,
- "Only packaging %s is supported" % ACCEPT_PACKAGINGS,
- "The packaging provided %s is not supported" % packaging,
+ f"Only packaging {ACCEPT_PACKAGINGS} is supported",
+ f"The packaging provided {packaging} is not supported",
)
filehandler = request.FILES["file"]
precondition_status_response = self._check_preconditions_on(
filehandler, headers["content-md5sum"], content_length
)
if precondition_status_response:
return precondition_status_response
external_id = headers["slug"]
# actual storage of data
archive_metadata = filehandler
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
self._deposit_request_put(
deposit,
{ARCHIVE_KEY: archive_metadata},
replace_metadata=replace_metadata,
replace_archives=replace_archives,
)
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"status": deposit.status,
"archive": filehandler.name,
}
- def _read_metadata(self, metadata_stream):
+ def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]:
"""Given a metadata stream, reads the metadata and returns both the
parsed and the raw metadata.
"""
raw_metadata = metadata_stream.read()
metadata = parse_xml(raw_metadata)
return raw_metadata, metadata
def _multipart_upload(
self,
- request,
- headers,
- collection_name,
- deposit_id=None,
- replace_metadata=False,
- replace_archives=False,
- ):
+ request: Request,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ replace_metadata: bool = False,
+ replace_archives: bool = False,
+ ) -> Dict:
"""Multipart upload supported with exactly:
- 1 archive (zip)
- 1 atom entry
Other than such a request, a 415 response is returned.
Args:
request (Request): the request holding information to parse
and inject in db
- headers (dict): request headers formatted
- collection_name (str): the associated client
- deposit_id (id): deposit identifier if provided
- replace_metadata (bool): 'Update or add' request to existing
+ headers: request headers formatted
+ collection_name: the associated client
+ deposit_id: deposit identifier if provided
+ replace_metadata: 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
- replace_archives (bool): 'Update or add' request to existing
+ replace_archives: 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
ones.
Returns:
In the optimal case a dict with the following keys:
- deposit_id (int): Deposit identifier
- deposit_date (date): Deposit date
- archive: None (no archive is provided here)
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 412 (precondition failed) if the potentially md5 hash provided
mismatch the reality of the archive
- 413 (request entity too large) if the length of the
archive exceeds the max size configured
- 415 (unsupported media type) if a wrong media type is provided
"""
external_id = headers["slug"]
content_types_present = set()
- data = {
+ data: Dict[str, Optional[Any]] = {
"application/zip": None, # expected either zip
"application/x-tar": None, # or x-tar
"application/atom+xml": None,
}
for key, value in request.FILES.items():
fh = value
- if fh.content_type in content_types_present:
+ content_type = fh.content_type
+ if content_type in content_types_present:
return make_error_dict(
ERROR_CONTENT,
"Only 1 application/zip (or application/x-tar) archive "
"and 1 atom+xml entry is supported (as per sword2.0 "
"specification)",
"You provided more than 1 application/(zip|x-tar) "
"or more than 1 application/atom+xml content-disposition "
"header in the multipart deposit",
)
- content_types_present.add(fh.content_type)
- data[fh.content_type] = fh
+ content_types_present.add(content_type)
+ assert content_type is not None
+ data[content_type] = fh
if len(content_types_present) != 2:
return make_error_dict(
ERROR_CONTENT,
"You must provide both 1 application/zip (or "
"application/x-tar) and 1 atom+xml entry for multipart "
"deposit",
"You need to provide only 1 application/(zip|x-tar) "
"and 1 application/atom+xml content-disposition header "
"in the multipart deposit",
)
filehandler = data["application/zip"]
if not filehandler:
filehandler = data["application/x-tar"]
precondition_status_response = self._check_preconditions_on(
filehandler, headers["content-md5sum"]
)
if precondition_status_response:
return precondition_status_response
try:
raw_metadata, metadata = self._read_metadata(data["application/atom+xml"])
except ParserError:
return make_error_dict(
PARSING_ERROR,
"Malformed xml metadata",
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.",
)
# actual storage of data
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
deposit_request_data = {
ARCHIVE_KEY: filehandler,
METADATA_KEY: metadata,
RAW_METADATA_KEY: raw_metadata,
}
self._deposit_request_put(
deposit, deposit_request_data, replace_metadata, replace_archives
)
+ assert filehandler is not None
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"archive": filehandler.name,
"status": deposit.status,
}
def _atom_entry(
self,
- request,
- headers,
- collection_name,
- deposit_id=None,
- replace_metadata=False,
- replace_archives=False,
- ):
+ request: Request,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ replace_metadata: bool = False,
+ replace_archives: bool = False,
+ ) -> Dict[str, Any]:
"""Atom entry deposit.
Args:
request (Request): the request holding information to parse
and inject in db
- headers (dict): request headers formatted
- collection_name (str): the associated client
- deposit_id (id): deposit identifier if provided
- replace_metadata (bool): 'Update or add' request to existing
+ headers: request headers formatted
+ collection_name: the associated client
+ deposit_id: deposit identifier if provided
+ replace_metadata: 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
- replace_archives (bool): 'Update or add' request to existing
+ replace_archives: 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
ones.
Returns:
In the optimal case a dict with the following keys:
- deposit_id: deposit id associated to the deposit
- deposit_date: date of the deposit
- archive: None (no archive is provided here)
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 400 (bad request) if the request's body is empty
- 415 (unsupported media type) if a wrong media type is provided
"""
try:
raw_metadata, metadata = self._read_metadata(request.data)
except ParserError:
return make_error_dict(
BAD_REQUEST,
"Malformed xml metadata",
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.",
)
if not metadata:
return make_error_dict(
BAD_REQUEST,
"Empty body request is not supported",
"Atom entry deposit is supposed to send for metadata. "
"If the body is empty, there is no metadata.",
)
external_id = metadata.get("external_identifier", headers["slug"])
+ # TODO: Determine if we are in the metadata-only deposit case. If it is, then
+ # save deposit and deposit request typed 'metadata' and send metadata to the
+ # metadata storage. Otherwise, do as existing deposit.
+
deposit = self._deposit_put(
request,
deposit_id=deposit_id,
in_progress=headers["in-progress"],
external_id=external_id,
)
self._deposit_request_put(
deposit,
{METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata},
replace_metadata,
replace_archives,
)
return {
"deposit_id": deposit.id,
"deposit_date": deposit.reception_date,
"archive": None,
"status": deposit.status,
}
- def _empty_post(self, request, headers, collection_name, deposit_id):
+ def _empty_post(
+ self, request: Request, headers: Dict, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Empty post to finalize an empty deposit.
Args:
- request (Request): the request holding information to parse
+ request: the request holding information to parse
and inject in db
- headers (dict): request headers formatted
- collection_name (str): the associated client
- deposit_id (id): deposit identifier
+ headers: request headers formatted
+ collection_name: the associated client
+ deposit_id: deposit identifier
Returns:
Dictionary of result with the deposit's id, the date
it was completed and no archive.
"""
deposit = Deposit.objects.get(pk=deposit_id)
deposit.complete_date = timezone.now()
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
return {
"deposit_id": deposit_id,
"deposit_date": deposit.complete_date,
"status": deposit.status,
"archive": None,
}
- def _make_iris(self, request, collection_name, deposit_id):
+ def _make_iris(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Define the IRI endpoints
Args:
request (Request): The initial request
collection_name (str): client/collection's name
deposit_id (id): Deposit identifier
Returns:
Dictionary of keys with the iris' urls.
"""
args = [collection_name, deposit_id]
return {
iri: request.build_absolute_uri(reverse(iri, args=args))
for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI]
}
- def additional_checks(self, request, headers, collection_name, deposit_id=None):
+ def additional_checks(
+ self,
+ request: Request,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Dict[str, Any]:
"""Permit the child class to enrich additional checks.
Returns:
dict with 'error' detailing the problem.
"""
return {}
- def checks(self, request, collection_name, deposit_id=None):
+ def checks(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> Dict[str, Any]:
try:
self._collection = DepositCollection.objects.get(name=collection_name)
except DepositCollection.DoesNotExist:
return make_error_dict(
- NOT_FOUND, "Unknown collection name %s" % collection_name
+ NOT_FOUND, f"Unknown collection name {collection_name}"
)
+ assert self._collection is not None
username = request.user.username
if username: # unauthenticated request can have the username empty
try:
- self._client = DepositClient.objects.get(username=username)
+ self._client: DepositClient = DepositClient.objects.get( # type: ignore
+ username=username
+ )
except DepositClient.DoesNotExist:
- return make_error_dict(NOT_FOUND, "Unknown client name %s" % username)
+ return make_error_dict(NOT_FOUND, f"Unknown client name {username}")
- if self._collection.id not in self._client.collections:
+ collection_id = self._collection.id
+ collections = self._client.collections
+ assert collections is not None
+ if collection_id not in collections:
return make_error_dict(
FORBIDDEN,
- "Client %s cannot access collection %s"
- % (username, collection_name),
+ f"Client {username} cannot access collection {collection_name}",
)
if deposit_id:
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
- NOT_FOUND, "Deposit with id %s does not exist" % deposit_id
+ NOT_FOUND, f"Deposit with id {deposit_id} does not exist"
)
checks = self.restrict_access(request, deposit)
if checks:
return checks
headers = self._read_headers(request)
if headers["on-behalf-of"]:
return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.")
checks = self.additional_checks(request, headers, collection_name, deposit_id)
if "error" in checks:
return checks
return {"headers": headers}
- def restrict_access(self, request, deposit=None):
+ def restrict_access(
+ self, request: Request, deposit: Optional[Deposit] = None
+ ) -> Dict[str, Any]:
if deposit:
if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
summary = "You can only act on deposit with status '%s'" % (
DEPOSIT_STATUS_PARTIAL,
)
- description = "This deposit has status '%s'" % deposit.status
+ description = f"This deposit has status '{deposit.status}'"
return make_error_dict(
BAD_REQUEST, summary=summary, verbose_description=description
)
+ return {}
- def _basic_not_allowed_method(self, request, method):
+ def _basic_not_allowed_method(self, request: Request, method: str):
return make_error_response(
request,
METHOD_NOT_ALLOWED,
- "%s method is not supported on this endpoint" % method,
+ f"{method} method is not supported on this endpoint",
)
- def get(self, request, *args, **kwargs):
+ def get(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Union[HttpResponse, FileResponse]:
return self._basic_not_allowed_method(request, "GET")
- def post(self, request, *args, **kwargs):
+ def post(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> HttpResponse:
return self._basic_not_allowed_method(request, "POST")
- def put(self, request, *args, **kwargs):
+ def put(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> HttpResponse:
return self._basic_not_allowed_method(request, "PUT")
- def delete(self, request, *args, **kwargs):
+ def delete(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> HttpResponse:
return self._basic_not_allowed_method(request, "DELETE")
-class SWHGetDepositAPI(SWHBaseDeposit, metaclass=ABCMeta):
+class APIGet(APIBase, metaclass=ABCMeta):
"""Mixin for class to support GET method.
"""
- def get(self, request, collection_name, deposit_id, format=None):
+ def get(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Union[HttpResponse, FileResponse]:
"""Endpoint to create/add resources to deposit.
Returns:
200 response when no error during routine occurred
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
r = self.process_get(request, collection_name, deposit_id)
- if isinstance(r, tuple):
- status, content, content_type = r
- return HttpResponse(content, status=status, content_type=content_type)
-
- return r
+ status, content, content_type = r
+ if content_type == "swh/generator":
+ with content as path:
+ return FileResponse(
+ open(path, "rb"), status=status, content_type="application/zip"
+ )
+ if content_type == "application/json":
+ return HttpResponse(
+ json.dumps(content), status=status, content_type=content_type
+ )
+ return HttpResponse(content, status=status, content_type=content_type)
@abstractmethod
- def process_get(self, request, collection_name, deposit_id):
+ def process_get(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Tuple[int, Any, str]:
"""Routine to deal with the deposit's get processing.
Returns:
Tuple status, stream of content, content-type
"""
pass
-class SWHPostDepositAPI(SWHBaseDeposit, metaclass=ABCMeta):
+class APIPost(APIBase, metaclass=ABCMeta):
"""Mixin for class to support DELETE method.
"""
- def post(self, request, collection_name, deposit_id=None, format=None):
+ def post(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> HttpResponse:
"""Endpoint to create/add resources to deposit.
Returns:
204 response when no error during routine occurred.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
headers = checks["headers"]
_status, _iri_key, data = self.process_post(
request, headers, collection_name, deposit_id
)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
data["packagings"] = ACCEPT_PACKAGINGS
iris = self._make_iris(request, collection_name, data["deposit_id"])
data.update(iris)
response = render(
request,
"deposit/deposit_receipt.xml",
context=data,
content_type="application/xml",
status=_status,
)
- response._headers["location"] = "Location", data[_iri_key]
+ response._headers["location"] = "Location", data[_iri_key] # type: ignore
return response
@abstractmethod
- def process_post(self, request, headers, collection_name, deposit_id=None):
+ def process_post(
+ self,
+ request,
+ headers: Dict,
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Tuple[int, str, Dict]:
"""Routine to deal with the deposit's processing.
Returns
Tuple of:
- response status code (200, 201, etc...)
- key iri (EM_IRI, EDIT_SE_IRI, etc...)
- dictionary of the processing result
"""
pass
-class SWHPutDepositAPI(SWHBaseDeposit, metaclass=ABCMeta):
+class APIPut(APIBase, metaclass=ABCMeta):
"""Mixin for class to support PUT method.
"""
- def put(self, request, collection_name, deposit_id, format=None):
+ def put(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> HttpResponse:
"""Endpoint to update deposit resources.
Returns:
204 response when no error during routine occurred.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
headers = checks["headers"]
data = self.process_put(request, headers, collection_name, deposit_id)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
return HttpResponse(status=status.HTTP_204_NO_CONTENT)
@abstractmethod
- def process_put(self, request, headers, collection_name, deposit_id):
+ def process_put(
+ self, request: Request, headers: Dict, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Routine to deal with updating a deposit in some way.
Returns
dictionary of the processing result
"""
pass
-class SWHDeleteDepositAPI(SWHBaseDeposit, metaclass=ABCMeta):
+class APIDelete(APIBase, metaclass=ABCMeta):
"""Mixin for class to support DELETE method.
"""
- def delete(self, request, collection_name, deposit_id):
+ def delete(
+ self, request: Request, collection_name: str, deposit_id: Optional[int] = None
+ ) -> HttpResponse:
"""Endpoint to delete some deposit's resources (archives, deposit).
Returns:
204 response when no error during routine occurred.
400 if the deposit does not belong to the collection
404 if the deposit or the collection does not exist
"""
checks = self.checks(request, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(request, checks["error"])
+ assert deposit_id is not None
data = self.process_delete(request, collection_name, deposit_id)
error = data.get("error")
if error:
return make_error_response_from_dict(request, error)
return HttpResponse(status=status.HTTP_204_NO_CONTENT)
@abstractmethod
- def process_delete(self, request, collection_name, deposit_id):
+ def process_delete(
+ self, request: Request, collection_name: str, deposit_id: int
+ ) -> Dict:
"""Routine to delete a resource.
This is mostly not allowed except for the
- EM_IRI (cf. .api.deposit_update.SWHUpdateArchiveDeposit)
+ EM_IRI (cf. .api.deposit_update.APIUpdateArchive)
"""
- pass
+ return {}
diff --git a/swh/deposit/api/deposit.py b/swh/deposit/api/deposit.py
index c5258081..8cc4455c 100644
--- a/swh/deposit/api/deposit.py
+++ b/swh/deposit/api/deposit.py
@@ -1,95 +1,112 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Any, Dict, Optional, Tuple
+
from rest_framework import status
-from .common import SWHPostDepositAPI, ACCEPT_ARCHIVE_CONTENT_TYPES
from ..config import EDIT_SE_IRI
-from ..errors import make_error_dict, BAD_REQUEST
-from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser
-from ..parsers import SWHAtomEntryParser
-from ..parsers import SWHMultiPartParser
+from ..errors import BAD_REQUEST, make_error_dict
+from ..parsers import (
+ SWHAtomEntryParser,
+ SWHFileUploadTarParser,
+ SWHFileUploadZipParser,
+ SWHMultiPartParser,
+)
+from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIPost
-class SWHDeposit(SWHPostDepositAPI):
+class APIPostDeposit(APIPost):
"""Deposit request class defining api endpoints for sword deposit.
What's known as 'Col IRI' in the sword specification.
HTTP verbs supported: POST
"""
parser_classes = (
SWHMultiPartParser,
SWHFileUploadZipParser,
SWHFileUploadTarParser,
SWHAtomEntryParser,
)
- def additional_checks(self, req, headers, collection_name, deposit_id=None):
+ def additional_checks(
+ self,
+ req,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Dict[str, Any]:
slug = headers["slug"]
if not slug:
msg = "Missing SLUG header in request"
verbose_description = "Provide in the SLUG header one identifier, for example the url pointing to the resource you are depositing." # noqa
return make_error_dict(BAD_REQUEST, msg, verbose_description)
return {}
- def process_post(self, req, headers, collection_name, deposit_id=None):
+ def process_post(
+ self,
+ req,
+ headers: Dict[str, Any],
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Tuple[int, str, Dict[str, Any]]:
"""Create a first deposit as:
- archive deposit (1 zip)
- multipart (1 zip + 1 atom entry)
- atom entry
Args:
req (Request): the request holding the information to parse
and inject in db
collection_name (str): the associated client
Returns:
An http response (HttpResponse) according to the situation.
If everything is ok, a 201 response (created) with a
deposit receipt.
Otherwise, depending on the upload, the following errors
can be returned:
- archive deposit:
- 400 (bad request) if the request is not providing an external
identifier
- 403 (forbidden) if the length of the archive exceeds the
max size configured
- 412 (precondition failed) if the length or hash provided
mismatch the reality of the archive.
- 415 (unsupported media type) if a wrong media type is
provided
- multipart deposit:
- 400 (bad request) if the request is not providing an external
identifier
- 412 (precondition failed) if the potentially md5 hash
provided mismatch the reality of the archive
- 415 (unsupported media type) if a wrong media type is
provided
- Atom entry deposit:
- 400 (bad request) if the request is not providing an external
identifier
- 400 (bad request) if the request's body is empty
- 415 (unsupported media type) if a wrong media type is
provided
"""
assert deposit_id is None
if req.content_type in ACCEPT_ARCHIVE_CONTENT_TYPES:
data = self._binary_upload(req, headers, collection_name)
elif req.content_type.startswith("multipart/"):
data = self._multipart_upload(req, headers, collection_name)
else:
data = self._atom_entry(req, headers, collection_name)
return status.HTTP_201_CREATED, EDIT_SE_IRI, data
diff --git a/swh/deposit/api/deposit_content.py b/swh/deposit/api/deposit_content.py
index a5065ae6..fbab2fe4 100644
--- a/swh/deposit/api/deposit_content.py
+++ b/swh/deposit/api/deposit_content.py
@@ -1,47 +1,47 @@
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from django.http import HttpResponse
from django.shortcuts import render
from rest_framework import status
-from .common import SWHBaseDeposit
-from ..errors import NOT_FOUND, make_error_response
-from ..errors import make_error_response_from_dict
+from ..errors import NOT_FOUND, make_error_response, make_error_response_from_dict
from ..models import DEPOSIT_STATUS_DETAIL, Deposit, DepositRequest
+from .common import APIBase
-class SWHDepositContent(SWHBaseDeposit):
- def get(self, req, collection_name, deposit_id, format=None):
+class APIContent(APIBase):
+ def get(self, req, collection_name: str, deposit_id: int) -> HttpResponse:
checks = self.checks(req, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(req, checks["error"])
try:
deposit = Deposit.objects.get(pk=deposit_id)
if deposit.collection.name != collection_name:
raise Deposit.DoesNotExist
except Deposit.DoesNotExist:
return make_error_response(
req,
NOT_FOUND,
"deposit %s does not belong to collection %s"
% (deposit_id, collection_name),
)
requests = DepositRequest.objects.filter(deposit=deposit)
context = {
"deposit_id": deposit.id,
"status": deposit.status,
"status_detail": DEPOSIT_STATUS_DETAIL[deposit.status],
"requests": requests,
}
return render(
req,
"deposit/content.xml",
context=context,
content_type="application/xml",
status=status.HTTP_200_OK,
)
diff --git a/swh/deposit/api/deposit_status.py b/swh/deposit/api/deposit_status.py
index 18c26556..9c87db9c 100644
--- a/swh/deposit/api/deposit_status.py
+++ b/swh/deposit/api/deposit_status.py
@@ -1,65 +1,65 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from django.http import HttpResponse
from django.shortcuts import render
from rest_framework import status
-from .common import SWHBaseDeposit
-from .converters import convert_status_detail
-from ..errors import NOT_FOUND, make_error_response
-from ..errors import make_error_response_from_dict
+from ..errors import NOT_FOUND, make_error_response, make_error_response_from_dict
from ..models import DEPOSIT_STATUS_DETAIL, Deposit
+from .common import APIBase
+from .converters import convert_status_detail
-class SWHDepositStatus(SWHBaseDeposit):
+class APIStatus(APIBase):
"""Deposit status.
What's known as 'State IRI' in the sword specification.
HTTP verbs supported: GET
"""
- def get(self, req, collection_name, deposit_id, format=None):
+ def get(self, req, collection_name: str, deposit_id: int) -> HttpResponse:
checks = self.checks(req, collection_name, deposit_id)
if "error" in checks:
return make_error_response_from_dict(req, checks["error"])
try:
deposit = Deposit.objects.get(pk=deposit_id)
if deposit.collection.name != collection_name:
raise Deposit.DoesNotExist
except Deposit.DoesNotExist:
return make_error_response(
req,
NOT_FOUND,
"deposit %s does not belong to collection %s"
% (deposit_id, collection_name),
)
status_detail = convert_status_detail(deposit.status_detail)
if not status_detail:
status_detail = DEPOSIT_STATUS_DETAIL[deposit.status]
context = {
"deposit_id": deposit.id,
"status_detail": status_detail,
}
keys = (
"status",
"swh_id",
"swh_id_context",
"external_id",
)
for k in keys:
context[k] = getattr(deposit, k, None)
return render(
req,
"deposit/status.xml",
context=context,
content_type="application/xml",
status=status.HTTP_200_OK,
)
diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py
index 21707415..ded1bf5f 100644
--- a/swh/deposit/api/deposit_update.py
+++ b/swh/deposit/api/deposit_update.py
@@ -1,169 +1,185 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Any, Dict, Optional, Tuple
+
from rest_framework import status
-from .common import SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI
-from .common import ACCEPT_ARCHIVE_CONTENT_TYPES
from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI
-from ..errors import make_error_dict, BAD_REQUEST
-from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser
-from ..parsers import SWHAtomEntryParser
-from ..parsers import SWHMultiPartParser
+from ..errors import BAD_REQUEST, make_error_dict
+from ..parsers import (
+ SWHAtomEntryParser,
+ SWHFileUploadTarParser,
+ SWHFileUploadZipParser,
+ SWHMultiPartParser,
+)
+from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut
-class SWHUpdateArchiveDeposit(SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI):
+class APIUpdateArchive(APIPost, APIPut, APIDelete):
"""Deposit request class defining api endpoints for sword deposit.
What's known as 'EM IRI' in the sword specification.
HTTP verbs supported: PUT, POST, DELETE
"""
parser_classes = (
SWHFileUploadZipParser,
SWHFileUploadTarParser,
)
- def process_put(self, req, headers, collection_name, deposit_id):
+ def process_put(
+ self, req, headers, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Replace existing content for the existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa
Returns:
204 No content
"""
if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES:
msg = "Packaging format supported is restricted to %s" % (
", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES)
)
return make_error_dict(BAD_REQUEST, msg)
return self._binary_upload(
req, headers, collection_name, deposit_id=deposit_id, replace_archives=True
)
- def process_post(self, req, headers, collection_name, deposit_id):
+ def process_post(
+ self, req, headers: Dict, collection_name: str, deposit_id: Optional[int] = None
+ ) -> Tuple[int, str, Dict]:
"""Add new content to the existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa
Returns:
201 Created
Headers: Location: [Cont-File-IRI]
Body: [optional Deposit Receipt]
"""
if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES:
msg = "Packaging format supported is restricted to %s" % (
", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES)
)
- return "unused", "unused", make_error_dict(BAD_REQUEST, msg)
+ unused = 0
+ return unused, "unused", make_error_dict(BAD_REQUEST, msg)
return (
status.HTTP_201_CREATED,
CONT_FILE_IRI,
self._binary_upload(req, headers, collection_name, deposit_id),
)
- def process_delete(self, req, collection_name, deposit_id):
+ def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict:
"""Delete content (archives) from existing deposit.
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa
Returns:
204 Created
"""
return self._delete_archives(collection_name, deposit_id)
-class SWHUpdateMetadataDeposit(
- SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI
-):
+class APIUpdateMetadata(APIPost, APIPut, APIDelete):
"""Deposit request class defining api endpoints for sword deposit.
What's known as 'Edit IRI' (and SE IRI) in the sword specification.
HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE
"""
parser_classes = (SWHMultiPartParser, SWHAtomEntryParser)
- def process_put(self, req, headers, collection_name, deposit_id):
+ def process_put(
+ self, req, headers: Dict, collection_name: str, deposit_id: int
+ ) -> Dict[str, Any]:
"""Replace existing deposit's metadata/archive with new ones.
source:
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa
Returns:
204 No content
"""
if req.content_type.startswith("multipart/"):
return self._multipart_upload(
req,
headers,
collection_name,
deposit_id=deposit_id,
replace_archives=True,
replace_metadata=True,
)
return self._atom_entry(
req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True
)
- def process_post(self, req, headers, collection_name, deposit_id):
+ def process_post(
+ self,
+ request,
+ headers: Dict,
+ collection_name: str,
+ deposit_id: Optional[int] = None,
+ ) -> Tuple[int, str, Dict]:
"""Add new metadata/archive to existing deposit.
source:
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata # noqa
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart # noqa
This also deals with an empty post corner case to finalize a
deposit.
Returns:
In optimal case for a multipart and atom-entry update, a
201 Created response. The body response will hold a
deposit. And the response headers will contain an entry
'Location' with the EM-IRI.
For the empty post case, this returns a 200.
"""
- if req.content_type.startswith("multipart/"):
+ assert deposit_id is not None
+ if request.content_type.startswith("multipart/"):
return (
status.HTTP_201_CREATED,
EM_IRI,
self._multipart_upload(
- req, headers, collection_name, deposit_id=deposit_id
+ request, headers, collection_name, deposit_id=deposit_id
),
)
# check for final empty post
# source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html
# #continueddeposit_complete
if headers["content-length"] == 0 and headers["in-progress"] is False:
- data = self._empty_post(req, headers, collection_name, deposit_id)
+ data = self._empty_post(request, headers, collection_name, deposit_id)
return (status.HTTP_200_OK, EDIT_SE_IRI, data)
return (
status.HTTP_201_CREATED,
EM_IRI,
- self._atom_entry(req, headers, collection_name, deposit_id=deposit_id),
+ self._atom_entry(request, headers, collection_name, deposit_id=deposit_id),
)
- def process_delete(self, req, collection_name, deposit_id):
+ def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict:
"""Delete the container (deposit).
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa
"""
return self._delete_deposit(collection_name, deposit_id)
diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py
index db3e2f5a..4a9aaaa8 100644
--- a/swh/deposit/api/private/__init__.py
+++ b/swh/deposit/api/private/__init__.py
@@ -1,109 +1,96 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.deposit import utils
-
-from ...config import METADATA_TYPE, SWHDefaultConfig
-from ...models import DepositRequest, Deposit
-
from rest_framework.permissions import AllowAny
-from swh.deposit.api.common import SWHAPIView
-from swh.deposit.errors import make_error_dict, NOT_FOUND
+from swh.deposit import utils
+from swh.deposit.api.common import AuthenticatedAPIView
+from swh.deposit.errors import NOT_FOUND, make_error_dict
+
+from ...config import METADATA_TYPE, APIConfig
+from ...models import Deposit, DepositRequest
class DepositReadMixin:
"""Deposit Read mixin
"""
def _deposit_requests(self, deposit, request_type):
"""Given a deposit, yields its associated deposit_request
Args:
deposit (Deposit): Deposit to list requests for
request_type (str): 'archive' or 'metadata'
Yields:
deposit requests of type request_type associated to the deposit
"""
if isinstance(deposit, int):
deposit = Deposit.objects.get(pk=deposit)
deposit_requests = DepositRequest.objects.filter(
type=request_type, deposit=deposit
).order_by("id")
for deposit_request in deposit_requests:
yield deposit_request
def _metadata_get(self, deposit):
"""Given a deposit, aggregate all metadata requests.
Args:
deposit (Deposit): The deposit instance to extract
metadata from.
Returns:
metadata dict from the deposit.
"""
metadata = (
m.metadata
for m in self._deposit_requests(deposit, request_type=METADATA_TYPE)
)
return utils.merge(*metadata)
-class SWHPrivateAPIView(SWHDefaultConfig, SWHAPIView):
+class APIPrivateView(APIConfig, AuthenticatedAPIView):
"""Mixin intended as private api (so no authentication) based API view
(for the private ones).
"""
authentication_classes = ()
permission_classes = (AllowAny,)
def checks(self, req, collection_name, deposit_id=None):
"""Override default checks implementation to allow empty collection.
"""
if deposit_id:
try:
Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND, "Deposit with id %s does not exist" % deposit_id
)
headers = self._read_headers(req)
checks = self.additional_checks(req, headers, collection_name, deposit_id)
if "error" in checks:
return checks
return {"headers": headers}
def get(
- self,
- request,
- collection_name=None,
- deposit_id=None,
- format=None,
- *args,
- **kwargs,
+ self, request, collection_name=None, deposit_id=None, *args, **kwargs,
):
- return super().get(request, collection_name, deposit_id, format)
+ return super().get(request, collection_name, deposit_id)
def put(
- self,
- request,
- collection_name=None,
- deposit_id=None,
- format=None,
- *args,
- **kwargs,
+ self, request, collection_name=None, deposit_id=None, *args, **kwargs,
):
- return super().put(request, collection_name, deposit_id, format)
+ return super().put(request, collection_name, deposit_id)
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
index 76924560..d2afd5e7 100644
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -1,230 +1,234 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
+from itertools import chain
import re
+from shutil import get_unpack_formats
import tarfile
+from typing import Dict, Optional, Tuple
import zipfile
-from itertools import chain
-from shutil import get_unpack_formats
-
from rest_framework import status
from swh.scheduler.utils import create_oneshot_task_dict
-from . import DepositReadMixin, SWHPrivateAPIView
-from ..common import SWHGetDepositAPI
-from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED
-from ...config import ARCHIVE_TYPE
-from ...models import Deposit
+from . import APIPrivateView, DepositReadMixin
+from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED
+from ...models import Deposit, DepositRequest
+from ..common import APIGet
MANDATORY_FIELDS_MISSING = "Mandatory fields are missing"
ALTERNATE_FIELDS_MISSING = "Mandatory alternate fields are missing"
MANDATORY_ARCHIVE_UNREADABLE = (
"At least one of its associated archives is not readable" # noqa
)
MANDATORY_ARCHIVE_INVALID = (
"Mandatory archive is invalid (i.e contains only one archive)" # noqa
)
MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported"
MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected"
ARCHIVE_EXTENSIONS = [
"zip",
"tar",
"tar.gz",
"xz",
"tar.xz",
"bz2",
"tar.bz2",
"Z",
"tar.Z",
"tgz",
"7z",
]
PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS))
def known_archive_format(filename):
return any(
filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats()))
)
-class SWHChecksDeposit(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin):
+class APIChecks(APIPrivateView, APIGet, DepositReadMixin):
"""Dedicated class to read a deposit's raw archives content.
Only GET is supported.
"""
- def _check_deposit_archives(self, deposit):
+ def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]:
"""Given a deposit, check each deposit request of type archive.
Args:
The deposit to check archives for
Returns
tuple (status, error_detail): True, None if all archives
are ok, (False, ) otherwise.
"""
requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE))
if len(requests) == 0: # no associated archive is refused
return False, {"archive": [{"summary": MANDATORY_ARCHIVE_MISSING,}]}
errors = []
for archive_request in requests:
check, error_message = self._check_archive(archive_request)
if not check:
errors.append(
{"summary": error_message, "fields": [archive_request.id]}
)
if not errors:
return True, None
return False, {"archive": errors}
- def _check_archive(self, archive_request):
+ def _check_archive(
+ self, archive_request: DepositRequest
+ ) -> Tuple[bool, Optional[str]]:
"""Check that a deposit associated archive is ok:
- readable
- supported archive format
- valid content: the archive does not contain a single archive file
If any of those checks are not ok, return the corresponding
failing check.
Args:
archive_path (DepositRequest): Archive to check
Returns:
(True, None) if archive is check compliant, (False,
) otherwise.
"""
archive_path = archive_request.archive.path
if not known_archive_format(archive_path):
return False, MANDATORY_ARCHIVE_UNSUPPORTED
try:
if zipfile.is_zipfile(archive_path):
- with zipfile.ZipFile(archive_path) as f:
- files = f.namelist()
+ with zipfile.ZipFile(archive_path) as zipfile_:
+ files = zipfile_.namelist()
elif tarfile.is_tarfile(archive_path):
- with tarfile.open(archive_path) as f:
- files = f.getnames()
+ with tarfile.open(archive_path) as tarfile_:
+ files = tarfile_.getnames()
else:
return False, MANDATORY_ARCHIVE_UNSUPPORTED
except Exception:
return False, MANDATORY_ARCHIVE_UNREADABLE
if len(files) > 1:
return True, None
element = files[0]
if PATTERN_ARCHIVE_EXTENSION.match(element):
# archive in archive!
return False, MANDATORY_ARCHIVE_INVALID
return True, None
- def _check_metadata(self, metadata):
+ def _check_metadata(self, metadata: Dict) -> Tuple[bool, Optional[Dict]]:
"""Check to execute on all metadata for mandatory field presence.
Args:
metadata (dict): Metadata dictionary to check for mandatory fields
Returns:
tuple (status, error_detail): True, None if metadata are
ok (False, ) otherwise.
"""
required_fields = {
"author": False,
}
alternate_fields = {
("name", "title"): False, # alternate field, at least one
# of them must be present
}
for field, value in metadata.items():
for name in required_fields:
if name in field:
required_fields[name] = True
for possible_names in alternate_fields:
for possible_name in possible_names:
if possible_name in field:
alternate_fields[possible_names] = True
continue
mandatory_result = [k for k, v in required_fields.items() if not v]
optional_result = [" or ".join(k) for k, v in alternate_fields.items() if not v]
if mandatory_result == [] and optional_result == []:
return True, None
detail = []
if mandatory_result != []:
detail.append(
{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}
)
if optional_result != []:
detail.append(
{"summary": ALTERNATE_FIELDS_MISSING, "fields": optional_result,}
)
return False, {"metadata": detail}
- def process_get(self, req, collection_name, deposit_id):
+ def process_get(
+ self, req, collection_name: str, deposit_id: int
+ ) -> Tuple[int, Dict, str]:
"""Build a unique tarball from the multiple received and stream that
content to the client.
Args:
req (Request):
collection_name (str): Collection owning the deposit
deposit_id (id): Deposit concerned by the reading
Returns:
Tuple status, stream of content, content-type
"""
deposit = Deposit.objects.get(pk=deposit_id)
metadata = self._metadata_get(deposit)
- problems = {}
+ problems: Dict = {}
# will check each deposit's associated request (both of type
# archive and metadata) for errors
archives_status, error_detail = self._check_deposit_archives(deposit)
if not archives_status:
+ assert error_detail is not None
problems.update(error_detail)
metadata_status, error_detail = self._check_metadata(metadata)
if not metadata_status:
+ assert error_detail is not None
problems.update(error_detail)
deposit_status = archives_status and metadata_status
# if any problems arose, the deposit is rejected
if not deposit_status:
deposit.status = DEPOSIT_STATUS_REJECTED
deposit.status_detail = problems
response = {
"status": deposit.status,
"details": deposit.status_detail,
}
else:
deposit.status = DEPOSIT_STATUS_VERIFIED
response = {
"status": deposit.status,
}
if not deposit.load_task_id and self.config["checks"]:
url = deposit.origin_url
task = create_oneshot_task_dict(
"load-deposit", url=url, deposit_id=deposit.id, retries_left=3
)
load_task_id = self.scheduler.create_tasks([task])[0]["id"]
deposit.load_task_id = load_task_id
deposit.save()
- return status.HTTP_200_OK, json.dumps(response), "application/json"
+ return status.HTTP_200_OK, response, "application/json"
diff --git a/swh/deposit/api/private/deposit_list.py b/swh/deposit/api/private/deposit_list.py
index c63a14df..a5c81c12 100644
--- a/swh/deposit/api/private/deposit_list.py
+++ b/swh/deposit/api/private/deposit_list.py
@@ -1,66 +1,66 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from rest_framework import serializers
from rest_framework.fields import _UnvalidatedField
from rest_framework.generics import ListAPIView
from rest_framework.pagination import PageNumberPagination
-from rest_framework import serializers
-from . import SWHPrivateAPIView
-from ..converters import convert_status_detail
+from . import APIPrivateView
from ...models import Deposit
+from ..converters import convert_status_detail
class DefaultPagination(PageNumberPagination):
page_size = 100
page_size_query_param = "page_size"
class StatusDetailField(_UnvalidatedField):
"""status_detail field is a dict, we want a simple message instead.
So, we reuse the convert_status_detail from deposit_status
endpoint to that effect.
"""
def to_representation(self, value):
return convert_status_detail(value)
class DepositSerializer(serializers.ModelSerializer):
status_detail = StatusDetailField()
class Meta:
model = Deposit
fields = "__all__"
-class DepositList(ListAPIView, SWHPrivateAPIView):
+class APIList(ListAPIView, APIPrivateView):
"""Deposit request class to list the deposit's status per page.
HTTP verbs supported: GET
"""
serializer_class = DepositSerializer
pagination_class = DefaultPagination
def get_queryset(self):
params = self.request.query_params
exclude_like = params.get("exclude")
if exclude_like:
# sql injection: A priori, nothing to worry about, django does it for
# queryset
# https://docs.djangoproject.com/en/3.0/topics/security/#sql-injection-protection # noqa
# https://docs.djangoproject.com/en/2.2/topics/security/#sql-injection-protection # noqa
deposits = (
Deposit.objects.all()
.exclude(external_id__startswith=exclude_like)
.order_by("id")
)
else:
deposits = Deposit.objects.all().order_by("id")
return deposits
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index a387fc7f..51b6636e 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,218 +1,195 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
+from contextlib import contextmanager
import os
import shutil
import tempfile
+from typing import Any, Dict, Tuple
-from contextlib import contextmanager
-from django.http import FileResponse
from rest_framework import status
from swh.core import tarball
-from swh.model import identifiers
+from swh.deposit.api import __version__
from swh.deposit.utils import normalize_date
+from swh.model import identifiers
-from . import DepositReadMixin, SWHPrivateAPIView
-from ...config import SWH_PERSON, ARCHIVE_TYPE
-from ..common import SWHGetDepositAPI
+from . import APIPrivateView, DepositReadMixin
+from ...config import ARCHIVE_TYPE, SWH_PERSON
from ...models import Deposit
+from ..common import APIGet
@contextmanager
def aggregate_tarballs(extraction_dir, archive_paths):
    """Aggregate multiple tarballs into one and yield this new archive's
    path.

    Args:
        extraction_dir (path): Path to use for the tarballs computation
        archive_paths ([str]): Deposit's archive paths

    Yields:
        Aggregated archive path (aggregated or not)

    """
    # rebuild one zip archive from (possibly) multiple ones
    os.makedirs(extraction_dir, 0o755, exist_ok=True)
    dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir)
    # Everything from here on must be guarded: without the try/finally the
    # temporary directory leaked whenever uncompress/make_archive raised.
    try:
        # root folder to build an aggregated tarball
        aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate")
        os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)
        # uncompress in a temporary location all archives
        for archive_path in archive_paths:
            tarball.uncompress(archive_path, aggregated_tarball_rootdir)
        # Aggregate into one big tarball the multiple smaller ones
        temp_tarpath = shutil.make_archive(
            aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir
        )
        # can already clean up the uncompressed tree, only the zip is needed
        shutil.rmtree(aggregated_tarball_rootdir)
        yield temp_tarpath
    finally:
        # always remove the whole work directory (including the zip)
        shutil.rmtree(dir_path)
-class SWHDepositReadArchives(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin):
+class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin):
"""Dedicated class to read a deposit's raw archives content.
Only GET is supported.
"""
- ADDITIONAL_CONFIG = {
- "extraction_dir": ("str", "/tmp/swh-deposit/archive/"),
- }
-
def __init__(self):
    # "extraction_dir" comes from the server configuration (self.config,
    # populated by the parent class); it is the scratch space used to
    # aggregate deposit archives before streaming them back.
    super().__init__()
    self.extraction_dir = self.config["extraction_dir"]
    # create it eagerly so the first GET request does not have to
    if not os.path.exists(self.extraction_dir):
        os.makedirs(self.extraction_dir)
- def process_get(self, request, collection_name, deposit_id):
+ def process_get(
+ self, request, collection_name: str, deposit_id: int
+ ) -> Tuple[int, Any, str]:
"""Build a unique tarball from the multiple received and stream that
content to the client.
Args:
request (Request):
- collection_name (str): Collection owning the deposit
- deposit_id (id): Deposit concerned by the reading
+ collection_name: Collection owning the deposit
+ deposit_id: Deposit concerned by the reading
Returns:
Tuple status, stream of content, content-type
"""
archive_paths = [
r.archive.path
for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE)
]
- with aggregate_tarballs(self.extraction_dir, archive_paths) as path:
- return FileResponse(
- open(path, "rb"),
- status=status.HTTP_200_OK,
- content_type="application/zip",
- )
+ return (
+ status.HTTP_200_OK,
+ aggregate_tarballs(self.extraction_dir, archive_paths),
+ "swh/generator",
+ )
-class SWHDepositReadMetadata(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin):
+class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin):
"""Class in charge of aggregating metadata on a deposit.
- """
-
- ADDITIONAL_CONFIG = {
- "provider": (
- "dict",
- {
- # 'provider_name': '', # those are not set since read from the
- # 'provider_url': '', # deposit's client
- "provider_type": "deposit_client",
- "metadata": {},
- },
- ),
- "tool": (
- "dict",
- {
- "name": "swh-deposit",
- "version": "0.0.1",
- "configuration": {"sword_version": "2"},
- },
- ),
- }
+ """
def __init__(self):
super().__init__()
self.provider = self.config["provider"]
- self.tool = self.config["tool"]
+ self.tool = {
+ "name": "swh-deposit",
+ "version": __version__,
+ "configuration": {"sword_version": "2"},
+ }
def _normalize_dates(self, deposit, metadata):
    """Normalize the date to use as a tuple of author date, committer date
    from the incoming metadata.

    Args:
        deposit (Deposit): Deposit model representation
        metadata (Dict): Metadata dict representation

    Returns:
        Tuple of author date, committer date. Those dates are
        swh normalized.

    """
    commit_date = metadata.get("codemeta:datePublished")
    author_date = metadata.get("codemeta:dateCreated")
    # Fall back on one another when only one is provided; when neither is,
    # use the deposit completion date for both.
    if not author_date and not commit_date:
        author_date = commit_date = deposit.complete_date
    elif not author_date:
        author_date = commit_date
    elif not commit_date:
        commit_date = author_date
    return (normalize_date(author_date), normalize_date(commit_date))
def metadata_read(self, deposit):
"""Read and aggregate multiple data on deposit into one unified data
dictionary.
Args:
deposit (Deposit): Deposit concerned by the data aggregation.
Returns:
Dictionary of data representing the deposit to inject in swh.
"""
metadata = self._metadata_get(deposit)
# Read information metadata
data = {"origin": {"type": "deposit", "url": deposit.origin_url,}}
# metadata provider
self.provider["provider_name"] = deposit.client.last_name
self.provider["provider_url"] = deposit.client.provider_url
author_date, commit_date = self._normalize_dates(deposit, metadata)
if deposit.parent:
swh_persistent_id = deposit.parent.swh_id
- persistent_identifier = identifiers.parse_persistent_identifier(
- swh_persistent_id
- )
- parent_revision = persistent_identifier.object_id
+ swhid = identifiers.parse_swhid(swh_persistent_id)
+ parent_revision = swhid.object_id
parents = [parent_revision]
else:
parents = []
data["origin_metadata"] = {
"provider": self.provider,
"tool": self.tool,
"metadata": metadata,
}
data["deposit"] = {
"id": deposit.id,
"client": deposit.client.username,
"collection": deposit.collection.name,
"author": SWH_PERSON,
"author_date": author_date,
"committer": SWH_PERSON,
"committer_date": commit_date,
"revision_parents": parents,
}
return data
- def process_get(self, request, collection_name, deposit_id):
+ def process_get(
+ self, request, collection_name: str, deposit_id: int
+ ) -> Tuple[int, Dict, str]:
deposit = Deposit.objects.get(pk=deposit_id)
data = self.metadata_read(deposit)
- d = {}
- if data:
- d = json.dumps(data)
-
- return status.HTTP_200_OK, d, "application/json"
+ return status.HTTP_200_OK, data if data else {}, "application/json"
diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py
index 67fa99f0..af6bcb6c 100644
--- a/swh/deposit/api/private/deposit_update_status.py
+++ b/swh/deposit/api/private/deposit_update_status.py
@@ -1,105 +1,107 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from rest_framework.parsers import JSONParser
+from typing import Dict
-from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT
+from rest_framework.parsers import JSONParser
-from . import SWHPrivateAPIView
-from ..common import SWHPutDepositAPI
-from ...errors import make_error_dict, BAD_REQUEST
-from ...models import Deposit, DEPOSIT_STATUS_DETAIL
-from ...models import DEPOSIT_STATUS_LOAD_SUCCESS
+from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid
+from . import APIPrivateView
+from ...errors import BAD_REQUEST, make_error_dict
+from ...models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS, Deposit
+from ..common import APIPut
MANDATORY_KEYS = ["origin_url", "revision_id", "directory_id", "snapshot_id"]
-class SWHUpdateStatusDeposit(SWHPrivateAPIView, SWHPutDepositAPI):
+class APIUpdateStatus(APIPrivateView, APIPut):
"""Deposit request class to update the deposit's status.
HTTP verbs supported: PUT
"""
parser_classes = (JSONParser,)
def additional_checks(self, request, headers, collection_name, deposit_id=None):
"""Enrich existing checks to the default ones.
New checks:
- Ensure the status is provided
- Ensure it exists
- no missing information on load success update
"""
data = request.data
status = data.get("status")
if not status:
msg = "The status key is mandatory with possible values %s" % list(
DEPOSIT_STATUS_DETAIL.keys()
)
return make_error_dict(BAD_REQUEST, msg)
if status not in DEPOSIT_STATUS_DETAIL:
msg = "Possible status in %s" % list(DEPOSIT_STATUS_DETAIL.keys())
return make_error_dict(BAD_REQUEST, msg)
if status == DEPOSIT_STATUS_LOAD_SUCCESS:
missing_keys = []
for key in MANDATORY_KEYS:
value = data.get(key)
if value is None:
missing_keys.append(key)
if missing_keys:
msg = (
f"Updating deposit status to {status}"
f" requires information {','.join(missing_keys)}"
)
return make_error_dict(BAD_REQUEST, msg)
return {}
- def process_put(self, request, headers, collection_name, deposit_id):
+ def process_put(
+ self, request, headers: Dict, collection_name: str, deposit_id: int
+ ) -> Dict:
"""Update the deposit with status and SWHIDs
Returns:
204 No content
400 Bad request if checks fail
"""
data = request.data
deposit = Deposit.objects.get(pk=deposit_id)
status = data["status"]
deposit.status = status
if status == DEPOSIT_STATUS_LOAD_SUCCESS:
origin_url = data["origin_url"]
directory_id = data["directory_id"]
revision_id = data["revision_id"]
- dir_id = persistent_identifier(DIRECTORY, directory_id)
- snp_id = persistent_identifier(SNAPSHOT, data["snapshot_id"])
- rev_id = persistent_identifier(REVISION, revision_id)
+ dir_id = swhid(DIRECTORY, directory_id)
+ snp_id = swhid(SNAPSHOT, data["snapshot_id"])
+ rev_id = swhid(REVISION, revision_id)
deposit.swh_id = dir_id
# new id with contextual information
- deposit.swh_id_context = persistent_identifier(
+ deposit.swh_id_context = swhid(
DIRECTORY,
directory_id,
metadata={
"origin": origin_url,
"visit": snp_id,
"anchor": rev_id,
"path": "/",
},
)
else: # rejected
deposit.status = status
deposit.save()
return {}
diff --git a/swh/deposit/api/private/urls.py b/swh/deposit/api/private/urls.py
index 19330bbf..e48290d6 100644
--- a/swh/deposit/api/private/urls.py
+++ b/swh/deposit/api/private/urls.py
@@ -1,79 +1,78 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.conf.urls import url
from ...config import (
- PRIVATE_GET_RAW_CONTENT,
- PRIVATE_PUT_DEPOSIT,
- PRIVATE_GET_DEPOSIT_METADATA,
PRIVATE_CHECK_DEPOSIT,
+ PRIVATE_GET_DEPOSIT_METADATA,
+ PRIVATE_GET_RAW_CONTENT,
PRIVATE_LIST_DEPOSITS,
+ PRIVATE_PUT_DEPOSIT,
)
-from .deposit_read import SWHDepositReadArchives
-from .deposit_read import SWHDepositReadMetadata
-from .deposit_update_status import SWHUpdateStatusDeposit
-from .deposit_check import SWHChecksDeposit
-from .deposit_list import DepositList
+from .deposit_check import APIChecks
+from .deposit_list import APIList
+from .deposit_read import APIReadArchives, APIReadMetadata
+from .deposit_update_status import APIUpdateStatus
urlpatterns = [
# Retrieve deposit's raw archives' content
# -> GET
url(
r"^(?P[^/]+)/(?P[^/]+)/raw/$",
- SWHDepositReadArchives.as_view(),
+ APIReadArchives.as_view(),
name=PRIVATE_GET_RAW_CONTENT,
),
# Update deposit's status
# -> PUT
url(
r"^(?P[^/]+)/(?P[^/]+)/update/$",
- SWHUpdateStatusDeposit.as_view(),
+ APIUpdateStatus.as_view(),
name=PRIVATE_PUT_DEPOSIT,
),
# Retrieve metadata information on a specific deposit
# -> GET
url(
r"^(?P[^/]+)/(?P[^/]+)/meta/$",
- SWHDepositReadMetadata.as_view(),
+ APIReadMetadata.as_view(),
name=PRIVATE_GET_DEPOSIT_METADATA,
),
# Check archive and metadata information on a specific deposit
# -> GET
url(
r"^(?P[^/]+)/(?P[^/]+)/check/$",
- SWHChecksDeposit.as_view(),
+ APIChecks.as_view(),
name=PRIVATE_CHECK_DEPOSIT,
),
# Retrieve deposit's raw archives' content
# -> GET
url(
r"^(?P[^/]+)/raw/$",
- SWHDepositReadArchives.as_view(),
+ APIReadArchives.as_view(),
name=PRIVATE_GET_RAW_CONTENT + "-nc",
),
# Update deposit's status
# -> PUT
url(
r"^(?P[^/]+)/update/$",
- SWHUpdateStatusDeposit.as_view(),
+ APIUpdateStatus.as_view(),
name=PRIVATE_PUT_DEPOSIT + "-nc",
),
# Retrieve metadata information on a specific deposit
# -> GET
url(
r"^(?P[^/]+)/meta/$",
- SWHDepositReadMetadata.as_view(),
+ APIReadMetadata.as_view(),
name=PRIVATE_GET_DEPOSIT_METADATA + "-nc",
),
# Check archive and metadata information on a specific deposit
# -> GET
url(
r"^(?P[^/]+)/check/$",
- SWHChecksDeposit.as_view(),
+ APIChecks.as_view(),
name=PRIVATE_CHECK_DEPOSIT + "-nc",
),
- url(r"^deposits/$", DepositList.as_view(), name=PRIVATE_LIST_DEPOSITS),
+ url(r"^deposits/$", APIList.as_view(), name=PRIVATE_LIST_DEPOSITS),
]
diff --git a/swh/deposit/api/service_document.py b/swh/deposit/api/service_document.py
index 6aa3899c..a36cb304 100644
--- a/swh/deposit/api/service_document.py
+++ b/swh/deposit/api/service_document.py
@@ -1,34 +1,33 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.shortcuts import render
from django.urls import reverse
-from .common import SWHBaseDeposit, ACCEPT_PACKAGINGS
-from .common import ACCEPT_ARCHIVE_CONTENT_TYPES
from ..config import COL_IRI
from ..models import DepositClient, DepositCollection
+from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, ACCEPT_PACKAGINGS, APIBase
-class SWHServiceDocument(SWHBaseDeposit):
+class APIServiceDocument(APIBase):
def get(self, req, *args, **kwargs):
    """Build and render the SWORD service document for the requesting user.

    Lists, for each collection the authenticated client may deposit into,
    its absolute COL-IRI, together with server upload limits and accepted
    content types.
    """
    client = DepositClient.objects.get(username=req.user)
    collections = {}
    for col_id in client.collections:
        collection = DepositCollection.objects.get(pk=col_id)
        collections[collection.name] = req.build_absolute_uri(
            reverse(COL_IRI, args=[collection.name])
        )
    context = {
        "max_upload_size": self.config["max_upload_size"],
        "accept_packagings": ACCEPT_PACKAGINGS,
        "accept_content_types": ACCEPT_ARCHIVE_CONTENT_TYPES,
        "collections": collections,
    }
    return render(
        req, "deposit/service_document.xml", context, content_type="application/xml"
    )
diff --git a/swh/deposit/api/urls.py b/swh/deposit/api/urls.py
index 236db5aa..e7a686af 100644
--- a/swh/deposit/api/urls.py
+++ b/swh/deposit/api/urls.py
@@ -1,70 +1,68 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""SWH's deposit api URL Configuration
"""
from django.conf.urls import url
from django.shortcuts import render
-from ..config import EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI
-from ..config import SD_IRI, COL_IRI, STATE_IRI
-from .deposit import SWHDeposit
-from .deposit_status import SWHDepositStatus
-from .deposit_update import SWHUpdateMetadataDeposit
-from .deposit_update import SWHUpdateArchiveDeposit
-from .deposit_content import SWHDepositContent
-from .service_document import SWHServiceDocument
+from ..config import COL_IRI, CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI, SD_IRI, STATE_IRI
+from .deposit import APIPostDeposit
+from .deposit_content import APIContent
+from .deposit_status import APIStatus
+from .deposit_update import APIUpdateArchive, APIUpdateMetadata
+from .service_document import APIServiceDocument
def api_view(req):
    """Render the "api.html" template as a simple overview page of the api."""
    return render(req, "api.html")
# PUBLIC API
urlpatterns = [
# simple view on the api
url(r"^$", api_view, name="api"),
# SD IRI - Service Document IRI
# -> GET
- url(r"^servicedocument/", SWHServiceDocument.as_view(), name=SD_IRI),
+ url(r"^servicedocument/", APIServiceDocument.as_view(), name=SD_IRI),
# Col IRI - Collection IRI
# -> POST
- url(r"^(?P[^/]+)/$", SWHDeposit.as_view(), name=COL_IRI),
+ url(r"^(?P[^/]+)/$", APIPostDeposit.as_view(), name=COL_IRI),
# EM IRI - Atom Edit Media IRI (update archive IRI)
# -> PUT (update-in-place existing archive)
# -> POST (add new archive)
url(
r"^(?P[^/]+)/(?P[^/]+)/media/$",
- SWHUpdateArchiveDeposit.as_view(),
+ APIUpdateArchive.as_view(),
name=EM_IRI,
),
# Edit IRI - Atom Entry Edit IRI (update metadata IRI)
# SE IRI - Sword Edit IRI ;; possibly same as Edit IRI
# -> PUT (update in place)
# -> POST (add new metadata)
url(
r"^(?P[^/]+)/(?P[^/]+)/metadata/$",
- SWHUpdateMetadataDeposit.as_view(),
+ APIUpdateMetadata.as_view(),
name=EDIT_SE_IRI,
),
# State IRI
# -> GET
url(
r"^(?P[^/]+)/(?P[^/]+)/status/$",
- SWHDepositStatus.as_view(),
+ APIStatus.as_view(),
name=STATE_IRI,
),
# Cont/File IRI
# -> GET
url(
r"^(?P[^/]+)/(?P[^/]+)/content/$",
- SWHDepositContent.as_view(),
+ APIContent.as_view(),
name=CONT_FILE_IRI,
), # specification is not clear about
# FILE-IRI, we assume it's the same as
# the CONT-IRI one
]
diff --git a/swh/deposit/cli/__init__.py b/swh/deposit/cli/__init__.py
index cc6e00dc..b13d0e5b 100644
--- a/swh/deposit/cli/__init__.py
+++ b/swh/deposit/cli/__init__.py
@@ -1,40 +1,43 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import click
import logging
-from swh.core.cli import CONTEXT_SETTINGS
+# WARNING: do not import unnecessary things here to keep cli startup time under
+# control
+import click
+
+from swh.core.cli import CONTEXT_SETTINGS, swh as swh_cli_group
logger = logging.getLogger(__name__)
-@click.group(context_settings=CONTEXT_SETTINGS)
+@swh_cli_group.group(context_settings=CONTEXT_SETTINGS)
@click.pass_context
def deposit(ctx):
    """Deposit main command
    """
    # propagate the log level chosen by the top-level cli group to this
    # module's logger (defaults to INFO when unset)
    ctx.ensure_object(dict)
    log_level = ctx.obj.get("log_level", logging.INFO)
    logger.setLevel(log_level)
def main():
    """Script entry point: configure logging then run the click group.

    Options can also be provided through SWH_DEPOSIT_* environment
    variables (click's auto_envvar mechanism).
    """
    logging.basicConfig()
    return deposit(auto_envvar_prefix="SWH_DEPOSIT")
# These import statements MUST be executed after defining the 'deposit' group
# since the subcommands in these are defined using this 'deposit' group.
from . import client # noqa
try:
from . import admin # noqa
except ImportError: # server part is optional
logger.debug("admin subcommand not loaded")
if __name__ == "__main__":
main()
diff --git a/swh/deposit/cli/admin.py b/swh/deposit/cli/admin.py
index 6b387940..a56581de 100644
--- a/swh/deposit/cli/admin.py
+++ b/swh/deposit/cli/admin.py
@@ -1,271 +1,275 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+# WARNING: do not import unnecessary things here to keep cli startup time under
+# control
import click
-from swh.deposit.config import setup_django_for
from swh.deposit.cli import deposit
@deposit.group("admin")
@click.option(
"--config-file",
"-C",
default=None,
type=click.Path(exists=True, dir_okay=False,),
help="Optional extra configuration file.",
)
@click.option(
"--platform",
default="development",
type=click.Choice(["development", "production"]),
help="development or production platform",
)
@click.pass_context
def admin(ctx, config_file, platform):
"""Server administration tasks (manipulate user or collections)"""
+ from swh.deposit.config import setup_django_for
+
# configuration happens here
setup_django_for(platform, config_file=config_file)
@admin.group("user")
@click.pass_context
def user(ctx):
    """Manipulate user."""
    # group node only; django setup already happened in the parent 'admin'
    # group callback
    pass
def _create_collection(name):
    """Create the collection with name if it does not exist.

    Args:
        name (str): collection's name

    Returns:
        collection (DepositCollection): the existing collection object
                                        (created or not)

    """
    # to avoid loading too early django namespaces
    from swh.deposit.models import DepositCollection

    try:
        collection = DepositCollection.objects.get(name=name)
    except DepositCollection.DoesNotExist:
        click.echo("Create new collection %s" % name)
        collection = DepositCollection.objects.create(name=name)
        click.echo("Collection %s created" % name)
    else:
        click.echo("Collection %s exists, nothing to do." % name)
    return collection
@user.command("create")
@click.option("--username", required=True, help="User's name")
@click.option("--password", required=True, help="Desired user's password (plain).")
@click.option("--firstname", default="", help="User's first name")
@click.option("--lastname", default="", help="User's last name")
@click.option("--email", default="", help="User's email")
@click.option("--collection", help="User's collection")
@click.option("--provider-url", default="", help="Provider URL")
@click.option("--domain", default="", help="The domain")
@click.pass_context
def user_create(
    ctx,
    username,
    password,
    firstname,
    lastname,
    email,
    collection,
    provider_url,
    domain,
):
    """Create a user with some needed information (password, collection)

    If the collection does not exist, the collection is then created
    alongside.

    The password is stored encrypted using django's utilities.

    """
    # to avoid loading too early django namespaces
    from swh.deposit.models import DepositClient

    # If collection is not provided, fallback to username
    if not collection:
        collection = username
    click.echo("collection: %s" % collection)
    # create the collection if it does not exist
    collection = _create_collection(collection)

    # user create/update: existing users get their password reset,
    # new ones are created with it (create_user hashes it)
    try:
        user = DepositClient.objects.get(username=username)
        click.echo("User %s exists, updating information." % user)
        user.set_password(password)
    except DepositClient.DoesNotExist:
        click.echo("Create new user %s" % username)
        user = DepositClient.objects.create_user(username=username, password=password)

    # in both branches, (re)apply the remaining profile fields, then persist
    user.collections = [collection.id]
    user.first_name = firstname
    user.last_name = lastname
    user.email = email
    user.is_active = True
    user.provider_url = provider_url
    user.domain = domain
    user.save()

    click.echo("Information registered for user %s" % user)
@user.command("list")
@click.pass_context
def user_list(ctx):
    """List existing users.

    This entrypoint is not paginated yet as there is not a lot of
    entry.

    """
    # to avoid loading too early django namespaces
    from swh.deposit.models import DepositClient

    usernames = [client.username for client in DepositClient.objects.all()]
    click.echo("\n".join(usernames) if usernames else "Empty user list")
@user.command("exists")
@click.argument("username", required=True)
@click.pass_context
def user_exists(ctx, username):
    """Check if user exists.
    """
    # to avoid loading too early django namespaces
    from swh.deposit.models import DepositClient

    # exit code doubles as the answer: 0 when the user exists, 1 otherwise
    try:
        DepositClient.objects.get(username=username)
        click.echo("User %s exists." % username)
        ctx.exit(0)
    except DepositClient.DoesNotExist:
        click.echo("User %s does not exist." % username)
        ctx.exit(1)
@admin.group("collection")
@click.pass_context
def collection(ctx):
    """Manipulate collections."""
    # group node only; subcommands do the actual work
    pass
@collection.command("create")
@click.option("--name", required=True, help="Collection's name")
@click.pass_context
def collection_create(ctx, name):
    # idempotent: _create_collection is a no-op when the collection exists
    _create_collection(name)
@collection.command("list")
@click.pass_context
def collection_list(ctx):
    """List existing collections.

    This entrypoint is not paginated yet as there is not a lot of
    entry.

    """
    # to avoid loading too early django namespaces
    from swh.deposit.models import DepositCollection

    names = [collection.name for collection in DepositCollection.objects.all()]
    click.echo("\n".join(names) if names else "Empty collection list")
@admin.group("deposit")
@click.pass_context
def adm_deposit(ctx):
    """Manipulate deposit."""
    # group node only; subcommands do the actual work
    pass
@adm_deposit.command("reschedule")
@click.option("--deposit-id", required=True, help="Deposit identifier")
@click.pass_context
def adm_deposit_reschedule(ctx, deposit_id):
"""Reschedule the deposit loading
This will:
- check the deposit's status to something reasonable (failed or done). That
means that the checks have passed alright but something went wrong during
the loading (failed: loading failed, done: loading ok, still for some
reasons as in bugs, we need to reschedule it)
- reset the deposit's status to 'verified' (prior to any loading but after
the checks which are fine) and removes the different archives'
identifiers (swh-id, ...)
- trigger back the loading task through the scheduler
"""
# to avoid loading too early django namespaces
from datetime import datetime
- from swh.deposit.models import Deposit
+
from swh.deposit.config import (
- DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_LOAD_FAILURE,
+ DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_VERIFIED,
- SWHDefaultConfig,
+ APIConfig,
)
+ from swh.deposit.models import Deposit
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
click.echo("Deposit %s does not exist." % deposit_id)
ctx.exit(1)
# Check the deposit is in a reasonable state
accepted_statuses = [DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE]
if deposit.status == DEPOSIT_STATUS_VERIFIED:
click.echo("Deposit %s's status already set for rescheduling." % (deposit_id))
ctx.exit(0)
if deposit.status not in accepted_statuses:
click.echo(
"Deposit %s's status be one of %s."
% (deposit_id, ", ".join(accepted_statuses))
)
ctx.exit(1)
task_id = deposit.load_task_id
if not task_id:
click.echo(
"Deposit %s cannot be rescheduled. It misses the "
"associated task." % deposit_id
)
ctx.exit(1)
# Reset the deposit's state
deposit.swh_id = None
deposit.swh_id_context = None
deposit.status = DEPOSIT_STATUS_VERIFIED
deposit.save()
# Trigger back the deposit
- scheduler = SWHDefaultConfig().scheduler
+ scheduler = APIConfig().scheduler
scheduler.set_status_tasks(
[task_id], status="next_run_not_scheduled", next_run=datetime.now()
)
diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py
index cf618307..1e68d8c4 100644
--- a/swh/deposit/cli/client.py
+++ b/swh/deposit/cli/client.py
@@ -1,497 +1,509 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import os
import logging
+
+# WARNING: do not import unnecessary things here to keep cli startup time under
+# control
+import os
import sys
-import tempfile
-import uuid
-import json
-import yaml
import click
-import xmltodict
-from swh.deposit.client import PublicApiDepositClient, MaintenanceError
from swh.deposit.cli import deposit
-
logger = logging.getLogger(__name__)
class InputError(ValueError):
    """Input script error

    Raised when the cli user provides an unacceptable combination of
    options/arguments.
    """

    pass
def generate_slug():
"""Generate a slug (sample purposes).
"""
+ import uuid
+
return str(uuid.uuid4())
def _url(url):
"""Force the /1 api version at the end of the url (avoiding confusing
issues without it).
Args:
url (str): api url used by cli users
Returns:
Top level api url to actually request
"""
if not url.endswith("/1"):
url = "%s/1" % url
return url
def generate_metadata_file(name, external_id, authors, temp_dir):
"""Generate a temporary metadata file with the minimum required metadata
This generates a xml file in a temporary location and returns the
path to that file.
This is up to the client of that function to clean up the
temporary file.
Args:
name (str): Software's name
external_id (str): External identifier (slug) or generated one
authors (List[str]): List of author names
Returns:
Filepath to the metadata generated file
"""
+ import xmltodict
+
path = os.path.join(temp_dir, "metadata.xml")
# generate a metadata file with the minimum required metadata
codemetadata = {
"entry": {
"@xmlns": "http://www.w3.org/2005/Atom",
"@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
"codemeta:name": name,
"codemeta:identifier": external_id,
"codemeta:author": [
{"codemeta:name": author_name} for author_name in authors
],
},
}
logging.debug("Temporary file: %s", path)
logging.debug("Metadata dict to generate as xml: %s", codemetadata)
s = xmltodict.unparse(codemetadata, pretty=True)
logging.debug("Metadata dict as xml generated: %s", s)
with open(path, "w") as fp:
fp.write(s)
return path
def _client(url, username, password):
"""Instantiate a client to access the deposit api server
Args:
url (str): Deposit api server
username (str): User
password (str): User's password
"""
+ from swh.deposit.client import PublicApiDepositClient
+
client = PublicApiDepositClient(
{"url": url, "auth": {"username": username, "password": password},}
)
return client
def _collection(client):
"""Retrieve the client's collection
"""
# retrieve user's collection
sd_content = client.service_document()
if "error" in sd_content:
raise InputError("Service document retrieval: %s" % (sd_content["error"],))
collection = sd_content["service"]["workspace"]["collection"]["sword:name"]
return collection
def client_command_parse_input(
username,
password,
archive,
metadata,
archive_deposit,
metadata_deposit,
collection,
slug,
partial,
deposit_id,
replace,
url,
name,
authors,
temp_dir,
):
"""Parse the client subcommand options and make sure the combination
is acceptable*. If not, an InputError exception is raised
explaining the issue.
By acceptable, we mean:
- A multipart deposit (create or update) requires:
- an existing software archive
- an existing metadata file or author(s) and name provided in
params
- A binary deposit (create/update) requires an existing software
archive
- A metadata deposit (create/update) requires an existing metadata
file or author(s) and name provided in params
- A deposit update requires a deposit_id
This will not prevent all failure cases though. The remaining
errors are already dealt with by the underlying api client.
Raises:
InputError explaining the user input related issue
MaintenanceError explaining the api status
Returns:
dict with the following keys:
'archive': the software archive to deposit
'username': username
'password': associated password
'metadata': the metadata file to deposit
'collection': the username's associated client
'slug': the slug or external id identifying the deposit to make
'partial': if the deposit is partial or not
'client': instantiated class
'url': deposit's server main entry point
'deposit_type': deposit's type (binary, multipart, metadata)
'deposit_id': optional deposit identifier
"""
if archive_deposit and metadata_deposit:
# too many flags use, remove redundant ones (-> multipart deposit)
archive_deposit = False
metadata_deposit = False
if not slug: # generate one as this is mandatory
slug = generate_slug()
if not metadata:
if name and authors:
metadata = generate_metadata_file(name, slug, authors, temp_dir)
elif not archive_deposit and not partial and not deposit_id:
# If we meet all the following conditions:
# * there is not an archive-only deposit
# * it is not part of a multipart deposit (either create/update
# or finish)
# * it misses either name or authors
raise InputError(
"Either a metadata file (--metadata) or both --author and "
"--name must be provided, unless this is an archive-only "
"deposit."
)
elif name or authors:
# If we are generating metadata, then all mandatory metadata
# must be present
raise InputError(
"Either a metadata file (--metadata) or both --author and "
"--name must be provided."
)
else:
# TODO: this is a multipart deposit, we might want to check that
# metadata are deposited at some point
pass
elif name or authors:
raise InputError(
"Using a metadata file (--metadata) is incompatible with "
"--author and --name, which are used to generate one."
)
if metadata_deposit:
archive = None
if archive_deposit:
metadata = None
if metadata_deposit and not metadata:
raise InputError(
"Metadata deposit must be provided for metadata "
"deposit (either a filepath or --name and --author)"
)
if not archive and not metadata and partial:
raise InputError(
- "Please provide an actionable command. See --help for more " "information"
+ "Please provide an actionable command. See --help for more information"
)
if replace and not deposit_id:
raise InputError("To update an existing deposit, you must provide its id")
client = _client(url, username, password)
if not collection:
collection = _collection(client)
return {
"archive": archive,
"username": username,
"password": password,
"metadata": metadata,
"collection": collection,
"slug": slug,
"in_progress": partial,
"client": client,
"url": url,
"deposit_id": deposit_id,
"replace": replace,
}
def _subdict(d, keys):
"return a dict from d with only given keys"
return {k: v for k, v in d.items() if k in keys}
def deposit_create(config, logger):
    """Create a new deposit by delegating to the deposit client.

    Only the keys relevant to creation are forwarded from ``config``.
    """
    logger.debug("Create deposit")
    wanted = ("collection", "archive", "metadata", "slug", "in_progress")
    return config["client"].deposit_create(**_subdict(config, wanted))
def deposit_update(config, logger):
    """Update an existing deposit by delegating to the deposit client.

    Only the keys relevant to an update are forwarded from ``config``.
    """
    logger.debug("Update deposit")
    wanted = (
        "collection",
        "deposit_id",
        "archive",
        "metadata",
        "slug",
        "in_progress",
        "replace",
    )
    return config["client"].deposit_update(**_subdict(config, wanted))
@deposit.command()
@click.option("--username", required=True, help="(Mandatory) User's name")
@click.option(
"--password", required=True, help="(Mandatory) User's associated password"
)
@click.option(
"--archive",
type=click.Path(exists=True),
help="(Optional) Software archive to deposit",
)
@click.option(
"--metadata",
type=click.Path(exists=True),
help=(
"(Optional) Path to xml metadata file. If not provided, "
"this will use a file named .metadata.xml"
),
) # noqa
@click.option(
"--archive-deposit/--no-archive-deposit",
default=False,
help="(Optional) Software archive only deposit",
)
@click.option(
"--metadata-deposit/--no-metadata-deposit",
default=False,
help="(Optional) Metadata only deposit",
)
@click.option(
"--collection",
help="(Optional) User's collection. If not provided, this will be fetched.",
) # noqa
@click.option(
"--slug",
help=(
"(Optional) External system information identifier. "
"If not provided, it will be generated"
),
) # noqa
@click.option(
"--partial/--no-partial",
default=False,
help=(
"(Optional) The deposit will be partial, other deposits "
"will have to take place to finalize it."
),
) # noqa
@click.option(
"--deposit-id",
default=None,
help="(Optional) Update an existing partial deposit with its identifier",
) # noqa
@click.option(
"--replace/--no-replace",
default=False,
help="(Optional) Update by replacing existing metadata to a deposit",
) # noqa
@click.option(
"--url",
default="https://deposit.softwareheritage.org",
help=(
"(Optional) Deposit server api endpoint. By default, "
"https://deposit.softwareheritage.org/1"
),
) # noqa
@click.option("--verbose/--no-verbose", default=False, help="Verbose mode")
@click.option("--name", help="Software name")
@click.option(
"--author",
multiple=True,
help="Software author(s), this can be repeated as many times"
" as there are authors",
)
@click.option(
"-f",
"--format",
"output_format",
default="logging",
type=click.Choice(["logging", "yaml", "json"]),
help="Output format results.",
)
@click.pass_context
def upload(
ctx,
username,
password,
archive=None,
metadata=None,
archive_deposit=False,
metadata_deposit=False,
collection=None,
slug=None,
partial=False,
deposit_id=None,
replace=False,
url="https://deposit.softwareheritage.org",
verbose=False,
name=None,
author=None,
output_format=None,
):
"""Software Heritage Public Deposit Client
Create/Update deposit through the command line.
More documentation can be found at
https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html.
"""
+ import tempfile
+
+ from swh.deposit.client import MaintenanceError
+
url = _url(url)
config = {}
with tempfile.TemporaryDirectory() as temp_dir:
try:
logger.debug("Parsing cli options")
config = client_command_parse_input(
username,
password,
archive,
metadata,
archive_deposit,
metadata_deposit,
collection,
slug,
partial,
deposit_id,
replace,
url,
name,
author,
temp_dir,
)
except InputError as e:
logger.error("Problem during parsing options: %s", e)
sys.exit(1)
except MaintenanceError as e:
logger.error(e)
sys.exit(1)
if verbose:
logger.info("Parsed configuration: %s" % (config,))
deposit_id = config["deposit_id"]
if deposit_id:
r = deposit_update(config, logger)
else:
r = deposit_create(config, logger)
print_result(r, output_format)
@deposit.command()
@click.option(
"--url",
default="https://deposit.softwareheritage.org",
help="(Optional) Deposit server api endpoint. By default, "
"https://deposit.softwareheritage.org/1",
)
@click.option("--username", required=True, help="(Mandatory) User's name")
@click.option(
"--password", required=True, help="(Mandatory) User's associated password"
)
@click.option("--deposit-id", default=None, required=True, help="Deposit identifier.")
@click.option(
"-f",
"--format",
"output_format",
default="logging",
type=click.Choice(["logging", "yaml", "json"]),
help="Output format results.",
)
@click.pass_context
def status(ctx, url, username, password, deposit_id, output_format):
"""Deposit's status
"""
+ from swh.deposit.client import MaintenanceError
+
url = _url(url)
logger.debug("Status deposit")
try:
client = _client(url, username, password)
collection = _collection(client)
except InputError as e:
logger.error("Problem during parsing options: %s", e)
sys.exit(1)
except MaintenanceError as e:
logger.error(e)
sys.exit(1)
print_result(
client.deposit_status(collection=collection, deposit_id=deposit_id),
output_format,
)
def print_result(data, output_format):
+ import json
+
+ import yaml
+
if output_format == "json":
click.echo(json.dumps(data))
elif output_format == "yaml":
click.echo(yaml.dump(data))
else:
logger.info(data)
diff --git a/swh/deposit/client.py b/swh/deposit/client.py
index 9418d03a..a27c166b 100644
--- a/swh/deposit/client.py
+++ b/swh/deposit/client.py
@@ -1,658 +1,651 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of defining an swh-deposit client
"""
+from abc import ABCMeta, abstractmethod
import hashlib
-import os
-import requests
-import xmltodict
import logging
-
-from abc import ABCMeta, abstractmethod
+import os
from typing import Any, Dict
from urllib.parse import urljoin
-from swh.core.config import SWHConfig
+import requests
+import xmltodict
+from swh.core.config import config_basepath, read_raw_config
logger = logging.getLogger(__name__)
class MaintenanceError(ValueError):
    """Informational exception raised when the deposit server reports an
    ongoing maintenance (http 503 with summary/detail)."""
def _parse(stream, encoding="utf-8"):
    """Parse an xml stream into a plain dict.

    Args:
        stream (bytes/text): The stream to parse
        encoding (str): The encoding to use if to decode the bytes stream

    Returns:
        A dict of values corresponding to the parsed xml

    """
    text = stream.decode(encoding) if isinstance(stream, bytes) else stream
    parsed = xmltodict.parse(text, encoding=encoding, process_namespaces=False)
    # Unwrap the known envelopes, in this order (an entry may itself
    # contain a sword:error payload).
    for wrapper in ("entry", "sword:error"):
        if wrapper in parsed:
            parsed = parsed[wrapper]
    return dict(parsed)
def _parse_with_filter(stream, encoding="utf-8", keys=()):
    """Parse an xml stream and keep only the requested keys.

    Args:
        stream (bytes/text): The stream to parse
        encoding (str): The encoding to use if to decode the bytes stream
        keys (Iterable[str]): Keys to filter the parsed result

    Returns:
        A dict mapping each requested key to its parsed value (None when
        the key is absent from the parsed document).

    """
    # An immutable tuple default replaces the previous mutable ``keys=[]``
    # default argument (shared across calls); behavior is unchanged since
    # the parameter is only iterated.
    data = _parse(stream, encoding=encoding)
    return {key: data.get(key) for key in keys}
-class BaseApiDepositClient(SWHConfig):
+class BaseApiDepositClient:
"""Deposit client base class
"""
- CONFIG_BASE_FILENAME = "deposit/client"
- DEFAULT_CONFIG = {
- "url": ("str", "http://localhost:5006"),
- "auth": ("dict", {}), # with optional 'username'/'password' keys
- }
-
def __init__(self, config=None, _client=requests):
- super().__init__()
if config is None:
- self.config = super().parse_config_file()
+ config_file = os.environ["SWH_CONFIG_FILENAME"]
+ self.config: Dict[str, Any] = read_raw_config(config_basepath(config_file))
else:
self.config = config
self._client = _client
self.base_url = self.config["url"].strip("/") + "/"
auth = self.config["auth"]
if auth == {}:
self.auth = None
else:
self.auth = (auth["username"], auth["password"])
def do(self, method, url, *args, **kwargs):
"""Internal method to deal with requests, possibly with basic http
authentication.
Args:
method (str): supported http methods as in self._methods' keys
Returns:
The request's execution
"""
if hasattr(self._client, method):
method_fn = getattr(self._client, method)
else:
raise ValueError("Development error, unsupported method %s" % (method))
if self.auth:
kwargs["auth"] = self.auth
full_url = urljoin(self.base_url, url.lstrip("/"))
return method_fn(full_url, *args, **kwargs)
class PrivateApiDepositClient(BaseApiDepositClient):
    """Private API deposit client to:

    - read a given deposit's archive(s)
    - read a given deposit's metadata
    - update a given deposit's status

    """

    def archive_get(self, archive_update_url, archive):
        """Download the deposit's raw archive content into a local file.

        Args:
            archive_update_url (str): The full deposit archive(s)'s raw
                content to retrieve locally
            archive (str): the local archive's path where to store the raw
                content

        Returns:
            The archive path to the local archive to load.

        Raises:
            ValueError: when the remote content cannot be retrieved.

        """
        response = self.do("get", archive_update_url, stream=True)
        if not response.ok:
            msg = "Problem when retrieving deposit archive at %s" % (
                archive_update_url,
            )
            logger.error(msg)
            raise ValueError(msg)
        with open(archive, "wb") as fh:
            for chunk in response.iter_content():
                fh.write(chunk)
        return archive

    def metadata_get(self, metadata_url):
        """Retrieve the metadata information on a given deposit.

        Args:
            metadata_url (str): The full deposit metadata url to retrieve
                locally

        Returns:
            The dictionary of metadata for that deposit.

        Raises:
            ValueError: when the metadata cannot be retrieved.

        """
        response = self.do("get", metadata_url)
        if not response.ok:
            msg = "Problem when retrieving metadata at %s" % metadata_url
            logger.error(msg)
            raise ValueError(msg)
        return response.json()

    def status_update(
        self,
        update_status_url,
        status,
        revision_id=None,
        directory_id=None,
        origin_url=None,
    ):
        """Update the deposit's status.

        Args:
            update_status_url (str): the full deposit's archive
            status (str): The status to update the deposit with
            revision_id (str/None): the revision's identifier to update to
            directory_id (str/None): the directory's identifier to update to
            origin_url (str/None): deposit's associated origin url

        """
        payload = {"status": status}
        optional = {
            "revision_id": revision_id,
            "directory_id": directory_id,
            "origin_url": origin_url,
        }
        # Only truthy optional fields are sent, as before.
        payload.update({key: value for key, value in optional.items() if value})
        self.do("put", update_status_url, json=payload)

    def check(self, check_url):
        """Check the deposit's associated data (metadata, archive(s))

        Args:
            check_url (str): the full deposit's check url

        Raises:
            ValueError: when the check query fails.

        """
        response = self.do("get", check_url)
        if not response.ok:
            msg = "Problem when checking deposit %s" % check_url
            logger.error(msg)
            raise ValueError(msg)
        return response.json()["status"]
class BaseDepositClient(BaseApiDepositClient, metaclass=ABCMeta):
    """Base Deposit client to access the public api.
    """

    def __init__(self, config, error_msg=None, empty_result=None):
        """Initialize the client.

        Args:
            config: configuration dict forwarded to BaseApiDepositClient
            error_msg: %-style template with two placeholders (url,
                exception) used to build failure messages
            empty_result: template dict of the result keys returned on
                failure

        """
        super().__init__(config)
        self.error_msg = error_msg
        # Copy the template: the previous ``empty_result={}`` mutable
        # default was shared across instances, and execute() mutated it.
        self.empty_result = {} if empty_result is None else dict(empty_result)

    @abstractmethod
    def compute_url(self, *args, **kwargs):
        """Compute api url endpoint to query."""
        pass

    @abstractmethod
    def compute_method(self, *args, **kwargs):
        """Http method to use on the url"""
        pass

    @abstractmethod
    def parse_result_ok(self, xml_content):
        """Given an xml result from the api endpoint, parse it and returns a
        dict.
        """
        pass

    def compute_information(self, *args, **kwargs):
        """Compute some more information given the inputs (e.g http headers,
        ...)
        """
        return {}

    def parse_result_error(self, xml_content):
        """Given an error response in xml, parse it into a dict.

        Returns:
            dict with following keys:

                'error': The error message
                'detail': Some more detail about the error if any

        """
        return _parse_with_filter(
            xml_content, keys=["summary", "detail", "sword:verboseDescription"]
        )

    def do_execute(self, method, url, info):
        """Execute the http query to url using method and info information.

        By default, execute a simple query to url with the http
        method. Override this in daughter class to improve the
        default behavior if needed.
        """
        return self.do(method, url)

    def execute(self, *args, **kwargs) -> Dict[str, Any]:
        """Main endpoint to prepare and execute the http query to the api.

        Raises:
            MaintenanceError if some api maintenance is happening.

        Returns:
            Dict of computed api data

        """
        url = self.compute_url(*args, **kwargs)
        method = self.compute_method(*args, **kwargs)
        info = self.compute_information(*args, **kwargs)

        try:
            r = self.do_execute(method, url, info)
        except Exception as e:
            msg = self.error_msg % (url, e)
            # Work on a copy: the previous code did
            # ``r = self.empty_result; r.update(...)`` which polluted the
            # shared template with 'error'/'status' keys across calls.
            result = dict(self.empty_result)
            result.update({"error": msg})
            return result
        else:
            if r.ok:
                if int(r.status_code) == 204:  # 204 returns no body
                    return {"status": r.status_code}
                else:
                    return self.parse_result_ok(r.text)
            else:
                error = self.parse_result_error(r.text)
                error.update(dict(self.empty_result))
                if r.status_code == 503:
                    summary = error.get("summary")
                    detail = error.get("sword:verboseDescription")
                    # Maintenance error
                    if summary and detail:
                        raise MaintenanceError(f"{summary}: {detail}")
                error.update({"status": r.status_code})
                return error
class ServiceDocumentDepositClient(BaseDepositClient):
    """Service Document information retrieval."""

    def __init__(self, config):
        blank = {"collection": None}
        super().__init__(
            config,
            error_msg="Service document failure at %s: %s",
            empty_result=blank,
        )

    def compute_url(self, *args, **kwargs):
        # The service document lives at a fixed endpoint.
        return "/servicedocument/"

    def compute_method(self, *args, **kwargs):
        return "get"

    def parse_result_ok(self, xml_content):
        """Parse the service document's success response."""
        return _parse(xml_content)
class StatusDepositClient(BaseDepositClient):
    """Status information on a deposit."""

    def __init__(self, config):
        blank_status = {
            "deposit_status": None,
            "deposit_status_detail": None,
            "deposit_swh_id": None,
        }
        super().__init__(
            config,
            error_msg="Status check failure at %s: %s",
            empty_result=blank_status,
        )

    def compute_url(self, collection, deposit_id):
        # State IRI for the given deposit.
        return "/%s/%s/status/" % (collection, deposit_id)

    def compute_method(self, *args, **kwargs):
        return "get"

    def parse_result_ok(self, xml_content):
        """Given an xml content as string, returns a deposit dict."""
        wanted = [
            "deposit_id",
            "deposit_status",
            "deposit_status_detail",
            "deposit_swh_id",
            "deposit_swh_id_context",
            "deposit_external_id",
        ]
        return _parse_with_filter(xml_content, keys=wanted)
class BaseCreateDepositClient(BaseDepositClient):
    """Deposit client base class to post new deposit.
    """

    def __init__(self, config):
        super().__init__(
            config,
            error_msg="Post Deposit failure at %s: %s",
            empty_result={"deposit_id": None, "deposit_status": None},
        )

    def compute_url(self, collection, *args, **kwargs):
        return "/%s/" % collection

    def compute_method(self, *args, **kwargs):
        return "post"

    def parse_result_ok(self, xml_content):
        """Given an xml content as string, returns a deposit dict.
        """
        return _parse_with_filter(
            xml_content,
            keys=[
                "deposit_id",
                "deposit_status",
                "deposit_status_detail",
                "deposit_date",
            ],
        )

    def _compute_information(
        self, collection, filepath, in_progress, slug, is_archive=True
    ):
        """Given a filepath, compute necessary information on that file.

        Args:
            filepath (str): Path to a file
            is_archive (bool): is it an archive or not?

        Returns:
            dict with keys:

                'content-type': content type associated
                'md5sum': md5 sum
                'filename': filename

        """
        filename = os.path.basename(filepath)
        if is_archive:
            # Use a context manager so the file descriptor is closed: the
            # previous code leaked the handle returned by open().
            with open(filepath, "rb") as f:
                md5sum = hashlib.md5(f.read()).hexdigest()
            extension = filename.split(".")[-1]
            if "zip" in extension:
                content_type = "application/zip"
            else:
                content_type = "application/x-tar"
        else:
            content_type = None
            md5sum = None

        return {
            "slug": slug,
            "in_progress": in_progress,
            "content-type": content_type,
            "md5sum": md5sum,
            "filename": filename,
            "filepath": filepath,
        }

    def compute_information(
        self, collection, filepath, in_progress, slug, is_archive=True, **kwargs
    ):
        # Base information plus the subclass-specific http headers.
        info = self._compute_information(
            collection, filepath, in_progress, slug, is_archive=is_archive
        )
        info["headers"] = self.compute_headers(info)
        return info

    def do_execute(self, method, url, info):
        with open(info["filepath"], "rb") as f:
            return self.do(method, url, data=f, headers=info["headers"])
class CreateArchiveDepositClient(BaseCreateDepositClient):
    """Post an archive (binary) deposit client."""

    def compute_headers(self, info):
        # Sword headers describing the uploaded binary.
        headers = {
            "SLUG": info["slug"],
            "CONTENT_MD5": info["md5sum"],
            "IN-PROGRESS": str(info["in_progress"]),
            "CONTENT-TYPE": info["content-type"],
        }
        headers["CONTENT-DISPOSITION"] = "attachment; filename=%s" % (
            info["filename"],
        )
        return headers
class UpdateArchiveDepositClient(CreateArchiveDepositClient):
    """Update (add/replace) an archive (binary) deposit client."""

    def compute_url(self, collection, *args, deposit_id=None, **kwargs):
        # Media IRI of the existing deposit.
        return "/%s/%s/media/" % (collection, deposit_id)

    def compute_method(self, *args, replace=False, **kwargs):
        # Replacing uses PUT, adding uses POST.
        if replace:
            return "put"
        return "post"
class CreateMetadataDepositClient(BaseCreateDepositClient):
    """Post a metadata deposit client."""

    def compute_headers(self, info):
        # Atom entry upload: no checksum or content-disposition needed.
        headers = {"SLUG": info["slug"]}
        headers["IN-PROGRESS"] = str(info["in_progress"])
        headers["CONTENT-TYPE"] = "application/atom+xml;type=entry"
        return headers
class UpdateMetadataDepositClient(CreateMetadataDepositClient):
    """Update (add/replace) a metadata deposit client."""

    def compute_url(self, collection, *args, deposit_id=None, **kwargs):
        # Metadata IRI of the existing deposit.
        return "/%s/%s/metadata/" % (collection, deposit_id)

    def compute_method(self, *args, replace=False, **kwargs):
        # Replacing uses PUT, adding uses POST.
        if replace:
            return "put"
        return "post"
class CreateMultipartDepositClient(BaseCreateDepositClient):
    """Create a multipart deposit client."""

    def _multipart_info(self, info, info_meta):
        # Build the (files, headers) pair for a multipart request: one part
        # for the archive, one for the atom metadata entry.
        # NOTE(review): the file handles opened here are never explicitly
        # closed; they must remain open until the request in do_execute has
        # been sent -- consider contextlib.ExitStack. TODO confirm lifetime.
        files = [
            (
                "file",
                (info["filename"], open(info["filepath"], "rb"), info["content-type"]),
            ),
            (
                "atom",
                (
                    info_meta["filename"],
                    open(info_meta["filepath"], "rb"),
                    "application/atom+xml",
                ),
            ),
        ]

        headers = {
            "SLUG": info["slug"],
            "CONTENT_MD5": info["md5sum"],
            "IN-PROGRESS": str(info["in_progress"]),
        }

        return files, headers

    def compute_information(
        self, collection, archive, metadata, in_progress, slug, **kwargs
    ):
        # The archive part carries checksum/content-type; the metadata part
        # is computed as a plain (non-archive) file.
        info = self._compute_information(collection, archive, in_progress, slug)
        info_meta = self._compute_information(
            collection, metadata, in_progress, slug, is_archive=False
        )
        files, headers = self._multipart_info(info, info_meta)
        return {"files": files, "headers": headers}

    def do_execute(self, method, url, info):
        # Send both parts in a single multipart request.
        return self.do(method, url, files=info["files"], headers=info["headers"])
class UpdateMultipartDepositClient(CreateMultipartDepositClient):
    """Update a multipart deposit client."""

    def compute_url(self, collection, *args, deposit_id=None, **kwargs):
        # Metadata IRI of the existing deposit.
        return "/%s/%s/metadata/" % (collection, deposit_id)

    def compute_method(self, *args, replace=False, **kwargs):
        # Replacing uses PUT, adding uses POST.
        if replace:
            return "put"
        return "post"
class PublicApiDepositClient(BaseApiDepositClient):
    """Public api deposit client."""

    def service_document(self):
        """Retrieve service document endpoint's information."""
        return ServiceDocumentDepositClient(self.config).execute()

    def deposit_status(self, collection, deposit_id):
        """Retrieve status information on a deposit."""
        return StatusDepositClient(self.config).execute(collection, deposit_id)

    def deposit_create(
        self, collection, slug, archive=None, metadata=None, in_progress=False
    ):
        """Create a new deposit (archive, metadata, both as multipart)."""
        # Dispatch on which payloads were provided.
        if archive and not metadata:
            return CreateArchiveDepositClient(self.config).execute(
                collection, archive, in_progress, slug
            )
        if metadata and not archive:
            return CreateMetadataDepositClient(self.config).execute(
                collection, metadata, in_progress, slug, is_archive=False
            )
        # Both (or neither): issue a multipart deposit.
        return CreateMultipartDepositClient(self.config).execute(
            collection, archive, metadata, in_progress, slug
        )

    def deposit_update(
        self,
        collection,
        deposit_id,
        slug,
        archive=None,
        metadata=None,
        in_progress=False,
        replace=False,
    ):
        """Update (add/replace) existing deposit (archive, metadata, both)."""
        # Only partial deposits may be updated; check first.
        status_result = self.deposit_status(collection, deposit_id)
        if "error" in status_result:
            return status_result
        status = status_result["deposit_status"]
        if status != "partial":
            return {
                "error": "You can only act on deposit with status 'partial'",
                "detail": "The deposit %s has status '%s'" % (deposit_id, status),
                "deposit_status": status,
                "deposit_id": deposit_id,
            }
        # Pick the update client matching the provided payloads.
        if archive and not metadata:
            client_cls = UpdateArchiveDepositClient
            payload = (collection, archive, in_progress, slug)
        elif metadata and not archive:
            client_cls = UpdateMetadataDepositClient
            payload = (collection, metadata, in_progress, slug)
        else:
            client_cls = UpdateMultipartDepositClient
            payload = (collection, archive, metadata, in_progress, slug)
        result = client_cls(self.config).execute(
            *payload, deposit_id=deposit_id, replace=replace
        )
        if "error" in result:
            return result
        # Re-read the deposit to report its post-update status.
        return self.deposit_status(collection, deposit_id)
diff --git a/swh/deposit/config.py b/swh/deposit/config.py
index 410370d0..16221dfd 100644
--- a/swh/deposit/config.py
+++ b/swh/deposit/config.py
@@ -1,110 +1,99 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
-import logging
+from typing import Any, Dict
-from typing import Any, Dict, Tuple
-
-from swh.core.config import SWHConfig
+from swh.core import config
from swh.scheduler import get_scheduler
+from swh.scheduler.interface import SchedulerInterface
# IRIs (Internationalized Resource identifier) sword 2.0 specified
EDIT_SE_IRI = "edit_se_iri"
EM_IRI = "em_iri"
CONT_FILE_IRI = "cont_file_iri"
SD_IRI = "servicedocument"
COL_IRI = "upload"
STATE_IRI = "state_iri"
PRIVATE_GET_RAW_CONTENT = "private-download"
PRIVATE_CHECK_DEPOSIT = "check-deposit"
PRIVATE_PUT_DEPOSIT = "private-update"
PRIVATE_GET_DEPOSIT_METADATA = "private-read"
PRIVATE_LIST_DEPOSITS = "private-deposit-list"
ARCHIVE_KEY = "archive"
METADATA_KEY = "metadata"
RAW_METADATA_KEY = "raw-metadata"
ARCHIVE_TYPE = "archive"
METADATA_TYPE = "metadata"
AUTHORIZED_PLATFORMS = ["development", "production", "testing"]
DEPOSIT_STATUS_REJECTED = "rejected"
DEPOSIT_STATUS_PARTIAL = "partial"
DEPOSIT_STATUS_DEPOSITED = "deposited"
DEPOSIT_STATUS_VERIFIED = "verified"
DEPOSIT_STATUS_LOAD_SUCCESS = "done"
DEPOSIT_STATUS_LOAD_FAILURE = "failed"
# Revision author for deposit
SWH_PERSON = {
"name": "Software Heritage",
"fullname": "Software Heritage",
"email": "robot@softwareheritage.org",
}
+DEFAULT_CONFIG = {
+ "max_upload_size": 209715200,
+ "checks": True,
+}
+
+
def setup_django_for(platform=None, config_file=None):
    """Setup function for command line tools (swh.deposit.create_user) to
    initialize the needed db access.

    Note:
        Do not import any django related module prior to this function
        call. Otherwise, this will raise an
        django.core.exceptions.ImproperlyConfigured error message.

    Args:
        platform (str): the platform the scheduling is running
        config_file (str): Extra configuration file (typically for the
            production platform)

    Raises:
        ValueError in case of wrong platform inputs.

    """
    if platform is not None:
        if platform not in AUTHORIZED_PLATFORMS:
            raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS)
        # Only set the settings module when the environment has not already
        # chosen one.
        os.environ.setdefault(
            "DJANGO_SETTINGS_MODULE", "swh.deposit.settings.%s" % platform
        )

    if config_file:
        os.environ.setdefault("SWH_CONFIG_FILENAME", config_file)

    # Imported late on purpose: django must only be loaded once the
    # environment variables above are in place.
    import django

    django.setup()
-class SWHDefaultConfig(SWHConfig):
- """Mixin intended to enrich views with SWH configuration.
+class APIConfig:
+ """API Configuration centralized class. This loads explicitly the configuration file out
+ of the SWH_CONFIG_FILENAME environment variable.
"""
- CONFIG_BASE_FILENAME = "deposit/server"
-
- DEFAULT_CONFIG = {
- "max_upload_size": ("int", 209715200),
- "checks": ("bool", True),
- "scheduler": (
- "dict",
- {"cls": "remote", "args": {"url": "http://localhost:5008/"}},
- ),
- }
-
- ADDITIONAL_CONFIG = {} # type: Dict[str, Tuple[str, Any]]
-
- def __init__(self, **config):
- super().__init__()
- self.config = self.parse_config_file(
- additional_configs=[self.ADDITIONAL_CONFIG]
- )
- self.config.update(config)
- self.log = logging.getLogger("swh.deposit")
- if self.config.get("scheduler"):
- self.scheduler = get_scheduler(**self.config["scheduler"])
+ def __init__(self):
+ config_file = os.environ["SWH_CONFIG_FILENAME"]
+ conf = config.read_raw_config(config.config_basepath(config_file))
+ self.config: Dict[str, Any] = config.merge_configs(DEFAULT_CONFIG, conf)
+ self.scheduler: SchedulerInterface = get_scheduler(**self.config["scheduler"])
diff --git a/swh/deposit/errors.py b/swh/deposit/errors.py
index 68cc346b..f41965dd 100644
--- a/swh/deposit/errors.py
+++ b/swh/deposit/errors.py
@@ -1,151 +1,150 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of providing the standard sword errors
"""
-from rest_framework import status
from django.shortcuts import render
-
+from rest_framework import status
# Keys identifying the sword error types; each maps to an entry in the
# ERRORS table (http status, error IRI and xml tag).
FORBIDDEN = "forbidden"
UNAUTHORIZED = "unauthorized"
NOT_FOUND = "unknown"
BAD_REQUEST = "bad-request"
ERROR_CONTENT = "error-content"
CHECKSUM_MISMATCH = "checksum-mismatch"
MEDIATION_NOT_ALLOWED = "mediation-not-allowed"
METHOD_NOT_ALLOWED = "method-not-allowed"
MAX_UPLOAD_SIZE_EXCEEDED = "max_upload_size_exceeded"
PARSING_ERROR = "parsing-error"
class ParserError(ValueError):
    """Specific parsing error detected when parsing the xml metadata
    input."""
# Mapping from error key to its sword 2.0 description: the http status to
# return, the error IRI and the xml tag rendered in the error document.
ERRORS = {
    FORBIDDEN: {
        "status": status.HTTP_403_FORBIDDEN,
        "iri": "http://purl.org/net/sword/error/ErrorForbidden",
        "tag": "sword:ErrorForbidden",
    },
    UNAUTHORIZED: {
        "status": status.HTTP_401_UNAUTHORIZED,
        "iri": "http://purl.org/net/sword/error/ErrorUnauthorized",
        "tag": "sword:ErrorUnauthorized",
    },
    NOT_FOUND: {
        "status": status.HTTP_404_NOT_FOUND,
        "iri": "http://purl.org/net/sword/error/ErrorNotFound",
        "tag": "sword:ErrorNotFound",
    },
    ERROR_CONTENT: {
        "status": status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
        "iri": "http://purl.org/net/sword/error/ErrorContent",
        "tag": "sword:ErrorContent",
    },
    CHECKSUM_MISMATCH: {
        "status": status.HTTP_412_PRECONDITION_FAILED,
        "iri": "http://purl.org/net/sword/error/ErrorChecksumMismatch",
        "tag": "sword:ErrorChecksumMismatch",
    },
    BAD_REQUEST: {
        "status": status.HTTP_400_BAD_REQUEST,
        "iri": "http://purl.org/net/sword/error/ErrorBadRequest",
        "tag": "sword:ErrorBadRequest",
    },
    # Parsing errors are reported as plain bad requests.
    PARSING_ERROR: {
        "status": status.HTTP_400_BAD_REQUEST,
        "iri": "http://purl.org/net/sword/error/ErrorBadRequest",
        "tag": "sword:ErrorBadRequest",
    },
    MEDIATION_NOT_ALLOWED: {
        "status": status.HTTP_412_PRECONDITION_FAILED,
        "iri": "http://purl.org/net/sword/error/MediationNotAllowed",
        "tag": "sword:MediationNotAllowed",
    },
    METHOD_NOT_ALLOWED: {
        "status": status.HTTP_405_METHOD_NOT_ALLOWED,
        "iri": "http://purl.org/net/sword/error/MethodNotAllowed",
        "tag": "sword:MethodNotAllowed",
    },
    MAX_UPLOAD_SIZE_EXCEEDED: {
        "status": status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
        "iri": "http://purl.org/net/sword/error/MaxUploadSizeExceeded",
        "tag": "sword:MaxUploadSizeExceeded",
    },
}
def make_error_dict(key, summary=None, verbose_description=None):
    """Utility function to factorize error message dictionary.

    Args:
        key (str): Error status key referenced in swh.deposit.errors module
        summary (str/None): Error message clarifying the status
        verbose_description (str/None): A more verbose
            description or work around a potential problem.

    Returns:
        Dictionary with key 'error' detailing the 'status' and
        associated 'message'

    """
    detail = {
        "key": key,
        "summary": summary,
        "verboseDescription": verbose_description,
    }
    return {"error": detail}
def make_error_response_from_dict(req, error):
    """Utility function to return an http response with error detail.

    Args:
        req (Request): original request
        error (dict): Error described as dict, typically generated
            from the make_error_dict function.

    Returns:
        HttpResponse with detailed error.

    """
    error_information = ERRORS[error["key"]]
    # Merge into a fresh dict instead of updating `error` in place: the
    # previous code mutated the caller's dict as a side effect, and passed
    # the mutated `error` as the template context.
    context = {**error, **error_information}
    return render(
        req,
        "deposit/error.xml",
        context=context,
        content_type="application/xml",
        status=error_information["status"],
    )
def make_error_response(req, key, summary=None, verbose_description=None):
    """Utility function to create an http response with detailed error.

    Args:
        req (Request): original request
        key (str): Error status key referenced in swh.deposit.errors module
        summary (str): Error message clarifying the status
        verbose_description (str / None): A more verbose
            description or work around a potential problem.

    Returns:
        HttpResponse rendering the detailed error.

    """
    payload = make_error_dict(key, summary, verbose_description)
    return make_error_response_from_dict(req, payload["error"])
diff --git a/swh/deposit/exception.py b/swh/deposit/exception.py
index cdd1f7d0..e0252e00 100644
--- a/swh/deposit/exception.py
+++ b/swh/deposit/exception.py
@@ -1,38 +1,37 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Dict, Optional
+from django.db.utils import OperationalError
+from django.http import HttpResponse
from rest_framework.exceptions import APIException
from rest_framework.views import exception_handler
-from django.http import HttpResponse
-
-from django.db.utils import OperationalError
def custom_exception_handler(
    exc: APIException, context: Dict
) -> Optional[HttpResponse]:
    """Custom deposit exception handler to ensure consistent xml output

    Args:
        exc: the exception raised while processing the request
        context: drf-provided context for the exception

    Returns:
        An HttpResponse, or None when drf's default handler could not
        build one.

    """
    # drf's default exception handler first, to get the standard error response
    response = exception_handler(exc, context)

    # Special-case database outages: report 503 (service unavailable)
    # instead of a generic 500.
    if isinstance(exc, OperationalError):
        status = "Database backend maintenance"
        detail = "Service temporarily unavailable, try again later."
        # NOTE(review): this template looks like it should carry sword error
        # xml markup around {status}/{detail} -- confirm against the actual
        # error document format served by the api.
        data = f"""
{status}{detail}
""".encode(
            "utf-8"
        )
        return HttpResponse(data, status=503, content_type="application/xml")

    return response
diff --git a/swh/deposit/loader/checker.py b/swh/deposit/loader/checker.py
index bb054529..5e239083 100644
--- a/swh/deposit/loader/checker.py
+++ b/swh/deposit/loader/checker.py
@@ -1,51 +1,42 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
+import os
+from typing import Any, Dict
-from typing import Mapping
-
-from swh.core.config import SWHConfig
-
+from swh.core import config
from swh.deposit.client import PrivateApiDepositClient
-
logger = logging.getLogger(__name__)
-class DepositChecker(SWHConfig):
+class DepositChecker:
"""Deposit checker implementation.
Trigger deposit's checks through the private api.
"""
- CONFIG_BASE_FILENAME = "deposit/checker"
-
- DEFAULT_CONFIG = {
- "deposit": ("dict", {"url": "http://localhost:5006/1/private/", "auth": {},})
- }
-
- def __init__(self, config=None):
- super().__init__()
- if config is None:
- self.config = self.parse_config_file()
- else:
- self.config = config
+ def __init__(self):
+ config_file = os.environ["SWH_CONFIG_FILENAME"]
+ self.config: Dict[str, Any] = config.read_raw_config(
+ config.config_basepath(config_file)
+ )
self.client = PrivateApiDepositClient(config=self.config["deposit"])
- def check(self, collection: str, deposit_id: str) -> Mapping[str, str]:
+ def check(self, collection: str, deposit_id: str) -> Dict[str, str]:
status = None
deposit_check_url = f"/{collection}/{deposit_id}/check/"
logger.debug("deposit-check-url: %s", deposit_check_url)
try:
r = self.client.check(deposit_check_url)
logger.debug("Check result: %s", r)
status = "eventful" if r == "verified" else "failed"
except Exception:
logger.exception("Failure during check on '%s'", deposit_check_url)
status = "failed"
logger.debug("Check status: %s", status)
return {"status": status}
diff --git a/swh/deposit/manage.py b/swh/deposit/manage.py
index 80fbcb86..eeb30601 100755
--- a/swh/deposit/manage.py
+++ b/swh/deposit/manage.py
@@ -1,53 +1,52 @@
#!/usr/bin/env python3
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import sys
from swh.core import config
-
DEFAULT_CONFIG = {
"port": ("int", 5006),
"host": ("str", "127.0.0.1"),
}
if __name__ == "__main__":
    settings_file = "development"
    # Robustness fix: invoking the script without any sub-command used to
    # crash with IndexError on sys.argv[1]; treat that as "no special case".
    command = sys.argv[1] if len(sys.argv) > 1 else None
    if command == "runserver":
        # override the default host:port for the 'runserver' task
        conf = config.load_named_config("deposit/server", default_conf=DEFAULT_CONFIG)
        extra_cmd = ["%s:%s" % (conf["host"], conf["port"])]
        cmd = sys.argv + extra_cmd
    elif command == "test":
        # override the default settings file to read in testing mode
        settings_file = "testing"
        cmd = sys.argv
    else:  # otherwise, do nothing
        cmd = sys.argv

    os.environ.setdefault(
        "DJANGO_SETTINGS_MODULE", "swh.deposit.settings.%s" % settings_file
    )

    try:
        from django.core.management import execute_from_command_line
    except ImportError:
        # The above import may fail for some other reason. Ensure that the
        # issue is really that Django is missing to avoid masking other
        # exceptions on Python 2.
        try:
            import django  # noqa
        except ImportError:
            raise ImportError(
                "Couldn't import Django. Are you sure it's installed and "
                "available on your PYTHONPATH environment variable? Did you "
                "forget to activate a virtual environment?"
            )
        raise
    execute_from_command_line(cmd)
diff --git a/swh/deposit/migrations/0002_depositrequest_archive.py b/swh/deposit/migrations/0002_depositrequest_archive.py
index 68e0b080..b8931667 100644
--- a/swh/deposit/migrations/0002_depositrequest_archive.py
+++ b/swh/deposit/migrations/0002_depositrequest_archive.py
@@ -1,23 +1,24 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.7 on 2017-10-05 10:36
from __future__ import unicode_literals
from django.db import migrations, models
+
import swh.deposit.models
class Migration(migrations.Migration):
    """Add the nullable 'archive' file field to DepositRequest.

    Uploaded files are stored under the path computed by
    ``swh.deposit.models.client_directory_path``.
    """

    dependencies = [
        ("deposit", "0001_initial"),
    ]

    operations = [
        migrations.AddField(
            model_name="depositrequest",
            name="archive",
            field=models.FileField(
                null=True, upload_to=swh.deposit.models.client_directory_path
            ),
        ),
    ]
diff --git a/swh/deposit/migrations/0018_migrate_swhids.py b/swh/deposit/migrations/0018_migrate_swhids.py
index ebac5f14..a2724bee 100644
--- a/swh/deposit/migrations/0018_migrate_swhids.py
+++ b/swh/deposit/migrations/0018_migrate_swhids.py
@@ -1,363 +1,342 @@
# -*- coding: utf-8 -*-
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
from __future__ import unicode_literals
-import os
import logging
+import os
+from typing import Any, Dict, Optional, Tuple
from django.db import migrations
-from typing import Any, Dict, Optional, Tuple
from swh.core import config
from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS
from swh.model.hashutil import hash_to_bytes, hash_to_hex
-from swh.model.identifiers import (
- parse_persistent_identifier,
- persistent_identifier,
- DIRECTORY,
- REVISION,
- SNAPSHOT,
-)
+from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, parse_swhid, swhid
from swh.storage import get_storage as get_storage_client
-
+from swh.storage.algos.snapshot import snapshot_id_get_from_revision
SWH_PROVIDER_URL = "https://www.softwareheritage.org"
logger = logging.getLogger(__name__)
swh_storage = None
def get_storage() -> Optional[Any]:
    """Instantiate (and cache) a storage client.

    Returns None outside of the production settings; otherwise reads the
    storage configuration from the file named by SWH_CONFIG_FILENAME and
    memoizes the resulting client in the module-level ``swh_storage``.
    """
    active_settings = os.environ.get("DJANGO_SETTINGS_MODULE")
    if active_settings != "swh.deposit.settings.production":  # Bypass for now
        return None

    global swh_storage
    if swh_storage:
        # Already instantiated on a previous call
        return swh_storage

    cfg_path = os.environ.get("SWH_CONFIG_FILENAME")
    if not cfg_path:
        raise ValueError(
            "Production: SWH_CONFIG_FILENAME must be set to the"
            " configuration file needed!"
        )
    if not os.path.exists(cfg_path):
        raise ValueError(
            "Production: configuration file %s does not exist!" % (cfg_path,)
        )

    cfg = config.load_named_config(cfg_path)
    if not cfg:
        raise ValueError("Production: configuration %s does not exist." % (cfg_path,))

    storage_cfg = cfg.get("storage")
    if not storage_cfg:
        raise ValueError(
            "Production: invalid configuration; missing 'storage' config entry."
        )

    swh_storage = get_storage_client(**storage_cfg)
    return swh_storage
-def get_snapshot(storage, origin: str, revision_id: str) -> Optional[str]:
- """Retrieve the snapshot targeting the revision_id for the given origin.
-
- """
- all_visits = storage.origin_visit_get(origin)
- for visit in all_visits:
- if not visit["snapshot"]:
- continue
- detail_snapshot = storage.snapshot_get(visit["snapshot"])
- if not detail_snapshot:
- continue
- for branch_name, branch in detail_snapshot["branches"].items():
- if branch["target_type"] == "revision":
- revision = branch["target"]
- if hash_to_hex(revision) == revision_id:
- # Found the snapshot
- return hash_to_hex(visit["snapshot"])
- return None
-
-
def migrate_deposit_swhid_context_not_null(apps, schema_editor):
    """Migrate deposit SWHIDs to the new format.

    Migrate deposit SWHIDs to the new format. Only deposit with status done and
    swh_id_context not null are concerned.

    """
    storage = get_storage()
    if not storage:
        logger.warning("Nothing to do")
        return None
    Deposit = apps.get_model("deposit", "Deposit")
    for deposit in Deposit.objects.filter(
        status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False
    ):
        obj_dir = parse_swhid(deposit.swh_id_context)
        assert obj_dir.object_type == DIRECTORY

        obj_rev = parse_swhid(deposit.swh_anchor_id)
        assert obj_rev.object_type == REVISION

        if set(obj_dir.metadata.keys()) != {"origin"}:
            # Assuming the migration is already done for that deposit
            logger.warning(
                "Deposit id %s: Migration already done, skipping", deposit.id
            )
            continue

        # Starting migration
        dir_id = obj_dir.object_id
        origin = obj_dir.metadata["origin"]

        check_origin = storage.origin_get([origin])[0]
        if not check_origin:
            logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin)
            continue

        rev_id = obj_rev.object_id
        # Find the snapshot targeting the revision
        snp_id = snapshot_id_get_from_revision(storage, origin, hash_to_bytes(rev_id))
        if snp_id is None:
            logger.warning(
                "Deposit id %s: Snapshot targeting revision %s not found!",
                deposit.id,
                rev_id,
            )
            continue

        # Reference the old values to do some checks later
        old_swh_id = deposit.swh_id
        old_swh_id_context = deposit.swh_id_context
        old_swh_anchor_id = deposit.swh_anchor_id
        old_swh_anchor_id_context = deposit.swh_anchor_id_context

        # Update
        deposit.swh_id_context = swhid(
            DIRECTORY,
            dir_id,
            metadata={
                "origin": origin,
                "visit": swhid(SNAPSHOT, snp_id.hex()),
                "anchor": swhid(REVISION, rev_id),
                "path": "/",
            },
        )

        # Ensure only deposit.swh_id_context changed.
        # Bug fix: the original called logging.debug("deposit.id: {deposit.id}")
        # which logs the literal braces (missing f-prefix); use the module
        # logger with lazy %-formatting instead, consistently with the rest.
        logger.debug("deposit.id: %s", deposit.id)

        logger.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id)
        assert old_swh_id == deposit.swh_id
        logger.debug(
            "deposit.swh_id_context: %s -> %s",
            old_swh_id_context,
            deposit.swh_id_context,
        )
        assert old_swh_id_context != deposit.swh_id_context
        logger.debug(
            "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id
        )
        assert old_swh_anchor_id == deposit.swh_anchor_id
        logger.debug(
            "deposit.swh_anchor_id_context: %s -> %s",
            old_swh_anchor_id_context,
            deposit.swh_anchor_id_context,
        )
        assert old_swh_anchor_id_context == deposit.swh_anchor_id_context

        # Commit
        deposit.save()
def resolve_origin(deposit_id: int, provider_url: str, external_id: str) -> str:
    """Resolve the origin from provider-url and external-id

    For some edge case, only the external_id is used as there is some old
    inconsistency from testing which exists.

    """
    # Hard-coded exceptions, keyed by (deposit id, external id)
    edge_case_origins: Dict[Tuple[int, str], str] = {
        (
            76,
            "hal-01588782",
        ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782",
        (
            87,
            "hal-01588927",
        ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927",
        (89, "hal-01588935"): "https://hal-preprod.archives-ouvertes.fr/hal-01588935",
        (
            88,
            "hal-01588928",
        ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588928",
        (
            90,
            "hal-01588942",
        ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942",
        (143, "hal-01592430"): "https://hal-preprod.archives-ouvertes.fr/hal-01592430",
        (
            75,
            "hal-01588781",
        ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781",
    }
    special = edge_case_origins.get((deposit_id, external_id))
    if special:
        return special

    # Simpler origin edge cases (mostly around the initial deposits),
    # keyed by (provider url, external id)
    simple_origins = {
        (
            SWH_PROVIDER_URL,
            "je-suis-gpl",
        ): "https://forge.softwareheritage.org/source/jesuisgpl/",
        (
            SWH_PROVIDER_URL,
            "external-id",
        ): "https://hal.archives-ouvertes.fr/external-id",
    }
    # General case: the origin is <provider-url>/<external-id>
    fallback = f"{provider_url.rstrip('/')}/{external_id}"
    return simple_origins.get((provider_url, external_id), fallback)
def migrate_deposit_swhid_context_null(apps, schema_editor):
    """Migrate deposit SWHIDs to the new format.

    Migrate deposit whose swh_id_context is not set (initial deposits not
    migrated at the time). Only deposit with status done and swh_id_context
    null are concerned.

    Note: Those deposits have their swh_id being the SWHIDs of the revision!
    So we can align them as well.

    """
    storage = get_storage()
    if not storage:
        logger.warning("Nothing to do")
        return None
    Deposit = apps.get_model("deposit", "Deposit")
    for deposit in Deposit.objects.filter(
        status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True
    ):
        obj_rev = parse_swhid(deposit.swh_id)
        if obj_rev.object_type == DIRECTORY:
            # Assuming the migration is already done for that deposit
            logger.warning(
                "Deposit id %s: Migration already done, skipping", deposit.id
            )
            continue

        # Ensuring Migration not done
        assert obj_rev.object_type == REVISION

        assert deposit.swh_id is not None
        assert deposit.swh_id_context is None
        assert deposit.swh_anchor_id is None
        assert deposit.swh_anchor_id_context is None

        rev_id = obj_rev.object_id
        rev_id_bytes = hash_to_bytes(rev_id)
        revision = storage.revision_get([rev_id_bytes])[0]
        if not revision:
            logger.warning("Deposit id %s: Revision %s not found!", deposit.id, rev_id)
            continue

        provider_url = deposit.client.provider_url
        external_id = deposit.external_id

        origin = resolve_origin(deposit.id, provider_url, external_id)
        check_origin = storage.origin_get([origin])[0]
        if not check_origin:
            logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin)
            continue

        dir_id = hash_to_hex(revision["directory"])

        # Reference the old values to do some checks later
        old_swh_id = deposit.swh_id
        old_swh_id_context = deposit.swh_id_context
        old_swh_anchor_id = deposit.swh_anchor_id
        old_swh_anchor_id_context = deposit.swh_anchor_id_context

        # retrieve the snapshot from the archive
        snp_id = snapshot_id_get_from_revision(storage, origin, rev_id_bytes)
        if snp_id is None:
            logger.warning(
                "Deposit id %s: Snapshot targeting revision %s not found!",
                deposit.id,
                rev_id,
            )
            continue

        # New SWHIDs ids
        deposit.swh_id = swhid(DIRECTORY, dir_id)
        deposit.swh_id_context = swhid(
            DIRECTORY,
            dir_id,
            metadata={
                "origin": origin,
                "visit": swhid(SNAPSHOT, snp_id.hex()),
                "anchor": swhid(REVISION, rev_id),
                "path": "/",
            },
        )
        # Realign the remaining deposit SWHIDs fields
        deposit.swh_anchor_id = swhid(REVISION, rev_id)
        deposit.swh_anchor_id_context = swhid(
            REVISION, rev_id, metadata={"origin": origin,}
        )

        # Ensure only deposit.swh_id_context changed.
        # Bug fix: the original called logging.debug("deposit.id: {deposit.id}")
        # which logs the literal braces (missing f-prefix); use the module
        # logger with lazy %-formatting instead, consistently with the rest.
        logger.debug("deposit.id: %s", deposit.id)

        logger.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id)
        assert old_swh_id != deposit.swh_id
        logger.debug(
            "deposit.swh_id_context: %s -> %s",
            old_swh_id_context,
            deposit.swh_id_context,
        )
        assert old_swh_id_context != deposit.swh_id_context
        assert deposit.swh_id_context is not None
        logger.debug(
            "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id
        )
        # The anchor is the revision SWHID the deposit's swh_id used to hold
        assert deposit.swh_anchor_id == old_swh_id
        assert deposit.swh_anchor_id is not None
        logger.debug(
            "deposit.swh_anchor_id_context: %s -> %s",
            old_swh_anchor_id_context,
            deposit.swh_anchor_id_context,
        )
        assert deposit.swh_anchor_id_context is not None

        deposit.save()
class Migration(migrations.Migration):
    """Data migration realigning deposit SWHID fields via the two
    ``migrate_deposit_swhid_context_*`` functions above.
    """

    dependencies = [
        ("deposit", "0017_auto_20190925_0906"),
    ]

    operations = [
        # Migrate and make the operations possibly reversible
        # https://docs.djangoproject.com/en/3.0/ref/migration-operations/#django.db.migrations.operations.RunPython.noop # noqa
        migrations.RunPython(
            migrate_deposit_swhid_context_not_null,
            reverse_code=migrations.RunPython.noop,
        ),
        migrations.RunPython(
            migrate_deposit_swhid_context_null, reverse_code=migrations.RunPython.noop
        ),
    ]
diff --git a/swh/deposit/models.py b/swh/deposit/models.py
index 04e86e6c..03b015e6 100644
--- a/swh/deposit/models.py
+++ b/swh/deposit/models.py
@@ -1,240 +1,240 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# Generated from:
# cd swh_deposit && \
# python3 -m manage inspectdb
import datetime
-from django.contrib.postgres.fields import JSONField, ArrayField
from django.contrib.auth.models import User, UserManager
+from django.contrib.postgres.fields import ArrayField, JSONField
from django.db import models
from django.utils.timezone import now
from .config import (
- DEPOSIT_STATUS_VERIFIED,
+ ARCHIVE_TYPE,
DEPOSIT_STATUS_DEPOSITED,
- DEPOSIT_STATUS_PARTIAL,
- DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_LOAD_FAILURE,
+ DEPOSIT_STATUS_LOAD_SUCCESS,
+ DEPOSIT_STATUS_PARTIAL,
DEPOSIT_STATUS_REJECTED,
- ARCHIVE_TYPE,
+ DEPOSIT_STATUS_VERIFIED,
METADATA_TYPE,
)
class Dbversion(models.Model):
    """Db version

    """

    version = models.IntegerField(primary_key=True)
    release = models.DateTimeField(default=now, null=True)
    description = models.TextField(blank=True, null=True)

    class Meta:
        db_table = "dbversion"

    def __str__(self):
        # Human-readable dump of all fields, mainly for debugging/logging
        return str(
            {
                "version": self.version,
                "release": self.release,
                "description": self.description,
            }
        )
"""Possible status"""
DEPOSIT_STATUS = [
(DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_PARTIAL),
("expired", "expired"),
(DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED),
(DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED),
(DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED),
("loading", "loading"),
(DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS),
(DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE),
]
"""Possible status and the detailed meaning."""
DEPOSIT_STATUS_DETAIL = {
DEPOSIT_STATUS_PARTIAL: "Deposit is partially received. To finalize it, "
"In-Progress header should be false",
"expired": "Deposit has been there too long and is now "
"deemed ready to be garbage collected",
DEPOSIT_STATUS_DEPOSITED: "Deposit is ready for additional checks "
"(tarball ok, metadata, etc...)",
DEPOSIT_STATUS_VERIFIED: "Deposit is fully received, checked, and "
"ready for loading",
DEPOSIT_STATUS_REJECTED: "Deposit failed the checks",
"loading": "Loading is ongoing on swh's side",
DEPOSIT_STATUS_LOAD_SUCCESS: "The deposit has been successfully "
"loaded into the Software Heritage archive",
DEPOSIT_STATUS_LOAD_FAILURE: "The deposit loading into the "
"Software Heritage archive failed",
}
class DepositClient(User):
    """Deposit client

    """

    # Integer ids — presumably DepositCollection ids the client may use;
    # TODO confirm against the authorization checks
    collections = ArrayField(models.IntegerField(), null=True)
    objects = UserManager()  # type: ignore
    # this typing hint is due to a mypy/django-stubs limitation,
    # see https://github.com/typeddjango/django-stubs/issues/174
    provider_url = models.TextField(null=False)
    domain = models.TextField(null=False)

    class Meta:
        db_table = "deposit_client"

    def __str__(self):
        # Human-readable dump of the client, mainly for debugging/logging
        return str(
            {
                "id": self.id,
                "collections": self.collections,
                "username": super().username,
                "domain": self.domain,
                "provider_url": self.provider_url,
            }
        )
class Deposit(models.Model):
    """Deposit reception table

    """

    id = models.BigAutoField(primary_key=True)

    # First deposit reception date
    reception_date = models.DateTimeField(auto_now_add=True)
    # Date when the deposit is deemed complete and ready for loading
    complete_date = models.DateTimeField(null=True)
    # collection concerned by the deposit
    collection = models.ForeignKey("DepositCollection", models.DO_NOTHING)
    # Deposit's external identifier
    external_id = models.TextField()
    # Deposit client
    client = models.ForeignKey("DepositClient", models.DO_NOTHING)
    # SWH's loading result identifier
    swh_id = models.TextField(blank=True, null=True)
    swh_id_context = models.TextField(blank=True, null=True)
    # Deposit's status regarding loading
    status = models.TextField(choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL)
    status_detail = JSONField(null=True)
    # deposit can have one parent
    parent = models.ForeignKey("self", on_delete=models.PROTECT, null=True)
    check_task_id = models.TextField(
        blank=True, null=True, verbose_name="Scheduler's associated checking task id"
    )
    load_task_id = models.TextField(
        blank=True, null=True, verbose_name="Scheduler's associated loading task id"
    )

    class Meta:
        db_table = "deposit"

    def __str__(self):
        d = {
            "id": self.id,
            "reception_date": self.reception_date,
            "collection": self.collection.name,
            "external_id": self.external_id,
            "client": self.client.username,
            "status": self.status,
        }

        # Bug fix: the original used `self.status in (DEPOSIT_STATUS_REJECTED)`.
        # Parentheses around a single string do not build a tuple, so `in`
        # performed a substring test on the status string; equality is what
        # was intended.
        if self.status == DEPOSIT_STATUS_REJECTED:
            d["status_detail"] = self.status_detail

        return str(d)

    @property
    def origin_url(self):
        # Origin URL derives from the client's provider url and the deposit's
        # external identifier
        return "%s/%s" % (self.client.provider_url.rstrip("/"), self.external_id)
def client_directory_path(instance: "DepositRequest", filename: str) -> str:
    """Callable to determine the upload archive path. This defaults to
    MEDIA_ROOT/client_<user_id>/%Y%m%d-%H%M%S.%f/<filename>.

    The format "%Y%m%d-%H%M%S.%f" is the reception date of the associated
    deposit formatted using strftime.

    Args:
        instance: DepositRequest concerned by the upload
        filename: Filename of the uploaded file

    Returns:
        The upload archive path.

    """
    reception_date = instance.deposit.reception_date
    assert isinstance(reception_date, datetime.datetime)
    folder = reception_date.strftime("%Y%m%d-%H%M%S.%f")
    # Bug fix: the `filename` parameter was ignored and the literal string
    # "(unknown)" was appended to the path; store the uploaded file under its
    # actual name as the docstring describes.
    return f"client_{instance.deposit.client.id}/{folder}/{filename}"
REQUEST_TYPES = [(ARCHIVE_TYPE, ARCHIVE_TYPE), (METADATA_TYPE, METADATA_TYPE)]
class DepositRequest(models.Model):
    """Deposit request associated to one deposit.

    """

    id = models.BigAutoField(primary_key=True)
    # Deposit concerned by the request
    deposit = models.ForeignKey(Deposit, models.DO_NOTHING)
    date = models.DateTimeField(auto_now_add=True)
    # Deposit request information on the data to inject
    # this can be null when type is 'archive'
    metadata = JSONField(null=True)
    raw_metadata = models.TextField(null=True)
    # this can be null when type is 'metadata'
    archive = models.FileField(null=True, upload_to=client_directory_path)
    type = models.CharField(max_length=8, choices=REQUEST_TYPES, null=True)

    class Meta:
        db_table = "deposit_request"

    def __str__(self):
        # Serialize the metadata (if any) for a readable debugging dump
        meta = None
        if self.metadata:
            from json import dumps

            meta = dumps(self.metadata)

        archive_name = None
        if self.archive:
            archive_name = self.archive.name

        return str(
            {
                "id": self.id,
                "deposit": self.deposit,
                "metadata": meta,
                "archive": archive_name,
            }
        )
class DepositCollection(models.Model):
    """Collection deposits are submitted into.

    """

    id = models.BigAutoField(primary_key=True)
    # Human readable name for the collection type e.g HAL, arXiv, etc...
    name = models.TextField()

    class Meta:
        db_table = "deposit_collection"

    def __str__(self):
        return str({"id": self.id, "name": self.name})
diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py
index 0cb49065..9f52a3af 100644
--- a/swh/deposit/parsers.py
+++ b/swh/deposit/parsers.py
@@ -1,96 +1,94 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of defining parsers with SWORD 2.0 supported mediatypes.
"""
-import xmltodict
+from xml.parsers.expat import ExpatError
from django.conf import settings
-from rest_framework.parsers import BaseParser
-from rest_framework.parsers import FileUploadParser
-from rest_framework.parsers import MultiPartParser
-from xml.parsers.expat import ExpatError
+from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser
+import xmltodict
from swh.deposit.errors import ParserError
class SWHFileUploadZipParser(FileUploadParser):
    """File upload parser limited to zip archive.

    """

    # SWORD mediatype accepted for zip uploads
    media_type = "application/zip"
class SWHFileUploadTarParser(FileUploadParser):
    """File upload parser limited to tarball (tar, tar.gz, tar.*) archives.

    """

    # SWORD mediatype accepted for tarball uploads
    media_type = "application/x-tar"
class SWHXMLParser(BaseParser):
    """
    XML parser.
    """

    media_type = "application/xml"

    def parse(self, stream, media_type=None, parser_context=None):
        """
        Parses the incoming bytestream as XML and returns the resulting data.
        """
        parser_context = parser_context or {}
        encoding = parser_context.get("encoding", settings.DEFAULT_CHARSET)
        data = xmltodict.parse(stream, encoding=encoding, process_namespaces=False)
        # Unwrap a top-level <entry> element, if present, so callers get the
        # entry's content directly
        if "entry" in data:
            data = data["entry"]
        return data
class SWHAtomEntryParser(SWHXMLParser):
    """Atom entry parser limited to specific mediatype

    """

    media_type = "application/atom+xml;type=entry"

    def parse(self, stream, media_type=None, parser_context=None):
        # We do not actually want to parse the stream yet
        # because we want to keep the raw data as well
        # this is done later in the atom entry call
        # (cf. swh.deposit.api.common.APIBase._atom_entry)
        return stream
class SWHMultiPartParser(MultiPartParser):
    """Multipart parser limited to a subset of mediatypes.

    """

    media_type = "multipart/*; *"
def parse_xml(raw_content):
    """Parse xml body.

    Args:
        raw_content (bytes): The content to parse

    Raises:
        ParserError in case of a malformed xml

    Returns:
        content parsed as dict.

    """
    parser = SWHXMLParser()
    try:
        return parser.parse(raw_content)
    except ExpatError as exc:
        # Surface malformed xml as the deposit's own error type
        raise ParserError(str(exc))
diff --git a/swh/deposit/settings/production.py b/swh/deposit/settings/production.py
index 5cc7c8b1..e1e6ac4d 100644
--- a/swh/deposit/settings/production.py
+++ b/swh/deposit/settings/production.py
@@ -1,110 +1,111 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
+from swh.core import config
+
from .common import * # noqa
from .common import ALLOWED_HOSTS
-from swh.core import config
ALLOWED_HOSTS += ["deposit.softwareheritage.org"]
# Setup support for proxy headers
USE_X_FORWARDED_HOST = True
SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")

DEBUG = False

# Database
# https://docs.djangoproject.com/en/1.10/ref/settings/#databases
# https://docs.djangoproject.com/en/1.10/ref/settings/#std:setting-DATABASES
# https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/#databases

# Retrieve the deposit's configuration file
# and check the required setup is ok
# If not raise an error explaining the errors
config_file = os.environ.get("SWH_CONFIG_FILENAME")
if not config_file:
    raise ValueError(
        "Production: SWH_CONFIG_FILENAME must be set to the"
        " configuration file needed!"
    )

if not os.path.exists(config_file):
    raise ValueError(
        "Production: configuration file %s does not exist!" % (config_file,)
    )

conf = config.load_named_config(config_file)
if not conf:
    raise ValueError("Production: configuration %s does not exist." % (config_file,))

# Fail fast at import time if mandatory config sections are missing
for key in ("scheduler", "private"):
    if not conf.get(key):
        raise ValueError(
            "Production: invalid configuration; missing %s config entry." % (key,)
        )

ALLOWED_HOSTS += conf.get("allowed_hosts", [])

private_conf = conf["private"]
SECRET_KEY = private_conf["secret_key"]

# https://docs.djangoproject.com/en/1.10/ref/settings/#logging
LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "standard": {
            "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s",  # noqa
            "datefmt": "%d/%b/%Y %H:%M:%S",
        },
    },
    "handlers": {
        "console": {
            "level": "INFO",
            "class": "logging.StreamHandler",
            "formatter": "standard",
        },
    },
    "loggers": {
        "django": {"handlers": ["console"], "level": "INFO", "propagate": True,},
    },
}

# database
db_conf = private_conf.get("db", {"name": "unset"})

db = {
    "ENGINE": "django.db.backends.postgresql",
    "NAME": db_conf["name"],
}

# Optional connection parameters are only set when configured
db_user = db_conf.get("user")
if db_user:
    db["USER"] = db_user

db_pass = db_conf.get("password")
if db_pass:
    db["PASSWORD"] = db_pass

db_host = db_conf.get("host")
if db_host:
    db["HOST"] = db_host

db_port = db_conf.get("port")
if db_port:
    db["PORT"] = db_port

# https://docs.djangoproject.com/en/1.10/ref/settings/#databases
DATABASES = {
    "default": db,
}

# Upload user directory
# https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-MEDIA_ROOT
MEDIA_ROOT = private_conf.get("media_root")
diff --git a/swh/deposit/tests/api/conftest.py b/swh/deposit/tests/api/conftest.py
index 1f5f779a..17e29af7 100644
--- a/swh/deposit/tests/api/conftest.py
+++ b/swh/deposit/tests/api/conftest.py
@@ -1,87 +1,93 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
-import pytest
+import os
from django.urls import reverse
+import pytest
+from swh.deposit.api.private.deposit_check import APIChecks
from swh.deposit.config import (
- DEPOSIT_STATUS_DEPOSITED,
COL_IRI,
+ DEPOSIT_STATUS_DEPOSITED,
DEPOSIT_STATUS_VERIFIED,
)
from swh.deposit.models import Deposit
from swh.deposit.parsers import parse_xml
-from swh.deposit.api.private.deposit_check import SWHChecksDeposit
+
@pytest.fixture
def datadir(request):
    """Override default datadir to target main test datadir"""
    # Tests in this package share the fixture files under ../data instead of a
    # per-module data directory
    return os.path.join(os.path.dirname(str(request.fspath)), "../data")
@pytest.fixture
def ready_deposit_ok(partial_deposit_with_metadata):
    """Returns a deposit ready for checks (it will pass the checks).

    """
    deposit = partial_deposit_with_metadata
    # Force the 'deposited' status so the checks can pick it up
    deposit.status = DEPOSIT_STATUS_DEPOSITED
    deposit.save()
    return deposit
@pytest.fixture
def ready_deposit_verified(partial_deposit_with_metadata):
    """Returns a deposit already in 'verified' status (checks passed).

    """
    deposit = partial_deposit_with_metadata
    deposit.status = DEPOSIT_STATUS_VERIFIED
    deposit.save()
    return deposit
@pytest.fixture
def ready_deposit_only_metadata(partial_deposit_only_metadata):
    """Deposit in status ready that will fail the checks (because missing
    archive).

    """
    deposit = partial_deposit_only_metadata
    deposit.status = DEPOSIT_STATUS_DEPOSITED
    deposit.save()
    return deposit
@pytest.fixture
def ready_deposit_invalid_archive(authenticated_client, deposit_collection):
    """Deposit whose uploaded payload is declared as zip but is not a zip
    file; it is then forced into 'deposited' status so checks will run (and
    fail) on it.
    """
    url = reverse(COL_IRI, args=[deposit_collection.name])
    data = b"some data which is clearly not a zip file"
    md5sum = hashlib.md5(data).hexdigest()

    # when
    response = authenticated_client.post(
        url,
        content_type="application/zip",  # as zip
        data=data,
        # + headers
        CONTENT_LENGTH=len(data),
        # other headers needs HTTP_ prefix to be taken into account
        HTTP_SLUG="external-id-invalid",
        HTTP_CONTENT_MD5=md5sum,
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )

    response_content = parse_xml(response.content)
    deposit_id = int(response_content["deposit_id"])
    deposit = Deposit.objects.get(pk=deposit_id)
    deposit.status = DEPOSIT_STATUS_DEPOSITED
    deposit.save()
    return deposit
@pytest.fixture
def swh_checks_deposit():
    # Instance of the private deposit-checks API endpoint under test
    return APIChecks()
diff --git a/swh/deposit/tests/api/test_deposit.py b/swh/deposit/tests/api/test_deposit.py
index 2e6cce7b..00c38d91 100644
--- a/swh/deposit/tests/api/test_deposit.py
+++ b/swh/deposit/tests/api/test_deposit.py
@@ -1,195 +1,194 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
+from io import BytesIO
from django.urls import reverse
-from io import BytesIO
from rest_framework import status
from swh.deposit.config import (
COL_IRI,
- EDIT_SE_IRI,
- DEPOSIT_STATUS_REJECTED,
- DEPOSIT_STATUS_PARTIAL,
- DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_LOAD_FAILURE,
+ DEPOSIT_STATUS_LOAD_SUCCESS,
+ DEPOSIT_STATUS_PARTIAL,
+ DEPOSIT_STATUS_REJECTED,
+ EDIT_SE_IRI,
)
-
from swh.deposit.models import Deposit
from swh.deposit.parsers import parse_xml
def test_deposit_post_will_fail_with_401(client):
    """Without authentication, endpoint refuses access with 401 response

    """
    url = reverse(COL_IRI, args=["hal"])

    # when: posting with the unauthenticated django test client
    response = client.post(url)

    # then
    assert response.status_code == status.HTTP_401_UNAUTHORIZED
def test_access_to_another_user_collection_is_forbidden(
    authenticated_client, deposit_another_collection, deposit_user
):
    """Access to another user collection should return a 403

    """
    coll2 = deposit_another_collection
    url = reverse(COL_IRI, args=[coll2.name])
    response = authenticated_client.post(url)
    assert response.status_code == status.HTTP_403_FORBIDDEN
    # The error body names both the client and the forbidden collection
    msg = "Client %s cannot access collection %s" % (deposit_user.username, coll2.name,)
    assert msg in response.content.decode("utf-8")
def test_delete_on_col_iri_not_supported(authenticated_client, deposit_collection):
    """Delete on col iri should return a 405 response

    """
    url = reverse(COL_IRI, args=[deposit_collection.name])
    response = authenticated_client.delete(url)
    assert response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED
    assert "DELETE method is not supported on this endpoint" in response.content.decode(
        "utf-8"
    )
def create_deposit_with_rejection_status(authenticated_client, deposit_collection):
    """Helper: deposit a payload declared as zip but not actually a zip file,
    and check the resulting deposit ends up in 'rejected' status.
    """
    url = reverse(COL_IRI, args=[deposit_collection.name])

    data = b"some data which is clearly not a zip file"
    md5sum = hashlib.md5(data).hexdigest()
    external_id = "some-external-id-1"

    # when
    response = authenticated_client.post(
        url,
        content_type="application/zip",  # as zip
        data=data,
        # + headers
        CONTENT_LENGTH=len(data),
        # other headers needs HTTP_ prefix to be taken into account
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=md5sum,
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )

    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    actual_state = response_content["deposit_status"]
    assert actual_state == DEPOSIT_STATUS_REJECTED
def test_act_on_deposit_rejected_is_not_permitted(
    authenticated_client, deposit_collection, rejected_deposit, atom_dataset
):
    """Posting new metadata on a rejected deposit must fail with a 400."""
    deposit = rejected_deposit
    update_url = reverse(EDIT_SE_IRI, args=[deposit.collection.name, deposit.id])
    response = authenticated_client.post(
        update_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data1"],
        HTTP_SLUG=deposit.external_id,
    )
    assert response.status_code == status.HTTP_400_BAD_REQUEST
    # only 'partial' deposits can still be acted upon
    expected_msg = "You can only act on deposit with status '%s'" % (
        DEPOSIT_STATUS_PARTIAL,
    )
    assert expected_msg in response.content.decode("utf-8")
def test_add_deposit_when_partial_makes_new_deposit(
    authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
    """Posting deposit on collection when previous is partial makes new deposit
    """
    deposit = partial_deposit
    assert deposit.status == DEPOSIT_STATUS_PARTIAL
    # adding a new deposit with the same external id
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data0"] % deposit.external_id,
        HTTP_SLUG=deposit.external_id,
    )
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    # BUGFIX: compare as int — the receipt value is parsed from xml (other
    # tests convert it with int()), so `deposit_id != deposit.id` was
    # trivially true when it compared a str against an int
    assert int(deposit_id) != deposit.id  # new deposit
    new_deposit = Deposit.objects.get(pk=deposit_id)
    assert new_deposit != deposit
    assert new_deposit.parent is None
def test_add_deposit_when_failed_makes_new_deposit_with_no_parent(
    authenticated_client, deposit_collection, failed_deposit, atom_dataset
):
    """Posting on a collection when deposit load failed makes a new deposit
    without any parent (failed deposits do not start a parenting chain).
    """
    deposit = failed_deposit
    assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE
    # adding a new deposit with the same external id as the failed one
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data0"] % deposit.external_id,
        HTTP_SLUG=deposit.external_id,
    )
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    # BUGFIX: compare as int — the receipt value is parsed from xml (other
    # tests convert it with int()), so the previous str-vs-int comparison
    # was trivially true
    assert int(deposit_id) != deposit.id
    new_deposit = Deposit.objects.get(pk=deposit_id)
    assert new_deposit != deposit
    # unlike a successful deposit, no parenting chain is created
    assert new_deposit.parent is None
def test_add_deposit_when_done_makes_new_deposit_with_parent_old_one(
    authenticated_client, deposit_collection, completed_deposit, atom_dataset
):
    """Posting deposit on collection when deposit done makes new deposit with
    parent
    """
    # given multiple deposit already loaded
    deposit = completed_deposit
    assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS
    # adding a new deposit with the same external id as a completed deposit
    # creates the parenting chain
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data0"] % deposit.external_id,
        HTTP_SLUG=deposit.external_id,
    )
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    # BUGFIX: compare as int — the receipt value is parsed from xml (other
    # tests convert it with int()), so the previous str-vs-int comparison
    # was trivially true
    assert int(deposit_id) != deposit.id
    new_deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.collection == new_deposit.collection
    assert deposit.external_id == new_deposit.external_id
    assert new_deposit != deposit
    # the successful old deposit becomes the parent of the new one
    assert new_deposit.parent == deposit
diff --git a/swh/deposit/tests/api/test_deposit_atom.py b/swh/deposit/tests/api/test_deposit_atom.py
index 2de803d7..f551a317 100644
--- a/swh/deposit/tests/api/test_deposit_atom.py
+++ b/swh/deposit/tests/api/test_deposit_atom.py
@@ -1,326 +1,326 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import pytest
+from io import BytesIO
from django.urls import reverse
-from io import BytesIO
+import pytest
from rest_framework import status
from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED
-from swh.deposit.models import Deposit, DepositRequest, DepositCollection
+from swh.deposit.models import Deposit, DepositCollection, DepositRequest
from swh.deposit.parsers import parse_xml
def test_post_deposit_atom_201_even_with_decimal(
    authenticated_client, deposit_collection, atom_dataset
):
    """An atom entry whose metadata holds a decimal-looking value
    (codemeta:softwareVersion "10.4") is accepted with 201 and the value is
    kept as a string, not coerced to a number.
    """
    atom_error_with_decimal = atom_dataset["error-with-decimal"]
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_error_with_decimal,
        HTTP_SLUG="external-id",
        HTTP_IN_PROGRESS="false",
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    deposit = Deposit.objects.get(pk=deposit_id)
    dr = DepositRequest.objects.get(deposit=deposit)
    assert dr.metadata is not None
    sw_version = dr.metadata.get("codemeta:softwareVersion")
    # must be the exact string "10.4", not the float 10.4
    assert sw_version == "10.4"
def test_post_deposit_atom_400_with_empty_body(
    authenticated_client, deposit_collection, atom_dataset
):
    """An atom POST whose body is empty is a bad request (400)."""
    collection_url = reverse(COL_IRI, args=[deposit_collection.name])
    response = authenticated_client.post(
        collection_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data-empty-body"],
    )
    assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_post_deposit_atom_400_badly_formatted_atom(
    authenticated_client, deposit_collection, atom_dataset
):
    """A badly formatted atom entry is rejected as a bad request (400)."""
    collection_url = reverse(COL_IRI, args=[deposit_collection.name])
    response = authenticated_client.post(
        collection_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data-badly-formatted"],
    )
    assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_post_deposit_atom_parsing_error(
    authenticated_client, deposit_collection, atom_dataset
):
    """An atom entry that triggers a parsing error is rejected with 400."""
    collection_url = reverse(COL_IRI, args=[deposit_collection.name])
    response = authenticated_client.post(
        collection_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data-parsing-error-prone"],
    )
    assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_post_deposit_atom_no_slug_header(
    authenticated_client, deposit_collection, atom_dataset
):
    """Posting an atom entry without a slug header should return a 400
    """
    url = reverse(COL_IRI, args=[deposit_collection.name])
    # when: the HTTP_SLUG header is deliberately omitted
    response = authenticated_client.post(
        url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data0"],
        # + headers
        HTTP_IN_PROGRESS="false",
    )
    assert b"Missing SLUG header" in response.content
    assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_post_deposit_atom_unknown_collection(authenticated_client, atom_dataset):
    """Posting an atom entry to a collection absent from the db yields 404."""
    unknown_collection = "unknown-one"
    # sanity check: the collection really does not exist
    with pytest.raises(DepositCollection.DoesNotExist):
        DepositCollection.objects.get(name=unknown_collection)
    unknown_url = reverse(COL_IRI, args=[unknown_collection])
    response = authenticated_client.post(
        unknown_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data0"],
        HTTP_SLUG="something",
    )
    assert response.status_code == status.HTTP_404_NOT_FOUND
def test_post_deposit_atom_entry_initial(
    authenticated_client, deposit_collection, atom_dataset
):
    """Posting an initial atom entry should return 201 with deposit receipt
    """
    # given: no deposit exists yet for this external id
    external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
    atom_entry_data = atom_dataset["entry-data0"] % external_id
    # when
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_entry_data,
        HTTP_SLUG=external_id,
        HTTP_IN_PROGRESS="false",  # not partial: deposit is complete
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.collection == deposit_collection
    assert deposit.external_id == external_id
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    # one associated request to a deposit
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    assert deposit_request.metadata is not None
    # the raw entry body is stored verbatim, and there is no archive part
    assert deposit_request.raw_metadata == atom_entry_data
    assert bool(deposit_request.archive) is False
def test_post_deposit_atom_entry_with_codemeta(
    authenticated_client, deposit_collection, atom_dataset
):
    """Posting an initial atom entry carrying codemeta metadata should return
    201 with a deposit receipt.
    """
    # given: no deposit exists yet for this external id
    external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
    atom_entry_data = atom_dataset["codemeta-sample"] % external_id
    # when
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_entry_data,
        HTTP_SLUG=external_id,
        HTTP_IN_PROGRESS="false",
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.collection == deposit_collection
    assert deposit.external_id == external_id
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    # one associated request to a deposit
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    assert deposit_request.metadata is not None
    assert deposit_request.raw_metadata == atom_entry_data
    assert bool(deposit_request.archive) is False
def test_post_deposit_atom_entry_tei(
    authenticated_client, deposit_collection, atom_dataset
):
    """Posting initial atom entry as TEI should return 201 with receipt
    """
    # given: no deposit exists yet for this external id
    external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
    # note: unlike the other entries, the tei sample is not parameterized
    # with the external id; only the SLUG header carries it
    atom_entry_data = atom_dataset["tei-sample"]
    # when
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_entry_data,
        HTTP_SLUG=external_id,
        HTTP_IN_PROGRESS="false",
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.collection == deposit_collection
    assert deposit.external_id == external_id
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    # one associated request to a deposit
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    assert deposit_request.metadata is not None
    assert deposit_request.raw_metadata == atom_entry_data
    assert bool(deposit_request.archive) is False
def test_post_deposit_atom_entry_multiple_steps(
    authenticated_client, deposit_collection, atom_dataset
):
    """After initial deposit, updating a deposit should return a 201
    """
    # given: no deposit exists yet for this external id
    external_id = "urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a"
    with pytest.raises(Deposit.DoesNotExist):
        deposit = Deposit.objects.get(external_id=external_id)
    # when: first post with In-Progress=True leaves the deposit 'partial'
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data1"],
        HTTP_IN_PROGRESS="True",
        HTTP_SLUG=external_id,
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = int(response_content["deposit_id"])
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.collection == deposit_collection
    assert deposit.external_id == external_id
    assert deposit.status == "partial"
    # one associated request to a deposit
    deposit_requests = DepositRequest.objects.filter(deposit=deposit)
    assert len(deposit_requests) == 1
    # NOTE(review): interpolating bytes into a str template embeds the
    # "b'...'" repr in the resulting entry — confirm this is intentional
    atom_entry_data = atom_dataset["entry-data-minimal"] % external_id.encode(
        "utf-8"
    )  # noqa
    # NOTE(review): ``_headers`` is a private django attribute (removed in
    # django 3.2) — response["Location"] would be the public accessor
    update_uri = response._headers["location"][1]
    # when updating the first deposit post (In-Progress=False finalizes it)
    response = authenticated_client.post(
        update_uri,
        content_type="application/atom+xml;type=entry",
        data=atom_entry_data,
        HTTP_IN_PROGRESS="False",
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = int(response_content["deposit_id"])
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.collection == deposit_collection
    assert deposit.external_id == external_id
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    # still a single deposit: the second post updated the first one
    assert len(Deposit.objects.all()) == 1
    # now 2 associated requests to a same deposit
    deposit_requests = DepositRequest.objects.filter(deposit=deposit).order_by("id")
    assert len(deposit_requests) == 2
    atom_entry_data1 = atom_dataset["entry-data1"]
    expected_meta = [
        {"metadata": parse_xml(atom_entry_data1), "raw_metadata": atom_entry_data1},
        {"metadata": parse_xml(atom_entry_data), "raw_metadata": atom_entry_data},
    ]
    # requests are stored in posting order, each with its raw entry
    for i, deposit_request in enumerate(deposit_requests):
        actual_metadata = deposit_request.metadata
        assert actual_metadata == expected_meta[i]["metadata"]
        assert deposit_request.raw_metadata == expected_meta[i]["raw_metadata"]
        assert bool(deposit_request.archive) is False
diff --git a/swh/deposit/tests/api/test_deposit_binary.py b/swh/deposit/tests/api/test_deposit_binary.py
index eced5d17..00b4179b 100644
--- a/swh/deposit/tests/api/test_deposit_binary.py
+++ b/swh/deposit/tests/api/test_deposit_binary.py
@@ -1,567 +1,562 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import pytest
+from io import BytesIO
from django.core.files.uploadedfile import InMemoryUploadedFile
from django.urls import reverse
-from io import BytesIO
-
+import pytest
from rest_framework import status
-from swh.deposit.config import (
- COL_IRI,
- EM_IRI,
- DEPOSIT_STATUS_DEPOSITED,
-)
+from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED, EM_IRI
from swh.deposit.models import Deposit, DepositRequest
from swh.deposit.parsers import parse_xml
-from swh.deposit.tests.common import create_arborescence_archive, check_archive
+from swh.deposit.tests.common import check_archive, create_arborescence_archive
def test_post_deposit_binary_no_slug(
    authenticated_client, deposit_collection, sample_archive
):
    """A binary deposit posted without the SLUG header is rejected with 400."""
    collection_url = reverse(COL_IRI, args=[deposit_collection.name])
    # the HTTP_SLUG header is intentionally left out of this request
    response = authenticated_client.post(
        collection_url,
        content_type="application/zip",
        data=sample_archive["data"],
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    assert response.status_code == status.HTTP_400_BAD_REQUEST
    assert b"Missing SLUG header" in response.content
def test_post_deposit_binary_support(
    authenticated_client, deposit_collection, sample_archive
):
    """Binary upload with content-type not in [zip,x-tar] should return 415
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = "some-external-id-1"
    # when: application/octet-stream is not an accepted archive content-type
    response = authenticated_client.post(
        url,
        content_type="application/octet-stream",
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    # then
    assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE
    # nothing was recorded for this slug
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_binary_upload_ok(
    authenticated_client, deposit_collection, sample_archive
):
    """Binary upload with correct headers should return 201 with receipt
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = "some-external-id-1"
    # when
    response = authenticated_client.post(
        url,
        content_type="application/zip",  # as zip
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        # other headers needs HTTP_ prefix to be taken into account
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"],),
    )
    # then
    # FIX: check the status before parsing the receipt, so that a failing
    # request surfaces the http error instead of an xml parse error
    assert response.status_code == status.HTTP_201_CREATED
    # FIX: parse the receipt only once (the original parsed it twice)
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    assert deposit.external_id == external_id
    assert deposit.collection == deposit_collection
    assert deposit.swh_id is None
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    check_archive(sample_archive["name"], deposit_request.archive.name)
    # a binary-only deposit request carries no metadata
    assert deposit_request.metadata is None
    assert deposit_request.raw_metadata is None
    # the receipt content matches the deposit stored in db
    assert response_content["deposit_archive"] == sample_archive["name"]
    assert int(response_content["deposit_id"]) == deposit.id
    assert response_content["deposit_status"] == deposit.status
    # the Location header points at the edit-se iri of the new deposit
    edit_se_iri = reverse("edit_se_iri", args=[deposit_collection.name, deposit.id])
    assert response._headers["location"] == (
        "Location",
        "http://testserver" + edit_se_iri,
    )
def test_post_deposit_binary_failure_unsupported_packaging_header(
    authenticated_client, deposit_collection, sample_archive
):
    """Binary deposit with an unsupported Packaging header returns 400.
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = "some-external-id"
    # when: the Packaging header value is not a supported packaging
    response = authenticated_client.post(
        url,
        content_type="application/zip",
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="something-unsupported",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    # then
    assert response.status_code == status.HTTP_400_BAD_REQUEST
    # nothing was recorded for this slug
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_binary_upload_no_content_disposition_header(
    authenticated_client, deposit_collection, sample_archive
):
    """Binary upload without content_disposition header should return 400
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = "some-external-id"
    # when: the Content-Disposition header is deliberately omitted
    response = authenticated_client.post(
        url,
        content_type="application/zip",
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
    )
    # then
    assert response.status_code == status.HTTP_400_BAD_REQUEST
    # nothing was recorded for this slug
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_mediation_not_supported(
    authenticated_client, deposit_collection, sample_archive
):
    """Binary upload with mediation should return a 412 response
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = "some-external-id-1"
    # when: the On-Behalf-Of header requests mediated deposit, which the
    # server does not support
    response = authenticated_client.post(
        url,
        content_type="application/zip",
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_ON_BEHALF_OF="someone",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    # then
    assert response.status_code == status.HTTP_412_PRECONDITION_FAILED
    # nothing was recorded for this slug
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_binary_upload_fail_if_upload_size_limit_exceeded(
    authenticated_client, deposit_collection, sample_archive, tmp_path
):
    """Binary upload must not exceed the limit set up...
    """
    tmp_path = str(tmp_path)
    url = reverse(COL_IRI, args=[deposit_collection.name])
    # build an archive padded up to 500 bytes — presumably above the
    # configured upload size limit of the test settings (TODO confirm)
    archive = create_arborescence_archive(
        tmp_path, "archive2", "file2", b"some content in file", up_to_size=500
    )
    external_id = "some-external-id"
    # when
    response = authenticated_client.post(
        url,
        content_type="application/zip",
        data=archive["data"],
        # + headers
        CONTENT_LENGTH=archive["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    # then
    assert response.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE
    assert b"Upload size limit exceeded" in response.content
    # nothing was recorded for this slug
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_2_post_2_different_deposits(
    authenticated_client, deposit_collection, sample_archive
):
    """2 posting deposits should return 2 different 201 with receipt
    """
    url = reverse(COL_IRI, args=[deposit_collection.name])
    # when
    response = authenticated_client.post(
        url,
        content_type="application/zip",  # as zip
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG="some-external-id-1",
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    deposit = Deposit.objects.get(pk=deposit_id)
    deposits = Deposit.objects.all()
    assert len(deposits) == 1
    assert deposits[0] == deposit
    # second post with a different slug: must create a second deposit
    response = authenticated_client.post(
        url,
        content_type="application/x-tar",  # as tarball this time
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG="another-external-id",
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename1",
    )
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id2 = response_content["deposit_id"]
    deposit2 = Deposit.objects.get(pk=deposit_id2)
    assert deposit != deposit2
    deposits = Deposit.objects.all().order_by("id")
    assert len(deposits) == 2
    # BUGFIX: the original `assert list(deposits), [deposit == deposit2]` was
    # a 2-tuple (always truthy) and asserted nothing
    assert list(deposits) == [deposit, deposit2]
def test_post_deposit_binary_and_post_to_add_another_archive(
    authenticated_client, deposit_collection, sample_archive, tmp_path
):
    """A partial binary deposit (In-Progress=true) can be completed by
    posting a second archive on its EM-IRI; both archive requests end up
    attached to the same deposit.
    """
    tmp_path = str(tmp_path)
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = "some-external-id-1"
    # when: first archive, deposit kept open with In-Progress=true
    response = authenticated_client.post(
        url,
        content_type="application/zip",  # as zip
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="true",
        HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"],),
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.status == "partial"
    assert deposit.external_id == external_id
    assert deposit.collection == deposit_collection
    assert deposit.swh_id is None
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    assert deposit_request.deposit == deposit
    assert deposit_request.type == "archive"
    check_archive(sample_archive["name"], deposit_request.archive.name)
    # 2nd archive to upload
    archive2 = create_arborescence_archive(
        tmp_path, "archive2", "file2", b"some other content in file"
    )
    # uri to update the content
    update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id])
    # adding another archive for the deposit and finalizing it
    # (no In-Progress header: the deposit is considered complete)
    response = authenticated_client.post(
        update_uri,
        content_type="application/zip",  # as zip
        data=archive2["data"],
        # + headers
        CONTENT_LENGTH=archive2["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=archive2["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"]),
    )
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    assert deposit.external_id == external_id
    assert deposit.collection == deposit_collection
    assert deposit.swh_id is None
    deposit_requests = list(
        DepositRequest.objects.filter(deposit=deposit).order_by("id")
    )
    # 2 deposit requests for the same deposit
    assert len(deposit_requests) == 2
    assert deposit_requests[0].deposit == deposit
    assert deposit_requests[0].type == "archive"
    check_archive(sample_archive["name"], deposit_requests[0].archive.name)
    assert deposit_requests[1].deposit == deposit
    assert deposit_requests[1].type == "archive"
    check_archive(archive2["name"], deposit_requests[1].archive.name)
    # only 1 deposit in db
    deposits = Deposit.objects.all()
    assert len(deposits) == 1
def test_post_deposit_then_update_refused(
    authenticated_client, deposit_collection, sample_archive, atom_dataset, tmp_path
):
    """Once a deposit reaches the 'deposited' status, every update endpoint
    (PUT/POST on EM-IRI and Edit-SE-IRI, binary, atom or multipart) must
    refuse the change with a 400 response.
    """
    tmp_path = str(tmp_path)
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = "some-external-id-1"
    # when: complete binary deposit (In-Progress=false)
    response = authenticated_client.post(
        url,
        content_type="application/zip",  # as zip
        data=sample_archive["data"],
        # + headers
        CONTENT_LENGTH=sample_archive["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content["deposit_id"]
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    assert deposit.external_id == external_id
    assert deposit.collection == deposit_collection
    assert deposit.swh_id is None
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    assert deposit_request.deposit == deposit
    check_archive("filename0", deposit_request.archive.name)
    # updating/adding is forbidden
    # uri to update the content
    edit_se_iri = reverse("edit_se_iri", args=[deposit_collection.name, deposit_id])
    em_iri = reverse("em_iri", args=[deposit_collection.name, deposit_id])
    # Testing all update/add endpoint should fail
    # since the status is ready
    archive2 = create_arborescence_archive(
        tmp_path, "archive2", "file2", b"some content in file 2"
    )
    # replacing file is no longer possible since the deposit's
    # status is ready
    r = authenticated_client.put(
        em_iri,
        content_type="application/zip",
        data=archive2["data"],
        CONTENT_LENGTH=archive2["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=archive2["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # adding file is no longer possible since the deposit's status
    # is ready
    r = authenticated_client.post(
        em_iri,
        content_type="application/zip",
        data=archive2["data"],
        CONTENT_LENGTH=archive2["length"],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=archive2["md5sum"],
        HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
        HTTP_IN_PROGRESS="false",
        HTTP_CONTENT_DISPOSITION="attachment; filename=filename0",
    )
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # replacing metadata is no longer possible since the deposit's
    # status is ready
    r = authenticated_client.put(
        edit_se_iri,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data-deposit-binary"],
        CONTENT_LENGTH=len(atom_dataset["entry-data-deposit-binary"]),
        HTTP_SLUG=external_id,
    )
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # adding new metadata is no longer possible since the
    # deposit's status is ready
    r = authenticated_client.post(
        edit_se_iri,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data-deposit-binary"],
        CONTENT_LENGTH=len(atom_dataset["entry-data-deposit-binary"]),
        HTTP_SLUG=external_id,
    )
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # build in-memory multipart parts (archive + atom entry)
    archive_content = b"some content representing archive"
    archive = InMemoryUploadedFile(
        BytesIO(archive_content),
        field_name="archive0",
        name="archive0",
        content_type="application/zip",
        size=len(archive_content),
        charset=None,
    )
    atom_entry = InMemoryUploadedFile(
        BytesIO(atom_dataset["entry-data-deposit-binary"].encode("utf-8")),
        field_name="atom0",
        name="atom0",
        content_type='application/atom+xml; charset="utf-8"',
        size=len(atom_dataset["entry-data-deposit-binary"]),
        charset="utf-8",
    )
    # replacing multipart metadata is no longer possible since the
    # deposit's status is ready
    r = authenticated_client.put(
        edit_se_iri,
        format="multipart",
        data={"archive": archive, "atom_entry": atom_entry,},
    )
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # adding new metadata is no longer possible since the
    # deposit's status is ready
    r = authenticated_client.post(
        edit_se_iri,
        format="multipart",
        data={"archive": archive, "atom_entry": atom_entry,},
    )
    assert r.status_code == status.HTTP_400_BAD_REQUEST
diff --git a/swh/deposit/tests/api/test_deposit_delete.py b/swh/deposit/tests/api/test_deposit_delete.py
index 496af061..76959c24 100644
--- a/swh/deposit/tests/api/test_deposit_delete.py
+++ b/swh/deposit/tests/api/test_deposit_delete.py
@@ -1,123 +1,123 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
+from typing import Dict, Mapping
+
from django.urls import reverse
from rest_framework import status
-from typing import Dict, Mapping
from swh.deposit.config import (
+ ARCHIVE_KEY,
+ DEPOSIT_STATUS_DEPOSITED,
EDIT_SE_IRI,
EM_IRI,
- ARCHIVE_KEY,
METADATA_KEY,
- DEPOSIT_STATUS_DEPOSITED,
)
-
from swh.deposit.models import Deposit, DepositRequest
def count_deposit_request_types(deposit_requests) -> Mapping[str, int]:
    """Tally how many deposit requests of each ``type`` are in the iterable."""
    tally = defaultdict(int)  # type: Dict[str, int]
    for request in deposit_requests:
        tally[request.type] = tally[request.type] + 1
    return tally
def test_delete_archive_on_partial_deposit_works(
    authenticated_client, partial_deposit_with_metadata, deposit_collection
):
    """Removing partial deposit's archive should return a 204 response
    """
    deposit_id = partial_deposit_with_metadata.id
    deposit = Deposit.objects.get(pk=deposit_id)
    deposit_requests = DepositRequest.objects.filter(deposit=deposit)
    # deposit requests: 1 'archive', 1 'metadata'
    deposit_request_types = count_deposit_request_types(deposit_requests)
    assert deposit_request_types == {ARCHIVE_KEY: 1, METADATA_KEY: 1}
    # when: DELETE on the EM-IRI targets the archive part only
    update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id])
    response = authenticated_client.delete(update_uri)
    # then
    assert response.status_code == status.HTTP_204_NO_CONTENT
    deposit = Deposit.objects.get(pk=deposit_id)
    deposit_requests2 = DepositRequest.objects.filter(deposit=deposit)
    deposit_request_types = count_deposit_request_types(deposit_requests2)
    # only the archive request was removed; the metadata request remains
    assert deposit_request_types == {METADATA_KEY: 1}
def test_delete_archive_on_undefined_deposit_fails(
authenticated_client, deposit_collection, sample_archive
):
"""Delete undefined deposit returns a 404 response
"""
# when
update_uri = reverse(EM_IRI, args=[deposit_collection.name, 999])
response = authenticated_client.delete(update_uri)
# then
assert response.status_code == status.HTTP_404_NOT_FOUND
def test_delete_non_partial_deposit(
authenticated_client, deposit_collection, deposited_deposit
):
"""Delete !partial status deposit should return a 400 response
"""
deposit = deposited_deposit
assert deposit.status == DEPOSIT_STATUS_DEPOSITED
# when
update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id])
response = authenticated_client.delete(update_uri)
# then
assert response.status_code == status.HTTP_400_BAD_REQUEST
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit is not None
def test_delete_partial_deposit(
authenticated_client, deposit_collection, partial_deposit
):
"""Delete deposit should return a 204 response
"""
# given
deposit = partial_deposit
# when
url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
response = authenticated_client.delete(url)
# then
assert response.status_code == status.HTTP_204_NO_CONTENT
deposit_requests = list(DepositRequest.objects.filter(deposit=deposit))
assert deposit_requests == []
deposits = list(Deposit.objects.filter(pk=deposit.id))
assert deposits == []
def test_delete_on_edit_se_iri_cannot_delete_non_partial_deposit(
authenticated_client, deposit_collection, complete_deposit
):
"""Delete !partial deposit should return a 400 response
"""
# given
deposit = complete_deposit
# when
url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
response = authenticated_client.delete(url)
# then
assert response.status_code == status.HTTP_400_BAD_REQUEST
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit is not None
diff --git a/swh/deposit/tests/api/test_deposit_list.py b/swh/deposit/tests/api/test_deposit_list.py
index e36d04ef..3de52950 100644
--- a/swh/deposit/tests/api/test_deposit_list.py
+++ b/swh/deposit/tests/api/test_deposit_list.py
@@ -1,100 +1,100 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse
from rest_framework import status
from swh.deposit.api.converters import convert_status_detail
from swh.deposit.config import (
+ DEPOSIT_STATUS_DEPOSITED,
DEPOSIT_STATUS_PARTIAL,
PRIVATE_LIST_DEPOSITS,
- DEPOSIT_STATUS_DEPOSITED,
)
STATUS_DETAIL = {
"url": {
"summary": "At least one compatible url field. Failed",
"fields": ["testurl"],
},
"metadata": [{"summary": "Mandatory fields missing", "fields": ["9", 10, 1.212],},],
"archive": [
{"summary": "Invalid archive", "fields": ["3"],},
{"summary": "Unsupported archive", "fields": [2],},
],
}
def test_deposit_list(partial_deposit, deposited_deposit, authenticated_client):
"""Deposit list api should return all deposits in a paginated way
"""
partial_deposit.status_detail = STATUS_DETAIL
partial_deposit.save()
deposit_id = partial_deposit.id
deposit_id2 = deposited_deposit.id
# NOTE: does not work as documented
# https://docs.djangoproject.com/en/1.11/ref/urlresolvers/#django.core.urlresolvers.reverse # noqa
# url = reverse(PRIVATE_LIST_DEPOSITS, kwargs={'page_size': 1})
main_url = reverse(PRIVATE_LIST_DEPOSITS)
url = "%s?page_size=1" % main_url
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data["count"] == 2 # 2 deposits
expected_next = f"{main_url}?page=2&page_size=1"
assert data["next"].endswith(expected_next) is True
assert data["previous"] is None
assert len(data["results"]) == 1 # page of size 1
deposit = data["results"][0]
assert deposit["id"] == deposit_id
assert deposit["status"] == DEPOSIT_STATUS_PARTIAL
expected_status_detail = convert_status_detail(STATUS_DETAIL)
assert deposit["status_detail"] == expected_status_detail
# then 2nd page
response2 = authenticated_client.get(expected_next)
assert response2.status_code == status.HTTP_200_OK
data2 = response2.json()
assert data2["count"] == 2 # still 2 deposits
assert data2["next"] is None
expected_previous = f"{main_url}?page_size=1"
assert data2["previous"].endswith(expected_previous) is True
assert len(data2["results"]) == 1 # page of size 1
deposit2 = data2["results"][0]
assert deposit2["id"] == deposit_id2
assert deposit2["status"] == DEPOSIT_STATUS_DEPOSITED
def test_deposit_list_exclude(partial_deposit, deposited_deposit, authenticated_client):
"""Exclusion pattern on external_id should be respected
"""
partial_deposit.status_detail = STATUS_DETAIL
partial_deposit.save()
main_url = reverse(PRIVATE_LIST_DEPOSITS)
# Testing exclusion pattern
exclude_pattern = "external-id"
assert partial_deposit.external_id.startswith(exclude_pattern)
assert deposited_deposit.external_id.startswith(exclude_pattern)
url = f"{main_url}?page_size=1&exclude=external-id"
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data["count"] == 0
url = "%s?page_size=1&exclude=dummy" % main_url # that won't exclude anything
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data["count"] == 2
diff --git a/swh/deposit/tests/api/test_deposit_multipart.py b/swh/deposit/tests/api/test_deposit_multipart.py
index bb4f42d7..c9a4a871 100644
--- a/swh/deposit/tests/api/test_deposit_multipart.py
+++ b/swh/deposit/tests/api/test_deposit_multipart.py
@@ -1,400 +1,401 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from io import BytesIO
+
from django.core.files.uploadedfile import InMemoryUploadedFile
from django.urls import reverse
-from io import BytesIO
from rest_framework import status
from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED
from swh.deposit.models import Deposit, DepositRequest
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import check_archive
def test_post_deposit_multipart_without_slug_header_is_bad_request(
authenticated_client, deposit_collection, atom_dataset
):
# given
url = reverse(COL_IRI, args=[deposit_collection.name])
archive_content = b"some content representing archive"
archive = InMemoryUploadedFile(
BytesIO(archive_content),
field_name="archive0",
name="archive0",
content_type="application/zip",
size=len(archive_content),
charset=None,
)
data_atom_entry = atom_dataset["entry-data-deposit-binary"]
atom_entry = InMemoryUploadedFile(
BytesIO(data_atom_entry.encode("utf-8")),
field_name="atom0",
name="atom0",
content_type='application/atom+xml; charset="utf-8"',
size=len(data_atom_entry),
charset="utf-8",
)
# when
response = authenticated_client.post(
url,
format="multipart",
data={"archive": archive, "atom_entry": atom_entry,},
# + headers
HTTP_IN_PROGRESS="false",
)
assert b"Missing SLUG header" in response.content
assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_post_deposit_multipart_zip(
authenticated_client, deposit_collection, atom_dataset, sample_archive
):
"""one multipart deposit (zip+xml) should be accepted
"""
# given
url = reverse(COL_IRI, args=[deposit_collection.name])
archive = InMemoryUploadedFile(
BytesIO(sample_archive["data"]),
field_name=sample_archive["name"],
name=sample_archive["name"],
content_type="application/zip",
size=sample_archive["length"],
charset=None,
)
data_atom_entry = atom_dataset["entry-data-deposit-binary"]
atom_entry = InMemoryUploadedFile(
BytesIO(data_atom_entry.encode("utf-8")),
field_name="atom0",
name="atom0",
content_type='application/atom+xml; charset="utf-8"',
size=len(data_atom_entry),
charset="utf-8",
)
external_id = "external-id"
# when
response = authenticated_client.post(
url,
format="multipart",
data={"archive": archive, "atom_entry": atom_entry,},
# + headers
HTTP_IN_PROGRESS="false",
HTTP_SLUG=external_id,
)
# then
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(BytesIO(response.content))
deposit_id = response_content["deposit_id"]
deposit = Deposit.objects.get(pk=deposit_id)
assert deposit.status == DEPOSIT_STATUS_DEPOSITED
assert deposit.external_id == external_id
assert deposit.collection == deposit_collection
assert deposit.swh_id is None
deposit_requests = DepositRequest.objects.filter(deposit=deposit)
assert len(deposit_requests) == 2
for deposit_request in deposit_requests:
assert deposit_request.deposit == deposit
if deposit_request.type == "archive":
check_archive(sample_archive["name"], deposit_request.archive.name)
assert deposit_request.metadata is None
assert deposit_request.raw_metadata is None
else:
assert (
deposit_request.metadata["id"]
== "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
)
assert deposit_request.raw_metadata == data_atom_entry
def test_post_deposit_multipart_tar(
authenticated_client, deposit_collection, atom_dataset, sample_archive
):
"""one multipart deposit (tar+xml) should be accepted
"""
# given
url = reverse(COL_IRI, args=[deposit_collection.name])
# from django.core.files import uploadedfile
data_atom_entry = atom_dataset["entry-data-deposit-binary"]
archive = InMemoryUploadedFile(
BytesIO(sample_archive["data"]),
field_name=sample_archive["name"],
name=sample_archive["name"],
content_type="application/x-tar",
size=sample_archive["length"],
charset=None,
)
atom_entry = InMemoryUploadedFile(
BytesIO(data_atom_entry.encode("utf-8")),
field_name="atom0",
name="atom0",
content_type='application/atom+xml; charset="utf-8"',
size=len(data_atom_entry),
charset="utf-8",
)
external_id = "external-id"
# when
response = authenticated_client.post(
url,
format="multipart",
data={"archive": archive, "atom_entry": atom_entry,},
# + headers
HTTP_IN_PROGRESS="false",
HTTP_SLUG=external_id,
)
# then
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(BytesIO(response.content))
deposit_id = response_content["deposit_id"]
deposit = Deposit.objects.get(pk=deposit_id)
assert deposit.status == DEPOSIT_STATUS_DEPOSITED
assert deposit.external_id == external_id
assert deposit.collection == deposit_collection
assert deposit.swh_id is None
deposit_requests = DepositRequest.objects.filter(deposit=deposit)
assert len(deposit_requests) == 2
for deposit_request in deposit_requests:
assert deposit_request.deposit == deposit
if deposit_request.type == "archive":
check_archive(sample_archive["name"], deposit_request.archive.name)
assert deposit_request.metadata is None
assert deposit_request.raw_metadata is None
else:
assert (
deposit_request.metadata["id"]
== "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
)
assert deposit_request.raw_metadata == data_atom_entry
def test_post_deposit_multipart_put_to_replace_metadata(
authenticated_client, deposit_collection, atom_dataset, sample_archive
):
"""One multipart deposit followed by a metadata update should be
accepted
"""
# given
url = reverse(COL_IRI, args=[deposit_collection.name])
data_atom_entry = atom_dataset["entry-data-deposit-binary"]
archive = InMemoryUploadedFile(
BytesIO(sample_archive["data"]),
field_name=sample_archive["name"],
name=sample_archive["name"],
content_type="application/zip",
size=sample_archive["length"],
charset=None,
)
atom_entry = InMemoryUploadedFile(
BytesIO(data_atom_entry.encode("utf-8")),
field_name="atom0",
name="atom0",
content_type='application/atom+xml; charset="utf-8"',
size=len(data_atom_entry),
charset="utf-8",
)
external_id = "external-id"
# when
response = authenticated_client.post(
url,
format="multipart",
data={"archive": archive, "atom_entry": atom_entry,},
# + headers
HTTP_IN_PROGRESS="true",
HTTP_SLUG=external_id,
)
# then
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(BytesIO(response.content))
deposit_id = response_content["deposit_id"]
deposit = Deposit.objects.get(pk=deposit_id)
assert deposit.status == "partial"
assert deposit.external_id == external_id
assert deposit.collection == deposit_collection
assert deposit.swh_id is None
deposit_requests = DepositRequest.objects.filter(deposit=deposit)
assert len(deposit_requests) == 2
for deposit_request in deposit_requests:
assert deposit_request.deposit == deposit
if deposit_request.type == "archive":
check_archive(sample_archive["name"], deposit_request.archive.name)
else:
assert (
deposit_request.metadata["id"]
== "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
)
assert deposit_request.raw_metadata == data_atom_entry
replace_metadata_uri = response._headers["location"][1]
response = authenticated_client.put(
replace_metadata_uri,
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data-deposit-binary"],
HTTP_IN_PROGRESS="false",
)
assert response.status_code == status.HTTP_204_NO_CONTENT
# deposit_id did not change
deposit = Deposit.objects.get(pk=deposit_id)
assert deposit.status == DEPOSIT_STATUS_DEPOSITED
assert deposit.external_id == external_id
assert deposit.collection == deposit_collection
assert deposit.swh_id is None
deposit_requests = DepositRequest.objects.filter(deposit=deposit)
assert len(deposit_requests) == 2
for deposit_request in deposit_requests:
assert deposit_request.deposit == deposit
if deposit_request.type == "archive":
check_archive(sample_archive["name"], deposit_request.archive.name)
else:
assert (
deposit_request.metadata["id"]
== "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
)
assert (
deposit_request.raw_metadata
== atom_dataset["entry-data-deposit-binary"]
)
# FAILURE scenarios
def test_post_deposit_multipart_only_archive_and_atom_entry(
authenticated_client, deposit_collection
):
"""Multipart deposit only accepts one archive and one atom+xml"""
# given
url = reverse(COL_IRI, args=[deposit_collection.name])
archive_content = b"some content representing archive"
archive = InMemoryUploadedFile(
BytesIO(archive_content),
field_name="archive0",
name="archive0",
content_type="application/x-tar",
size=len(archive_content),
charset=None,
)
other_archive_content = b"some-other-content"
other_archive = InMemoryUploadedFile(
BytesIO(other_archive_content),
field_name="atom0",
name="atom0",
content_type="application/x-tar",
size=len(other_archive_content),
charset="utf-8",
)
# when
response = authenticated_client.post(
url,
format="multipart",
data={"archive": archive, "atom_entry": other_archive,},
# + headers
HTTP_IN_PROGRESS="false",
HTTP_SLUG="external-id",
)
# then
assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE
assert (
"Only 1 application/zip (or application/x-tar) archive"
in response.content.decode("utf-8")
)
# when
archive.seek(0)
response = authenticated_client.post(
url,
format="multipart",
data={"archive": archive,},
# + headers
HTTP_IN_PROGRESS="false",
HTTP_SLUG="external-id",
)
# then
assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE
assert (
"You must provide both 1 application/zip (or "
"application/x-tar) and 1 atom+xml entry for "
"multipart deposit" in response.content.decode("utf-8")
) is True
def test_post_deposit_multipart_400_when_badly_formatted_xml(
authenticated_client, deposit_collection, sample_archive, atom_dataset
):
# given
url = reverse(COL_IRI, args=[deposit_collection.name])
archive_content = sample_archive["data"]
archive = InMemoryUploadedFile(
BytesIO(archive_content),
field_name=sample_archive["name"],
name=sample_archive["name"],
content_type="application/zip",
size=len(archive_content),
charset=None,
)
data_atom_entry_ko = atom_dataset["entry-data-ko"]
atom_entry = InMemoryUploadedFile(
BytesIO(data_atom_entry_ko.encode("utf-8")),
field_name="atom0",
name="atom0",
content_type='application/atom+xml; charset="utf-8"',
size=len(data_atom_entry_ko),
charset="utf-8",
)
# when
response = authenticated_client.post(
url,
format="multipart",
data={"archive": archive, "atom_entry": atom_entry,},
# + headers
HTTP_IN_PROGRESS="false",
HTTP_SLUG="external-id",
)
assert b"Malformed xml metadata" in response.content
assert response.status_code == status.HTTP_400_BAD_REQUEST
diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py
index 8982f232..c882f817 100644
--- a/swh/deposit/tests/api/test_deposit_private_check.py
+++ b/swh/deposit/tests/api/test_deposit_private_check.py
@@ -1,283 +1,282 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse
import pytest
from rest_framework import status
-from swh.deposit.config import (
- DEPOSIT_STATUS_VERIFIED,
- PRIVATE_CHECK_DEPOSIT,
- DEPOSIT_STATUS_DEPOSITED,
- DEPOSIT_STATUS_REJECTED,
- COL_IRI,
-)
from swh.deposit.api.private.deposit_check import (
- MANDATORY_ARCHIVE_INVALID,
- MANDATORY_FIELDS_MISSING,
- MANDATORY_ARCHIVE_UNSUPPORTED,
ALTERNATE_FIELDS_MISSING,
+ MANDATORY_ARCHIVE_INVALID,
MANDATORY_ARCHIVE_MISSING,
+ MANDATORY_ARCHIVE_UNSUPPORTED,
+ MANDATORY_FIELDS_MISSING,
+)
+from swh.deposit.config import (
+ COL_IRI,
+ DEPOSIT_STATUS_DEPOSITED,
+ DEPOSIT_STATUS_REJECTED,
+ DEPOSIT_STATUS_VERIFIED,
+ PRIVATE_CHECK_DEPOSIT,
)
from swh.deposit.models import Deposit
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import (
create_arborescence_archive,
create_archive_with_archive,
)
-
PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc"
def private_check_url_endpoints(collection, deposit):
"""There are 2 endpoints to check (one with collection, one without)"""
return [
reverse(PRIVATE_CHECK_DEPOSIT, args=[collection.name, deposit.id]),
reverse(PRIVATE_CHECK_DEPOSIT_NC, args=[deposit.id]),
]
@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"])
def test_deposit_ok(
authenticated_client, deposit_collection, ready_deposit_ok, extension
):
"""Proper deposit should succeed the checks (-> status ready)
"""
deposit = ready_deposit_ok
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data["status"] == DEPOSIT_STATUS_VERIFIED
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_VERIFIED
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
@pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"])
def test_deposit_invalid_tarball(
tmp_path, authenticated_client, deposit_collection, extension
):
"""Deposit with tarball (of 1 tarball) should fail the checks: rejected
"""
deposit = create_deposit_archive_with_archive(
tmp_path, extension, authenticated_client, deposit_collection.name
)
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data["status"] == DEPOSIT_STATUS_REJECTED
details = data["details"]
# archive checks failure
assert len(details["archive"]) == 1
assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_INVALID
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_REJECTED
def test_deposit_ko_missing_tarball(
authenticated_client, deposit_collection, ready_deposit_only_metadata
):
"""Deposit without archive should fail the checks: rejected
"""
deposit = ready_deposit_only_metadata
assert deposit.status == DEPOSIT_STATUS_DEPOSITED
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data["status"] == DEPOSIT_STATUS_REJECTED
details = data["details"]
# archive checks failure
assert len(details["archive"]) == 1
assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_MISSING
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_REJECTED
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
def test_deposit_ko_unsupported_tarball(
tmp_path, authenticated_client, deposit_collection, ready_deposit_invalid_archive
):
"""Deposit with an unsupported tarball should fail the checks: rejected
"""
deposit = ready_deposit_invalid_archive
assert DEPOSIT_STATUS_DEPOSITED == deposit.status
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data["status"] == DEPOSIT_STATUS_REJECTED
details = data["details"]
# archive checks failure
assert len(details["archive"]) == 1
assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED
# metadata check failure
assert len(details["metadata"]) == 2
mandatory = details["metadata"][0]
assert mandatory["summary"] == MANDATORY_FIELDS_MISSING
assert set(mandatory["fields"]) == set(["author"])
alternate = details["metadata"][1]
assert alternate["summary"] == ALTERNATE_FIELDS_MISSING
assert alternate["fields"] == ["name or title"]
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_REJECTED
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
def test_check_deposit_metadata_ok(
authenticated_client, deposit_collection, ready_deposit_ok
):
"""Proper deposit should succeed the checks (-> status ready)
with all **MUST** metadata
using the codemeta metadata test set
"""
deposit = ready_deposit_ok
assert deposit.status == DEPOSIT_STATUS_DEPOSITED
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data["status"] == DEPOSIT_STATUS_VERIFIED
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_VERIFIED
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
def test_check_metadata_ok(swh_checks_deposit):
actual_check, detail = swh_checks_deposit._check_metadata(
{
"url": "something",
"external_identifier": "something-else",
"name": "foo",
"author": "someone",
}
)
assert actual_check is True
assert detail is None
def test_check_metadata_ok2(swh_checks_deposit):
actual_check, detail = swh_checks_deposit._check_metadata(
{
"url": "something",
"external_identifier": "something-else",
"title": "bar",
"author": "someone",
}
)
assert actual_check is True
assert detail is None
def test_check_metadata_ko(swh_checks_deposit):
"""Missing optional field should be caught
"""
actual_check, error_detail = swh_checks_deposit._check_metadata(
{
"url": "something",
"external_identifier": "something-else",
"author": "someone",
}
)
expected_error = {
"metadata": [
{
"summary": "Mandatory alternate fields are missing",
"fields": ["name or title"],
}
]
}
assert actual_check is False
assert error_detail == expected_error
def test_check_metadata_ko2(swh_checks_deposit):
"""Missing mandatory fields should be caught
"""
actual_check, error_detail = swh_checks_deposit._check_metadata(
{
"url": "something",
"external_identifier": "something-else",
"title": "foobar",
}
)
expected_error = {
"metadata": [{"summary": "Mandatory fields are missing", "fields": ["author"],}]
}
assert actual_check is False
assert error_detail == expected_error
def create_deposit_archive_with_archive(
root_path, archive_extension, client, collection_name
):
# we create the holding archive to a given extension
archive = create_arborescence_archive(
root_path,
"archive1",
"file1",
b"some content in file",
extension=archive_extension,
)
# now we create an archive holding the first created archive
invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive)
# we deposit it
response = client.post(
reverse(COL_IRI, args=[collection_name]),
content_type="application/x-tar",
data=invalid_archive["data"],
CONTENT_LENGTH=invalid_archive["length"],
HTTP_MD5SUM=invalid_archive["md5sum"],
HTTP_SLUG="external-id",
HTTP_IN_PROGRESS=False,
HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (invalid_archive["name"],),
)
# then
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(response.content)
deposit_status = response_content["deposit_status"]
assert deposit_status == DEPOSIT_STATUS_DEPOSITED
deposit_id = int(response_content["deposit_id"])
deposit = Deposit.objects.get(pk=deposit_id)
assert DEPOSIT_STATUS_DEPOSITED == deposit.status
return deposit
diff --git a/swh/deposit/tests/api/test_deposit_private_read_archive.py b/swh/deposit/tests/api/test_deposit_private_read_archive.py
index 1724a2a9..6c265130 100644
--- a/swh/deposit/tests/api/test_deposit_private_read_archive.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_archive.py
@@ -1,87 +1,86 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import io
import zipfile
from django.urls import reverse
from rest_framework import status
-from swh.deposit.config import PRIVATE_GET_RAW_CONTENT, EM_IRI
+from swh.deposit.config import EM_IRI, PRIVATE_GET_RAW_CONTENT
from swh.deposit.tests.common import create_arborescence_archive
-
PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + "-nc"
def private_get_raw_url_endpoints(collection, deposit):
"""There are 2 endpoints to check (one with collection, one without)"""
return [
reverse(PRIVATE_GET_RAW_CONTENT, args=[collection.name, deposit.id]),
reverse(PRIVATE_GET_RAW_CONTENT_NC, args=[deposit.id]),
]
def test_access_to_existing_deposit_with_one_archive(
authenticated_client, deposit_collection, complete_deposit, sample_archive
):
"""Access to deposit should stream a 200 response with its raw content
"""
deposit = complete_deposit
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
r = authenticated_client.get(url)
assert r.status_code == status.HTTP_200_OK
assert r._headers["content-type"][1] == "application/zip"
# read the stream
data = b"".join(r.streaming_content)
# extract the file from the zip
zfile = zipfile.ZipFile(io.BytesIO(data))
assert zfile.namelist() == ["file1"]
assert zfile.open("file1").read() == b"some content in file"
def test_access_to_existing_deposit_with_multiple_archives(
tmp_path, authenticated_client, deposit_collection, partial_deposit, sample_archive
):
"""Access to deposit should stream a 200 response with its raw contents
"""
deposit = partial_deposit
archive2 = create_arborescence_archive(
tmp_path, "archive2", "file2", b"some other content in file"
)
# Add a second archive to deposit
update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id])
response = authenticated_client.post(
update_uri,
content_type="application/zip", # as zip
data=archive2["data"],
# + headers
CONTENT_LENGTH=archive2["length"],
HTTP_SLUG=deposit.external_id,
HTTP_CONTENT_MD5=archive2["md5sum"],
HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
HTTP_IN_PROGRESS="false",
HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],),
)
assert response.status_code == status.HTTP_201_CREATED
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
r = authenticated_client.get(url)
assert r.status_code == status.HTTP_200_OK
assert r._headers["content-type"][1] == "application/zip"
# read the stream
data = b"".join(r.streaming_content)
# extract the file from the zip
zfile = zipfile.ZipFile(io.BytesIO(data))
assert set(zfile.namelist()) == {"file1", "file2"}
assert zfile.open("file1").read() == b"some content in file"
assert zfile.open("file2").read() == b"some other content in file"
diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
index 475ab1b8..ec62dc73 100644
--- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py
@@ -1,551 +1,551 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse
from rest_framework import status
+from swh.deposit.api import __version__
+from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON
from swh.deposit.models import Deposit
-from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON, EDIT_SE_IRI
-
PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc"
def private_get_raw_url_endpoints(collection, deposit):
"""There are 2 endpoints to check (one with collection, one without)"""
deposit_id = deposit if isinstance(deposit, int) else deposit.id
return [
reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]),
reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]),
]
def update_deposit(authenticated_client, collection, deposit, atom_dataset):
for atom_data in ["entry-data2", "entry-data3"]:
update_deposit_with_metadata(
authenticated_client, collection, deposit, atom_dataset[atom_data]
)
return deposit
def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata):
# update deposit's metadata
response = authenticated_client.post(
reverse(EDIT_SE_IRI, args=[collection.name, deposit.id]),
content_type="application/atom+xml;type=entry",
data=metadata,
HTTP_SLUG=deposit.external_id,
HTTP_IN_PROGRESS=True,
)
assert response.status_code == status.HTTP_201_CREATED
return deposit
def test_read_metadata(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private metadata read api to existing deposit should return metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.save()
deposit = update_deposit(
authenticated_client, deposit_collection, deposit, atom_dataset
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
data = response.json()
expected_meta = {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
"origin_metadata": {
"metadata": {
"@xmlns": ["http://www.w3.org/2005/Atom"],
"author": ["some awesome author", "another one", "no one"],
"codemeta:dateCreated": "2017-10-07T15:17:08Z",
"external_identifier": "some-external-id",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa
},
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
- "version": "0.0.1",
+ "version": __version__,
},
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [],
},
}
assert data == expected_meta
def test_read_metadata_revision_with_parent(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Private read metadata to a deposit (with parent) returns metadata
"""
deposit = partial_deposit
deposit.external_id = "some-external-id"
deposit.save()
deposit = update_deposit(
authenticated_client, deposit_collection, deposit, atom_dataset
)
rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa"
swh_id = "swh:1:rev:%s" % rev_id
fake_parent = Deposit(
swh_id=swh_id, client=deposit.client, collection=deposit.collection
)
fake_parent.save()
deposit.parent = fake_parent
deposit.save()
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
data = response.json()
expected_meta = {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
},
"origin_metadata": {
"metadata": {
"@xmlns": ["http://www.w3.org/2005/Atom"],
"author": ["some awesome author", "another one", "no one"],
"codemeta:dateCreated": "2017-10-07T15:17:08Z",
"external_identifier": "some-external-id",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa
},
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
- "version": "0.0.1",
+ "version": __version__,
},
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"author_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": "test",
"id": deposit.id,
"collection": "test",
"revision_parents": [rev_id],
},
}
assert data == expected_meta
def test_read_metadata_3(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""date(Created|Published) provided, uses author/committer date
"""
deposit = partial_deposit
deposit.external_id = "hal-01243065"
deposit.save()
deposit = update_deposit(
authenticated_client, deposit_collection, deposit, atom_dataset
)
# add metadata to the deposit with datePublished and dateCreated
codemeta_entry_data = (
atom_dataset["metadata"]
% """
2015-04-06T17:08:47+02:002017-05-03T16:08:47+02:00
"""
)
update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
data = response.json()
metadata = {
"@xmlns": ["http://www.w3.org/2005/Atom"],
"@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
"author": [
"some awesome author",
"another one",
"no one",
{"email": "hal@ccsd.cnrs.fr", "name": "HAL"},
],
"client": "hal",
"codemeta:applicationCategory": "test",
"codemeta:author": {"codemeta:name": "Morane Gruenpeter"},
"codemeta:dateCreated": [
"2017-10-07T15:17:08Z",
"2015-04-06T17:08:47+02:00",
],
"codemeta:datePublished": "2017-05-03T16:08:47+02:00",
"codemeta:description": "this is the description",
"codemeta:developmentStatus": "stable",
"codemeta:keywords": "DSP programming",
"codemeta:license": [
{"codemeta:name": "GNU General Public License v3.0 only"},
{
"codemeta:name": "CeCILL "
"Free "
"Software "
"License "
"Agreement "
"v1.1"
},
],
"codemeta:programmingLanguage": ["php", "python", "C"],
"codemeta:runtimePlatform": "phpstorm",
"codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa
"codemeta:version": "1",
"external_identifier": ["some-external-id", "hal-01243065"],
"id": "hal-01243065",
- "title": "Composing a Web of Audio " "Applications",
+ "title": "Composing a Web of Audio Applications",
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
}
expected_meta = {
"origin": {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
},
"origin_metadata": {
"metadata": metadata,
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
- "version": "0.0.1",
+ "version": __version__,
},
},
"deposit": {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"negative_utc": False,
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1493820527},
},
"author_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1507389428},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
},
}
assert data == expected_meta
def test_read_metadata_4(
authenticated_client, deposit_collection, atom_dataset, partial_deposit
):
"""dateCreated/datePublished not provided, revision uses complete_date
"""
deposit = partial_deposit
codemeta_entry_data = atom_dataset["metadata"] % ""
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
# will use the deposit completed date as fallback date
deposit.complete_date = "2016-04-06"
deposit.save()
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
data = response.json()
metadata = {
"@xmlns": "http://www.w3.org/2005/Atom",
"@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
"author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"},
"client": "hal",
"codemeta:applicationCategory": "test",
- "codemeta:author": {"codemeta:name": "Morane " "Gruenpeter"},
- "codemeta:description": "this is the " "description",
+ "codemeta:author": {"codemeta:name": "Morane Gruenpeter"},
+ "codemeta:description": "this is the description",
"codemeta:developmentStatus": "stable",
"codemeta:keywords": "DSP programming",
"codemeta:license": [
{
"codemeta:name": "GNU "
"General "
"Public "
"License "
"v3.0 "
"only"
},
{
"codemeta:name": "CeCILL "
"Free "
"Software "
"License "
"Agreement "
"v1.1"
},
],
"codemeta:programmingLanguage": ["php", "python", "C"],
"codemeta:runtimePlatform": "phpstorm",
"codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065",
"codemeta:version": "1",
"external_identifier": "hal-01243065",
"id": "hal-01243065",
- "title": "Composing a Web of Audio " "Applications",
+ "title": "Composing a Web of Audio Applications",
}
expected_origin = {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/%s" % (deposit.external_id),
}
expected_origin_metadata = {
"metadata": metadata,
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
- "version": "0.0.1",
+ "version": __version__,
},
}
expected_deposit_info = {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1459900800},
},
"author_date": {
"negative_utc": False,
"offset": 0,
"timestamp": {"microseconds": 0, "seconds": 1459900800},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
}
expected_meta = {
"origin": expected_origin,
"origin_metadata": expected_origin_metadata,
"deposit": expected_deposit_info,
}
assert data == expected_meta
def test_read_metadata_5(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""dateCreated/datePublished provided, revision uses author/committer
date
If multiple dateCreated provided, the first occurrence (of
dateCreated) is selected. If multiple datePublished provided,
the first occurrence (of datePublished) is selected.
"""
deposit = partial_deposit
# add metadata to the deposit with multiple datePublished/dateCreated
codemeta_entry_data = (
atom_dataset["metadata"]
% """
2015-04-06T17:08:47+02:002017-05-03T16:08:47+02:002016-04-06T17:08:47+02:002018-05-03T16:08:47+02:00
"""
)
deposit = update_deposit_with_metadata(
authenticated_client, deposit_collection, deposit, codemeta_entry_data
)
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response._headers["content-type"][1] == "application/json"
data = response.json()
expected_origin = {
"type": "deposit",
"url": "https://hal-test.archives-ouvertes.fr/external-id-partial",
}
metadata = {
"@xmlns": "http://www.w3.org/2005/Atom",
"@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
"author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"},
"client": "hal",
"codemeta:applicationCategory": "test",
- "codemeta:author": {"codemeta:name": "Morane " "Gruenpeter"},
+ "codemeta:author": {"codemeta:name": "Morane Gruenpeter"},
"codemeta:dateCreated": [
"2015-04-06T17:08:47+02:00",
"2016-04-06T17:08:47+02:00",
],
"codemeta:datePublished": [
"2017-05-03T16:08:47+02:00",
"2018-05-03T16:08:47+02:00",
],
"codemeta:description": "this is the description",
"codemeta:developmentStatus": "stable",
"codemeta:keywords": "DSP programming",
"codemeta:license": [
{
"codemeta:name": "GNU "
"General "
"Public "
"License "
"v3.0 "
"only"
},
{
"codemeta:name": "CeCILL "
"Free "
"Software "
"License "
"Agreement "
"v1.1"
},
],
"codemeta:programmingLanguage": ["php", "python", "C"],
"codemeta:runtimePlatform": "phpstorm",
"codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa
"codemeta:version": "1",
"external_identifier": "hal-01243065",
"id": "hal-01243065",
- "title": "Composing a Web of Audio " "Applications",
+ "title": "Composing a Web of Audio Applications",
}
expected_origin_metadata = {
"metadata": metadata,
"provider": {
"metadata": {},
"provider_name": "",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
},
"tool": {
"configuration": {"sword_version": "2"},
"name": "swh-deposit",
- "version": "0.0.1",
+ "version": __version__,
},
}
expected_deposit_info = {
"author": SWH_PERSON,
"committer": SWH_PERSON,
"committer_date": {
"negative_utc": False,
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1493820527},
},
"author_date": {
"negative_utc": False,
"offset": 120,
"timestamp": {"microseconds": 0, "seconds": 1428332927},
},
"client": deposit_collection.name,
"id": deposit.id,
"collection": deposit_collection.name,
"revision_parents": [],
}
expected_meta = {
"origin": expected_origin,
"origin_metadata": expected_origin_metadata,
"deposit": expected_deposit_info,
}
assert data == expected_meta
def test_access_to_nonexisting_deposit_returns_404_response(
authenticated_client, deposit_collection,
):
"""Read unknown collection should return a 404 response
"""
unknown_id = 999
try:
Deposit.objects.get(pk=unknown_id)
except Deposit.DoesNotExist:
assert True
for url in private_get_raw_url_endpoints(deposit_collection, unknown_id):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_404_NOT_FOUND
msg = "Deposit with id %s does not exist" % unknown_id
assert msg in response.content.decode("utf-8")
diff --git a/swh/deposit/tests/api/test_deposit_private_update_status.py b/swh/deposit/tests/api/test_deposit_private_update_status.py
index ba07f0bc..f93801de 100644
--- a/swh/deposit/tests/api/test_deposit_private_update_status.py
+++ b/swh/deposit/tests/api/test_deposit_private_update_status.py
@@ -1,194 +1,191 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import copy
import json
from django.urls import reverse
from rest_framework import status
-from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT
-
from swh.deposit.api.private.deposit_update_status import MANDATORY_KEYS
-
-from swh.deposit.models import Deposit
from swh.deposit.config import (
- PRIVATE_PUT_DEPOSIT,
- DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_LOAD_FAILURE,
+ DEPOSIT_STATUS_LOAD_SUCCESS,
+ PRIVATE_PUT_DEPOSIT,
)
-
+from swh.deposit.models import Deposit
+from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid
PRIVATE_PUT_DEPOSIT_NC = PRIVATE_PUT_DEPOSIT + "-nc"
def private_check_url_endpoints(collection, deposit):
"""There are 2 endpoints to check (one with collection, one without)"""
return [
reverse(PRIVATE_PUT_DEPOSIT, args=[collection.name, deposit.id]),
reverse(PRIVATE_PUT_DEPOSIT_NC, args=[deposit.id]),
]
def test_update_deposit_status_success_with_info(
authenticated_client, deposit_collection, ready_deposit_verified
):
"""Update deposit with load success should require all information to succeed
"""
deposit = ready_deposit_verified
expected_status = DEPOSIT_STATUS_LOAD_SUCCESS
origin_url = "something"
directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
revision_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e"
snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20"
full_body_info = {
"status": DEPOSIT_STATUS_LOAD_SUCCESS,
"revision_id": revision_id,
"directory_id": directory_id,
"snapshot_id": snapshot_id,
"origin_url": origin_url,
}
for url in private_check_url_endpoints(deposit_collection, deposit):
- dir_id = persistent_identifier(DIRECTORY, directory_id)
- rev_id = persistent_identifier(REVISION, revision_id)
- snp_id = persistent_identifier(SNAPSHOT, snapshot_id)
+ dir_id = swhid(DIRECTORY, directory_id)
+ rev_id = swhid(REVISION, revision_id)
+ snp_id = swhid(SNAPSHOT, snapshot_id)
expected_swh_id = "swh:1:dir:%s" % directory_id
expected_swh_id_context = (
f"{dir_id};origin={origin_url};" + f"visit={snp_id};anchor={rev_id};path=/"
)
response = authenticated_client.put(
url, content_type="application/json", data=json.dumps(full_body_info),
)
assert response.status_code == status.HTTP_204_NO_CONTENT
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == expected_status
assert deposit.swh_id == expected_swh_id
assert deposit.swh_id_context == expected_swh_id_context
# Reset deposit
deposit = ready_deposit_verified
deposit.save()
def test_update_deposit_status_rejected_with_info(
authenticated_client, deposit_collection, ready_deposit_verified
):
"""Update deposit with rejected status needs few information to succeed
"""
deposit = ready_deposit_verified
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.put(
url,
content_type="application/json",
data=json.dumps({"status": DEPOSIT_STATUS_LOAD_FAILURE}),
)
assert response.status_code == status.HTTP_204_NO_CONTENT
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE
assert deposit.swh_id is None
assert deposit.swh_id_context is None
# Reset status
deposit = ready_deposit_verified
deposit.save()
def test_update_deposit_status_success_with_incomplete_data(
authenticated_client, deposit_collection, ready_deposit_verified
):
"""Update deposit status with status success and incomplete information should fail
"""
deposit = ready_deposit_verified
origin_url = "something"
directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
revision_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e"
snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20"
new_status = DEPOSIT_STATUS_LOAD_SUCCESS
full_body_info = {
"status": new_status,
"revision_id": revision_id,
"directory_id": directory_id,
"snapshot_id": snapshot_id,
"origin_url": origin_url,
}
for url in private_check_url_endpoints(deposit_collection, deposit):
for key in MANDATORY_KEYS:
# Crafting body with missing information so that it raises
body = copy.deepcopy(full_body_info)
body.pop(key) # make the body incomplete
response = authenticated_client.put(
url, content_type="application/json", data=json.dumps(body),
)
assert response.status_code == status.HTTP_400_BAD_REQUEST
assert (
f"deposit status to {new_status} requires information {key}"
in response.content.decode("utf-8")
)
def test_update_deposit_status_will_fail_with_unknown_status(
authenticated_client, deposit_collection, ready_deposit_verified
):
"""Unknown status for update should return a 400 response
"""
deposit = ready_deposit_verified
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.put(
url, content_type="application/json", data=json.dumps({"status": "unknown"})
)
assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_update_deposit_status_will_fail_with_no_status_key(
authenticated_client, deposit_collection, ready_deposit_verified
):
"""No status provided for update should return a 400 response
"""
deposit = ready_deposit_verified
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.put(
url,
content_type="application/json",
data=json.dumps({"something": "something"}),
)
assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_update_deposit_status_success_without_swh_id_fail(
authenticated_client, deposit_collection, ready_deposit_verified
):
"""Providing successful status without swh_id should return a 400
"""
deposit = ready_deposit_verified
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.put(
url,
content_type="application/json",
data=json.dumps({"status": DEPOSIT_STATUS_LOAD_SUCCESS}),
)
assert response.status_code == status.HTTP_400_BAD_REQUEST
diff --git a/swh/deposit/tests/api/test_deposit_schedule.py b/swh/deposit/tests/api/test_deposit_schedule.py
index 8541420a..4218797e 100644
--- a/swh/deposit/tests/api/test_deposit_schedule.py
+++ b/swh/deposit/tests/api/test_deposit_schedule.py
@@ -1,91 +1,81 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import copy
import datetime
from io import BytesIO
-from typing import Dict
from django.urls import reverse
import pytest
from rest_framework import status
-from swh.deposit.config import (
- COL_IRI,
- DEPOSIT_STATUS_DEPOSITED,
-)
+from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED
from swh.deposit.parsers import parse_xml
-from ..conftest import TEST_CONFIG
-
-
-TEST_CONFIG_WITH_CHECKS: Dict[str, object] = {
- **TEST_CONFIG,
- "checks": True,
-}
-
-
@pytest.fixture()
-def deposit_config():
+def deposit_config(deposit_config):
"""Overrides the `deposit_config` fixture define in swh/deposit/tests/conftest.py
to re-enable the checks."""
- return TEST_CONFIG_WITH_CHECKS
+ config_d = copy.deepcopy(deposit_config)
+ config_d["checks"] = True
+ return config_d
def now() -> datetime.datetime:
return datetime.datetime.now(tz=datetime.timezone.utc)
def test_add_deposit_schedules_check(
authenticated_client, deposit_collection, sample_archive, swh_scheduler
):
"""Posting deposit on collection creates a checker task
"""
external_id = "external-id-schedules-check"
url = reverse(COL_IRI, args=[deposit_collection.name])
timestamp_before_call = now()
response = authenticated_client.post(
url,
content_type="application/zip", # as zip
data=sample_archive["data"],
# + headers
CONTENT_LENGTH=sample_archive["length"],
HTTP_SLUG=external_id,
HTTP_CONTENT_MD5=sample_archive["md5sum"],
HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
HTTP_IN_PROGRESS="false",
HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]),
)
timestamp_after_call = now()
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(BytesIO(response.content))
actual_state = response_content["deposit_status"]
assert actual_state == DEPOSIT_STATUS_DEPOSITED
deposit_id = response_content["deposit_id"]
tasks = swh_scheduler.grab_ready_tasks("check-deposit")
assert len(tasks) == 1
task = tasks[0]
assert timestamp_before_call <= task.pop("next_run") <= timestamp_after_call
assert task == {
"arguments": {
"args": [],
"kwargs": {"collection": "test", "deposit_id": int(deposit_id),},
},
"current_interval": datetime.timedelta(days=1),
"id": 1,
"policy": "oneshot",
"priority": None,
"retries_left": 3,
"status": "next_run_scheduled",
"type": "check-deposit",
}
diff --git a/swh/deposit/tests/api/test_deposit_status.py b/swh/deposit/tests/api/test_deposit_status.py
index 4b03f7c7..c8f5f89e 100644
--- a/swh/deposit/tests/api/test_deposit_status.py
+++ b/swh/deposit/tests/api/test_deposit_status.py
@@ -1,120 +1,121 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from django.urls import reverse
from io import BytesIO
+
+from django.urls import reverse
from rest_framework import status
from swh.deposit.config import (
- STATE_IRI,
DEPOSIT_STATUS_DEPOSITED,
DEPOSIT_STATUS_REJECTED,
+ STATE_IRI,
)
from swh.deposit.models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS
from swh.deposit.parsers import parse_xml
def test_post_deposit_with_status_check(authenticated_client, deposited_deposit):
"""Successful but not loaded deposit should have a status 'deposited'
"""
deposit = deposited_deposit
status_url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id])
# check status
status_response = authenticated_client.get(status_url)
assert status_response.status_code == status.HTTP_200_OK
r = parse_xml(BytesIO(status_response.content))
assert int(r["deposit_id"]) == deposit.id
assert r["deposit_status"] == DEPOSIT_STATUS_DEPOSITED
assert r["deposit_status_detail"] == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_DEPOSITED]
assert r["deposit_external_id"] == deposit.external_id
def test_status_unknown_deposit(authenticated_client, deposit_collection):
"""Unknown deposit status should return 404 response
"""
unknown_deposit_id = 999
status_url = reverse(STATE_IRI, args=[deposit_collection.name, unknown_deposit_id])
status_response = authenticated_client.get(status_url)
assert status_response.status_code == status.HTTP_404_NOT_FOUND
def test_status_unknown_collection(authenticated_client, deposited_deposit):
"""Unknown collection status should return 404 response"""
deposit = deposited_deposit
unknown_collection = "something-unknown"
status_url = reverse(STATE_IRI, args=[unknown_collection, deposit.id])
status_response = authenticated_client.get(status_url)
assert status_response.status_code == status.HTTP_404_NOT_FOUND
def test_status_deposit_rejected(authenticated_client, rejected_deposit):
"""Rejected deposit status should be 'rejected' with detailed summary
"""
deposit = rejected_deposit
# _status_detail = {'url': {'summary': 'Wrong url'}}
url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id])
# when
status_response = authenticated_client.get(url)
# then
assert status_response.status_code == status.HTTP_200_OK
r = parse_xml(BytesIO(status_response.content))
assert int(r["deposit_id"]) == deposit.id
assert r["deposit_status"] == DEPOSIT_STATUS_REJECTED
assert r["deposit_status_detail"] == "Deposit failed the checks"
if deposit.swh_id:
assert r["deposit_swh_id"] == deposit.swh_id
def test_status_with_http_accept_header_should_not_break(
authenticated_client, partial_deposit
):
"""Asking deposit status with Accept header should return 200
"""
deposit = partial_deposit
status_url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id])
response = authenticated_client.get(status_url)
assert response.status_code == status.HTTP_200_OK
response = authenticated_client.get(
status_url, HTTP_ACCEPT="text/html,application/xml;q=9,*/*,q=8"
)
assert response.status_code == status.HTTP_200_OK
def test_status_complete_deposit(authenticated_client, complete_deposit):
"""Successful and loaded deposit should be 'done' and have detailed swh ids
"""
deposit = complete_deposit
url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id])
# when
status_response = authenticated_client.get(url)
# then
assert status_response.status_code == status.HTTP_200_OK
r = parse_xml(BytesIO(status_response.content))
assert int(r["deposit_id"]) == deposit.id
assert r["deposit_status"] == DEPOSIT_STATUS_LOAD_SUCCESS
assert (
r["deposit_status_detail"] == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_LOAD_SUCCESS]
)
assert deposit.swh_id is not None
assert r["deposit_swh_id"] == deposit.swh_id
assert deposit.swh_id_context is not None
assert r["deposit_swh_id_context"] == deposit.swh_id_context
diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py
index 43b268cd..0b173c4f 100644
--- a/swh/deposit/tests/api/test_deposit_update.py
+++ b/swh/deposit/tests/api/test_deposit_update.py
@@ -1,395 +1,394 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse
from rest_framework import status
-from swh.deposit.models import Deposit, DepositRequest, DepositCollection
from swh.deposit.config import EDIT_SE_IRI, EM_IRI
+from swh.deposit.models import Deposit, DepositCollection, DepositRequest
from swh.deposit.parsers import parse_xml
-
-from swh.deposit.tests.common import create_arborescence_archive, check_archive
+from swh.deposit.tests.common import check_archive, create_arborescence_archive
def test_replace_archive_to_deposit_is_possible(
tmp_path,
partial_deposit,
deposit_collection,
authenticated_client,
sample_archive,
atom_dataset,
):
"""Replace all archive with another one should return a 204 response
"""
tmp_path = str(tmp_path)
# given
deposit = partial_deposit
requests = DepositRequest.objects.filter(deposit=deposit, type="archive")
assert len(list(requests)) == 1
check_archive(sample_archive["name"], requests[0].archive.name)
# we have no metadata for that deposit
requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata"))
assert len(requests) == 0
response = authenticated_client.post(
reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]),
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data1"],
HTTP_SLUG=deposit.external_id,
HTTP_IN_PROGRESS=True,
)
requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata"))
assert len(requests) == 1
update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id])
external_id = "some-external-id-1"
archive2 = create_arborescence_archive(
tmp_path, "archive2", "file2", b"some other content in file"
)
response = authenticated_client.put(
update_uri,
content_type="application/zip", # as zip
data=archive2["data"],
# + headers
CONTENT_LENGTH=archive2["length"],
HTTP_SLUG=external_id,
HTTP_CONTENT_MD5=archive2["md5sum"],
HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
HTTP_IN_PROGRESS="false",
HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],),
)
assert response.status_code == status.HTTP_204_NO_CONTENT
requests = DepositRequest.objects.filter(deposit=deposit, type="archive")
assert len(list(requests)) == 1
check_archive(archive2["name"], requests[0].archive.name)
# check we did not touch the other parts
requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata"))
assert len(requests) == 1
def test_replace_metadata_to_deposit_is_possible(
tmp_path,
authenticated_client,
partial_deposit_with_metadata,
deposit_collection,
atom_dataset,
):
"""Replace all metadata with another one should return a 204 response
"""
# given
deposit = partial_deposit_with_metadata
raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8")
requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata")
assert len(requests_meta) == 1
request_meta0 = requests_meta[0]
assert request_meta0.raw_metadata == raw_metadata0
requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive")
assert len(requests_archive0) == 1
update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
response = authenticated_client.put(
update_uri,
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data1"],
)
assert response.status_code == status.HTTP_204_NO_CONTENT
requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata")
assert len(requests_meta) == 1
request_meta1 = requests_meta[0]
raw_metadata1 = request_meta1.raw_metadata
assert raw_metadata1 == atom_dataset["entry-data1"]
assert raw_metadata0 != raw_metadata1
assert request_meta0 != request_meta1
# check we did not touch the other parts
requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive")
assert len(requests_archive1) == 1
assert set(requests_archive0) == set(requests_archive1)
def test_add_archive_to_deposit_is_possible(
tmp_path,
authenticated_client,
deposit_collection,
partial_deposit_with_metadata,
sample_archive,
):
"""Add another archive to a deposit return a 201 response
"""
tmp_path = str(tmp_path)
deposit = partial_deposit_with_metadata
requests = DepositRequest.objects.filter(deposit=deposit, type="archive")
assert len(requests) == 1
check_archive(sample_archive["name"], requests[0].archive.name)
requests_meta0 = DepositRequest.objects.filter(deposit=deposit, type="metadata")
assert len(requests_meta0) == 1
update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id])
external_id = "some-external-id-1"
archive2 = create_arborescence_archive(
tmp_path, "archive2", "file2", b"some other content in file"
)
response = authenticated_client.post(
update_uri,
content_type="application/zip", # as zip
data=archive2["data"],
# + headers
CONTENT_LENGTH=archive2["length"],
HTTP_SLUG=external_id,
HTTP_CONTENT_MD5=archive2["md5sum"],
HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip",
HTTP_IN_PROGRESS="false",
HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],),
)
assert response.status_code == status.HTTP_201_CREATED
requests = DepositRequest.objects.filter(deposit=deposit, type="archive").order_by(
"id"
)
assert len(requests) == 2
# first archive still exists
check_archive(sample_archive["name"], requests[0].archive.name)
# a new one was added
check_archive(archive2["name"], requests[1].archive.name)
# check we did not touch the other parts
requests_meta1 = DepositRequest.objects.filter(deposit=deposit, type="metadata")
assert len(requests_meta1) == 1
assert set(requests_meta0) == set(requests_meta1)
def test_add_metadata_to_deposit_is_possible(
authenticated_client,
deposit_collection,
partial_deposit_with_metadata,
atom_dataset,
):
"""Add metadata with another one should return a 204 response
"""
deposit = partial_deposit_with_metadata
requests = DepositRequest.objects.filter(deposit=deposit, type="metadata")
assert len(requests) == 1
requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive")
assert len(requests_archive0) == 1
update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id])
atom_entry = atom_dataset["entry-data1"]
response = authenticated_client.post(
update_uri, content_type="application/atom+xml;type=entry", data=atom_entry
)
assert response.status_code == status.HTTP_201_CREATED
requests = DepositRequest.objects.filter(deposit=deposit, type="metadata").order_by(
"id"
)
assert len(requests) == 2
expected_raw_meta0 = atom_dataset["entry-data0"] % (
deposit.external_id.encode("utf-8")
)
# a new one was added
assert requests[0].raw_metadata == expected_raw_meta0
assert requests[1].raw_metadata == atom_entry
# check we did not touch the other parts
requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive")
assert len(requests_archive1) == 1
assert set(requests_archive0) == set(requests_archive1)
def test_add_metadata_to_unknown_deposit(
deposit_collection, authenticated_client, atom_dataset
):
"""Replacing metadata to unknown deposit should return a 404 response
"""
unknown_deposit_id = 1000
try:
Deposit.objects.get(pk=unknown_deposit_id)
except Deposit.DoesNotExist:
assert True
url = reverse(EDIT_SE_IRI, args=[deposit_collection, unknown_deposit_id])
response = authenticated_client.post(
url,
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data1"],
)
assert response.status_code == status.HTTP_404_NOT_FOUND
response_content = parse_xml(response.content)
assert "Unknown collection name" in response_content["sword:error"]["summary"]
def test_add_metadata_to_unknown_collection(
partial_deposit, authenticated_client, atom_dataset
):
"""Replacing metadata to unknown deposit should return a 404 response
"""
deposit = partial_deposit
unknown_collection_name = "unknown-collection"
try:
DepositCollection.objects.get(name=unknown_collection_name)
except DepositCollection.DoesNotExist:
assert True
url = reverse(EDIT_SE_IRI, args=[unknown_collection_name, deposit.id])
response = authenticated_client.post(
url,
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data1"],
)
assert response.status_code == status.HTTP_404_NOT_FOUND
response_content = parse_xml(response.content)
assert "Unknown collection name" in response_content["sword:error"]["summary"]
def test_replace_metadata_to_unknown_deposit(
authenticated_client, deposit_collection, atom_dataset
):
"""Adding metadata to unknown deposit should return a 404 response
"""
unknown_deposit_id = 998
try:
Deposit.objects.get(pk=unknown_deposit_id)
except Deposit.DoesNotExist:
assert True
url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, unknown_deposit_id])
response = authenticated_client.put(
url,
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data1"],
)
assert response.status_code == status.HTTP_404_NOT_FOUND
response_content = parse_xml(response.content)
assert (
"Deposit with id %s does not exist" % unknown_deposit_id
== response_content["sword:error"]["summary"]
)
def test_add_archive_to_unknown_deposit(
authenticated_client, deposit_collection, atom_dataset
):
"""Adding metadata to unknown deposit should return a 404 response
"""
unknown_deposit_id = 997
try:
Deposit.objects.get(pk=unknown_deposit_id)
except Deposit.DoesNotExist:
assert True
url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id])
response = authenticated_client.post(
url, content_type="application/zip", data=atom_dataset["entry-data1"]
)
assert response.status_code == status.HTTP_404_NOT_FOUND
response_content = parse_xml(response.content)
assert (
"Deposit with id %s does not exist" % unknown_deposit_id
== response_content["sword:error"]["summary"]
)
def test_replace_archive_to_unknown_deposit(
authenticated_client, deposit_collection, atom_dataset
):
"""Replacing archive to unknown deposit should return a 404 response
"""
unknown_deposit_id = 996
try:
Deposit.objects.get(pk=unknown_deposit_id)
except Deposit.DoesNotExist:
assert True
url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id])
response = authenticated_client.put(
url, content_type="application/zip", data=atom_dataset["entry-data1"]
)
assert response.status_code == status.HTTP_404_NOT_FOUND
response_content = parse_xml(response.content)
assert (
"Deposit with id %s does not exist" % unknown_deposit_id
== response_content["sword:error"]["summary"]
)
def test_post_metadata_to_em_iri_failure(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Update (POST) archive with wrong content type should return 400
"""
deposit = partial_deposit
update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id])
response = authenticated_client.post(
update_uri,
content_type="application/x-gtar-compressed",
data=atom_dataset["entry-data1"],
)
assert response.status_code == status.HTTP_400_BAD_REQUEST
response_content = parse_xml(response.content)
msg = (
"Packaging format supported is restricted to "
+ "application/zip, application/x-tar"
)
assert msg == response_content["sword:error"]["summary"]
def test_put_metadata_to_em_iri_failure(
authenticated_client, deposit_collection, partial_deposit, atom_dataset
):
"""Update (PUT) archive with wrong content type should return 400
"""
# given
deposit = partial_deposit
# when
update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id])
response = authenticated_client.put(
update_uri,
content_type="application/atom+xml;type=entry",
data=atom_dataset["entry-data1"],
)
# then
assert response.status_code == status.HTTP_400_BAD_REQUEST
response_content = parse_xml(response.content)
msg = (
"Packaging format supported is restricted to "
+ "application/zip, application/x-tar"
)
assert msg == response_content["sword:error"]["summary"]
diff --git a/swh/deposit/tests/api/test_exception.py b/swh/deposit/tests/api/test_exception.py
index 0d71926b..a606397f 100644
--- a/swh/deposit/tests/api/test_exception.py
+++ b/swh/deposit/tests/api/test_exception.py
@@ -1,53 +1,52 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from swh.deposit.exception import custom_exception_handler
-
+from django.db.utils import OperationalError
from rest_framework.exceptions import APIException
from rest_framework.response import Response
-from django.db.utils import OperationalError
+from swh.deposit.exception import custom_exception_handler
def test_custom_exception_handler_operational_error(mocker):
"""Operation error are translated to service unavailable
"""
fake_exception = OperationalError("Fake internal error", 503)
response = custom_exception_handler(fake_exception, {})
assert response is not None
assert response.status_code == 503
status = "Database backend maintenance"
detail = "Service temporarily unavailable, try again later."
assert (
response.content.decode("utf-8")
== f"""
{status}{detail}
"""
)
def test_custom_exception_handler_default_behavior_maintained(mocker):
"""Other internal errors are transmitted as is
"""
fake_exception = APIException("Fake internal error", 500)
fake_response = Response(
exception=fake_exception, status=fake_exception.status_code
)
mock_exception_handler = mocker.patch("swh.deposit.exception.exception_handler")
mock_exception_handler.return_value = fake_response
response = custom_exception_handler(fake_exception, {})
assert response is not None
assert response == fake_response
diff --git a/swh/deposit/tests/api/test_parser.py b/swh/deposit/tests/api/test_parser.py
index b1cc9119..0adea4f5 100644
--- a/swh/deposit/tests/api/test_parser.py
+++ b/swh/deposit/tests/api/test_parser.py
@@ -1,134 +1,133 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import io
-
from collections import OrderedDict
+import io
from swh.deposit.parsers import SWHXMLParser
def test_parsing_without_duplicates():
xml_no_duplicate = io.BytesIO(
b"""
Awesome CompilerGPL3.0https://opensource.org/licenses/GPL-3.0Python3author1Inriaocamlhttp://issuetracker.com"""
)
actual_result = SWHXMLParser().parse(xml_no_duplicate)
expected_dict = OrderedDict(
[
("@xmlns", "http://www.w3.org/2005/Atom"),
("@xmlns:codemeta", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"),
("title", "Awesome Compiler"),
(
"codemeta:license",
OrderedDict(
[
("codemeta:name", "GPL3.0"),
("codemeta:url", "https://opensource.org/licenses/GPL-3.0"),
]
),
),
("codemeta:runtimePlatform", "Python3"),
(
"codemeta:author",
OrderedDict(
[("codemeta:name", "author1"), ("codemeta:affiliation", "Inria")]
),
),
("codemeta:programmingLanguage", "ocaml"),
("codemeta:issueTracker", "http://issuetracker.com"),
]
)
assert expected_dict == actual_result
def test_parsing_with_duplicates():
xml_with_duplicates = io.BytesIO(
b"""
Another CompilerGNU/LinuxGPL3.0https://opensource.org/licenses/GPL-3.0Un*xauthor1Inriaauthor2Inriaocamlhaskellspdxhttp://spdx.orgpython3"""
)
actual_result = SWHXMLParser().parse(xml_with_duplicates)
expected_dict = OrderedDict(
[
("@xmlns", "http://www.w3.org/2005/Atom"),
("@xmlns:codemeta", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"),
("title", "Another Compiler"),
("codemeta:runtimePlatform", ["GNU/Linux", "Un*x"]),
(
"codemeta:license",
[
OrderedDict(
[
("codemeta:name", "GPL3.0"),
("codemeta:url", "https://opensource.org/licenses/GPL-3.0"),
]
),
OrderedDict(
[("codemeta:name", "spdx"), ("codemeta:url", "http://spdx.org")]
),
],
),
(
"codemeta:author",
[
OrderedDict(
[
("codemeta:name", "author1"),
("codemeta:affiliation", "Inria"),
]
),
OrderedDict(
[
("codemeta:name", "author2"),
("codemeta:affiliation", "Inria"),
]
),
],
),
("codemeta:programmingLanguage", ["ocaml", "haskell", "python3"]),
]
)
assert expected_dict == actual_result
diff --git a/swh/deposit/tests/cli/data/atom/codemeta-sample.xml b/swh/deposit/tests/cli/data/atom/codemeta-sample.xml
deleted file mode 100644
index d804eff5..00000000
--- a/swh/deposit/tests/cli/data/atom/codemeta-sample.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-
-
- %s
- hal-01587361
- https://hal.inria.fr/hal-01587361
- https://hal.inria.fr/hal-01587361/document
- https://hal.inria.fr/hal-01587361/file/AffectationRO-v1.0.0.zip
- doi:10.5281/zenodo.438684
- The assignment problem
- AffectationRO
- Gruenpeter, Morane
- [INFO] Computer Science [cs]
- [INFO.INFO-RO] Computer Science [cs]/Operations Research [cs.RO]
- SOFTWARE
- Project in OR: The assignment problemA java implementation for the assignment problem first release
- description fr
- 2015-06-01
- 2017-10-19
- en
-
-
- url stable
- Version sur hal
- Version entre par lutilisateur
- Mots-cls
- Commentaire
- Rfrence interne
-
- Collaboration/Projet
- nom du projet
- id
-
- Voir aussi
- Financement
- Projet ANR
- Projet Europen
- Platform/OS
- Dpendances
- Etat du dveloppement
-
- license
- url spdx
-
- Outils de dveloppement- outil no1
- Outils de dveloppement- outil no2
- http://code.com
- language 1
- language 2
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml b/swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml
deleted file mode 100644
index 25a417fb..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml
+++ /dev/null
@@ -1,2 +0,0 @@
-
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml b/swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml
deleted file mode 100644
index 65b7f63b..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-
-
- Title
- urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
- 2005-10-07T17:17:08Z
- Contributor
- The abstract
-
-
- The abstract
- Access Rights
- Alternative Title
- Date Available
- Bibliographic Citation # noqa
- Contributor
- Description
- Has Part
- Has Version
- Identifier
- Is Part Of
- Publisher
- References
- Rights Holder
- Source
- Title
- Type
-
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml b/swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml
deleted file mode 100644
index e4caf44f..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml
+++ /dev/null
@@ -1,2 +0,0 @@
-
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data-ko.xml b/swh/deposit/tests/cli/data/atom/entry-data-ko.xml
deleted file mode 100644
index 3f5d8802..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data-ko.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
- urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data-minimal.xml b/swh/deposit/tests/cli/data/atom/entry-data-minimal.xml
deleted file mode 100644
index 9432ac0e..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data-minimal.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
- %s
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml b/swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml
deleted file mode 100644
index 34710195..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-
-
- Composing a Web of Audio Applications
-
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data0.xml b/swh/deposit/tests/cli/data/atom/entry-data0.xml
deleted file mode 100644
index 2b0ccc00..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data0.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
- Awesome Compiler
- hal
- urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a
- %s
- 2017-10-07T15:17:08Z
- some awesome author
- something
- awesome-compiler
- This is an awesome compiler destined to
-awesomely compile stuff
-and other stuff
- compiler,programming,language
- 2005-10-07T17:17:08Z
- 2005-10-07T17:17:08Z
- release note
- related link
-
- Awesome
- https://hoster.org/awesome-compiler
- GNU/Linux
- 0.0.1
- running
- all
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data1.xml b/swh/deposit/tests/cli/data/atom/entry-data1.xml
deleted file mode 100644
index e4f415c7..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data1.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
- hal
- urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a
- 2017-10-07T15:17:08Z
- some awesome author
- something
- awesome-compiler
- This is an awesome compiler destined to
-awesomely compile stuff
-and other stuff
- compiler,programming,language
- 2005-10-07T17:17:08Z
- 2005-10-07T17:17:08Z
- release note
- related link
-
- Awesome
- https://hoster.org/awesome-compiler
- GNU/Linux
- 0.0.1
- running
- all
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data2.xml b/swh/deposit/tests/cli/data/atom/entry-data2.xml
deleted file mode 100644
index 73cfafeb..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data2.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
- some-external-id
- https://hal-test.archives-ouvertes.fr/some-external-id
- some awesome author
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-data3.xml b/swh/deposit/tests/cli/data/atom/entry-data3.xml
deleted file mode 100644
index c75d9739..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-data3.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
- another one
- no one
- 2017-10-07T15:17:08Z
-
diff --git a/swh/deposit/tests/cli/data/atom/entry-update-in-place.xml b/swh/deposit/tests/cli/data/atom/entry-update-in-place.xml
deleted file mode 100644
index 1a7d7bbb..00000000
--- a/swh/deposit/tests/cli/data/atom/entry-update-in-place.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
- urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa7b
- Title
- Type
-
diff --git a/swh/deposit/tests/cli/data/atom/error-with-decimal.xml b/swh/deposit/tests/cli/data/atom/error-with-decimal.xml
deleted file mode 100644
index be002442..00000000
--- a/swh/deposit/tests/cli/data/atom/error-with-decimal.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-
-
- Composing a Web of Audio Applications
- hal
- hal-01243065
- hal-01243065
- https://hal-test.archives-ouvertes.fr/hal-01243065
- test
-
-
- DSP programming,Web,Composability,Faust
- 2017-05-03T16:08:47+02:00
- The Web offers a great opportunity to share, deploy and use programs without installation difficulties. In this article we explore the idea of freely combining/composing real-time audio applications deployed on the Web using Faust audio DSP language.
- 1
- 10.4
- phpstorm
- stable
-
- linux
- php
- python
- C
-
- GNU General Public License v3.0 only
-
-
- CeCILL Free Software License Agreement v1.1
-
-
- HAL
- hal@ccsd.cnrs.fr
-
-
- Someone Nice
- someone@nice.fr
- FFJ
-
-
diff --git a/swh/deposit/tests/cli/data/atom/metadata.xml b/swh/deposit/tests/cli/data/atom/metadata.xml
deleted file mode 100644
index 65f58543..00000000
--- a/swh/deposit/tests/cli/data/atom/metadata.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-
-
- Composing a Web of Audio Applications
- hal
- hal-01243065
- hal-01243065
- https://hal-test.archives-ouvertes.fr/hal-01243065
- test
- DSP programming
- this is the description
- 1
- phpstorm
- stable
- php
- python
- C
-
- GNU General Public License v3.0 only
-
-
- CeCILL Free Software License Agreement v1.1
-
-
- HAL
- hal@ccsd.cnrs.fr
-
-
- Morane Gruenpeter
-
-%s
-
diff --git a/swh/deposit/tests/cli/data/atom/tei-sample.xml b/swh/deposit/tests/cli/data/atom/tei-sample.xml
deleted file mode 100644
index cf2266af..00000000
--- a/swh/deposit/tests/cli/data/atom/tei-sample.xml
+++ /dev/null
@@ -1 +0,0 @@
-HAL TEI export of hal-01587083CCSDDistributed under a Creative Commons Attribution 4.0 International License
HAL API platform
questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733MoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.com2017-09-29 11:21:322017-10-03 17:20:132017-10-03 17:20:132017-09-292017-09-29contributorMoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.comCCSDhal-01587083https://hal.inria.fr/hal-01587083gruenpeter:hal-0158708320172017questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733EnglishComputer Science [cs]SoftwareIRILLInitiative pour la Recherche et l'Innovation sur le Logiciel Librehttps://www.irill.org/Universite Pierre et Marie Curie - Paris 6UPMC4 place Jussieu - 75005 Parishttp://www.upmc.fr/Institut National de Recherche en Informatique et en AutomatiqueInriaDomaine de VoluceauRocquencourt - BP 10578153 Le Chesnay Cedexhttp://www.inria.fr/en/Universite Paris Diderot - Paris 7UPD75 rue Thomas-Mann - 75205 Paris cedex 13http://www.univ-paris-diderot.fr
diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py
index 7b5fd7ff..2e793fa4 100644
--- a/swh/deposit/tests/cli/test_client.py
+++ b/swh/deposit/tests/cli/test_client.py
@@ -1,457 +1,463 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import contextlib
import logging
import os
import re
from unittest.mock import MagicMock
from click.testing import CliRunner
import pytest
-from swh.deposit.client import PublicApiDepositClient, MaintenanceError
-from swh.deposit.cli.client import generate_slug, _url, _client, _collection, InputError
from swh.deposit.cli import deposit as cli
-from ..conftest import TEST_USER
+from swh.deposit.cli.client import InputError, _client, _collection, _url, generate_slug
+from swh.deposit.client import MaintenanceError, PublicApiDepositClient
+from ..conftest import TEST_USER
EXAMPLE_SERVICE_DOCUMENT = {
"service": {"workspace": {"collection": {"sword:name": "softcol",}}}
}
+@pytest.fixture
+def datadir(request):
+ """Override default datadir to target main test datadir"""
+ return os.path.join(os.path.dirname(str(request.fspath)), "../data")
+
+
@pytest.fixture
def slug():
return generate_slug()
@pytest.fixture
def client_mock(mocker, slug):
"""A successful deposit client with hard-coded default values
"""
mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug)
mock_client = MagicMock()
mocker.patch("swh.deposit.cli.client._client", return_value=mock_client)
mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT
mock_client.deposit_create.return_value = '{"foo": "bar"}'
return mock_client
@pytest.fixture
def client_mock_api_down(mocker, slug):
"""A mock client whose connection with api fails due to maintenance issue
"""
mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug)
mock_client = MagicMock()
mocker.patch("swh.deposit.cli.client._client", return_value=mock_client)
mock_client.service_document.side_effect = MaintenanceError(
"Database backend maintenance: Temporarily unavailable, try again later."
)
return mock_client
def test_url():
assert _url("http://deposit") == "http://deposit/1"
assert _url("https://other/1") == "https://other/1"
def test_client():
client = _client("http://deposit", "user", "pass")
assert isinstance(client, PublicApiDepositClient)
def test_collection_error():
mock_client = MagicMock()
mock_client.service_document.return_value = {"error": "something went wrong"}
with pytest.raises(InputError) as e:
_collection(mock_client)
assert "Service document retrieval: something went wrong" == str(e.value)
def test_collection_ok():
mock_client = MagicMock()
mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT
collection_name = _collection(mock_client)
assert collection_name == "softcol"
def test_collection_ko_because_downtime():
mock_client = MagicMock()
mock_client.service_document.side_effect = MaintenanceError("downtime")
with pytest.raises(MaintenanceError, match="downtime"):
_collection(mock_client)
def test_deposit_with_server_down_for_maintenance(
sample_archive, mocker, caplog, client_mock_api_down, slug, tmp_path
):
""" Deposit failure due to maintenance down time should be explicit
"""
runner = CliRunner()
result = runner.invoke(
cli,
[
"upload",
"--url",
"mock://deposit.swh/1",
"--username",
TEST_USER["username"],
"--password",
TEST_USER["password"],
"--name",
"test-project",
"--archive",
sample_archive["path"],
"--author",
"Jane Doe",
],
)
assert result.exit_code == 1, result.output
assert result.output == ""
assert caplog.record_tuples == [
(
"swh.deposit.cli.client",
logging.ERROR,
"Database backend maintenance: Temporarily unavailable, try again later.",
)
]
client_mock_api_down.service_document.assert_called_once_with()
def test_single_minimal_deposit(
sample_archive, mocker, caplog, client_mock, slug, tmp_path
):
""" from:
https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit
""" # noqa
metadata_path = os.path.join(tmp_path, "metadata.xml")
mocker.patch(
- "swh.deposit.cli.client.tempfile.TemporaryDirectory",
+ "tempfile.TemporaryDirectory",
return_value=contextlib.nullcontext(str(tmp_path)),
)
runner = CliRunner()
result = runner.invoke(
cli,
[
"upload",
"--url",
"mock://deposit.swh/1",
"--username",
TEST_USER["username"],
"--password",
TEST_USER["password"],
"--name",
"test-project",
"--archive",
sample_archive["path"],
"--author",
"Jane Doe",
],
)
assert result.exit_code == 0, result.output
assert result.output == ""
assert caplog.record_tuples == [
("swh.deposit.cli.client", logging.INFO, '{"foo": "bar"}'),
]
client_mock.deposit_create.assert_called_once_with(
archive=sample_archive["path"],
collection="softcol",
in_progress=False,
metadata=metadata_path,
slug=slug,
)
with open(metadata_path) as fd:
assert (
fd.read()
== f"""\
\ttest-project
\t{slug}
\t
\t\tJane Doe
\t"""
)
def test_metadata_validation(sample_archive, mocker, caplog, tmp_path):
""" from:
https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit
""" # noqa
slug = generate_slug()
mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug)
mock_client = MagicMock()
mocker.patch("swh.deposit.cli.client._client", return_value=mock_client)
mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT
mock_client.deposit_create.return_value = '{"foo": "bar"}'
metadata_path = os.path.join(tmp_path, "metadata.xml")
mocker.patch(
- "swh.deposit.cli.client.tempfile.TemporaryDirectory",
+ "tempfile.TemporaryDirectory",
return_value=contextlib.nullcontext(str(tmp_path)),
)
with open(metadata_path, "a"):
pass # creates the file
runner = CliRunner()
# Test missing author
result = runner.invoke(
cli,
[
"upload",
"--url",
"mock://deposit.swh/1",
"--username",
TEST_USER["username"],
"--password",
TEST_USER["password"],
"--name",
"test-project",
"--archive",
sample_archive["path"],
],
)
assert result.exit_code == 1, result.output
assert result.output == ""
assert len(caplog.record_tuples) == 1
(_logger, level, message) = caplog.record_tuples[0]
assert level == logging.ERROR
assert " --author " in message
# Clear mocking state
caplog.clear()
mock_client.reset_mock()
# Test missing name
result = runner.invoke(
cli,
[
"upload",
"--url",
"mock://deposit.swh/1",
"--username",
TEST_USER["username"],
"--password",
TEST_USER["password"],
"--archive",
sample_archive["path"],
"--author",
"Jane Doe",
],
)
assert result.exit_code == 1, result.output
assert result.output == ""
assert len(caplog.record_tuples) == 1
(_logger, level, message) = caplog.record_tuples[0]
assert level == logging.ERROR
assert " --name " in message
# Clear mocking state
caplog.clear()
mock_client.reset_mock()
# Test both --metadata and --author
result = runner.invoke(
cli,
[
"upload",
"--url",
"mock://deposit.swh/1",
"--username",
TEST_USER["username"],
"--password",
TEST_USER["password"],
"--archive",
sample_archive["path"],
"--metadata",
metadata_path,
"--author",
"Jane Doe",
],
)
assert result.exit_code == 1, result.output
assert result.output == ""
assert len(caplog.record_tuples) == 1
(_logger, level, message) = caplog.record_tuples[0]
assert level == logging.ERROR
assert re.search("--metadata.*is incompatible with", message)
# Clear mocking state
caplog.clear()
mock_client.reset_mock()
def test_single_deposit_slug_generation(
sample_archive, mocker, caplog, tmp_path, client_mock
):
""" from:
https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit
""" # noqa
slug = "my-slug"
collection = "my-collection"
metadata_path = os.path.join(tmp_path, "metadata.xml")
mocker.patch(
- "swh.deposit.cli.client.tempfile.TemporaryDirectory",
+ "tempfile.TemporaryDirectory",
return_value=contextlib.nullcontext(str(tmp_path)),
)
runner = CliRunner()
result = runner.invoke(
cli,
[
"upload",
"--url",
"mock://deposit.swh/1",
"--username",
TEST_USER["username"],
"--password",
TEST_USER["password"],
"--name",
"test-project",
"--archive",
sample_archive["path"],
"--slug",
slug,
"--collection",
collection,
"--author",
"Jane Doe",
],
)
assert result.exit_code == 0, result.output
assert result.output == ""
assert caplog.record_tuples == [
("swh.deposit.cli.client", logging.INFO, '{"foo": "bar"}'),
]
client_mock.deposit_create.assert_called_once_with(
archive=sample_archive["path"],
collection=collection,
in_progress=False,
metadata=metadata_path,
slug=slug,
)
with open(metadata_path) as fd:
assert (
fd.read()
== """\
\ttest-project
\tmy-slug
\t
\t\tJane Doe
\t"""
)
def test_multisteps_deposit(
sample_archive, atom_dataset, mocker, caplog, datadir, client_mock, slug
):
""" from:
https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#multisteps-deposit
""" # noqa
slug = generate_slug()
mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug)
# https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#create-an-incomplete-deposit
client_mock.deposit_create.return_value = '{"deposit_id": "42"}'
runner = CliRunner()
result = runner.invoke(
cli,
[
"upload",
"--url",
"mock://deposit.swh/1",
"--username",
TEST_USER["username"],
"--password",
TEST_USER["password"],
"--archive",
sample_archive["path"],
"--partial",
],
)
assert result.exit_code == 0, result.output
assert result.output == ""
assert caplog.record_tuples == [
("swh.deposit.cli.client", logging.INFO, '{"deposit_id": "42"}'),
]
client_mock.deposit_create.assert_called_once_with(
archive=sample_archive["path"],
collection="softcol",
in_progress=True,
metadata=None,
slug=slug,
)
# Clear mocking state
caplog.clear()
client_mock.reset_mock()
# https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#add-content-or-metadata-to-the-deposit
metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml")
result = runner.invoke(
cli,
[
"upload",
"--url",
"mock://deposit.swh/1",
"--username",
TEST_USER["username"],
"--password",
TEST_USER["password"],
"--metadata",
metadata_path,
],
)
assert result.exit_code == 0, result.output
assert result.output == ""
assert caplog.record_tuples == [
("swh.deposit.cli.client", logging.INFO, '{"deposit_id": "42"}'),
]
client_mock.deposit_create.assert_called_once_with(
archive=None,
collection="softcol",
in_progress=False,
metadata=metadata_path,
slug=slug,
)
# Clear mocking state
caplog.clear()
client_mock.reset_mock()
diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py
index 5bbe064e..c92a6916 100644
--- a/swh/deposit/tests/conftest.py
+++ b/swh/deposit/tests/conftest.py
@@ -1,428 +1,418 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import os
import base64
-import pytest
-import psycopg2
+import os
+from typing import Mapping
-from django.urls import reverse
from django.test.utils import setup_databases # type: ignore
-
-# mypy is asked to ignore the import statement above because setup_databases
-# is not part of the d.t.utils.__all__ variable.
-
+from django.urls import reverse
+import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
+import pytest
from rest_framework import status
from rest_framework.test import APIClient
-from typing import Mapping
+import yaml
-from swh.scheduler import get_scheduler
-from swh.scheduler.tests.conftest import * # noqa
-from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT
-from swh.deposit.config import setup_django_for
-from swh.deposit.parsers import parse_xml
-from swh.deposit.config import SWHDefaultConfig
from swh.deposit.config import (
COL_IRI,
- EDIT_SE_IRI,
DEPOSIT_STATUS_DEPOSITED,
- DEPOSIT_STATUS_REJECTED,
- DEPOSIT_STATUS_PARTIAL,
+ DEPOSIT_STATUS_LOAD_FAILURE,
DEPOSIT_STATUS_LOAD_SUCCESS,
+ DEPOSIT_STATUS_PARTIAL,
+ DEPOSIT_STATUS_REJECTED,
DEPOSIT_STATUS_VERIFIED,
- DEPOSIT_STATUS_LOAD_FAILURE,
+ EDIT_SE_IRI,
+ setup_django_for,
)
+from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import create_arborescence_archive
+from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid
+from swh.scheduler import get_scheduler
+
+# mypy is asked to ignore the import statement above because setup_databases
+# is not part of the d.t.utils.__all__ variable.
TEST_USER = {
"username": "test",
"password": "password",
"email": "test@example.org",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
"domain": "archives-ouvertes.fr/",
"collection": {"name": "test"},
}
-TEST_CONFIG = {
- "max_upload_size": 500,
- "extraction_dir": "/tmp/swh-deposit/test/extraction-dir",
- "checks": False,
- "provider": {
- "provider_name": "",
- "provider_type": "deposit_client",
- "provider_url": "",
- "metadata": {},
- },
- "tool": {
- "name": "swh-deposit",
- "version": "0.0.1",
- "configuration": {"sword_version": "2"},
- },
-}
-
-
def pytest_configure():
setup_django_for("testing")
@pytest.fixture()
-def deposit_config():
- return TEST_CONFIG
+def deposit_config(swh_scheduler_config):
+ return {
+ "max_upload_size": 500,
+ "extraction_dir": "/tmp/swh-deposit/test/extraction-dir",
+ "checks": False,
+ "provider": {
+ "provider_name": "",
+ "provider_type": "deposit_client",
+ "provider_url": "",
+ "metadata": {},
+ },
+ "scheduler": {"cls": "local", "args": swh_scheduler_config,},
+ }
-@pytest.fixture(autouse=True)
-def deposit_autoconfig(monkeypatch, deposit_config, swh_scheduler_config):
- """Enforce config for deposit classes inherited from SWHDefaultConfig."""
-
- def mock_parse_config(*args, **kw):
- config = deposit_config.copy()
- config["scheduler"] = {
- "cls": "local",
- "args": swh_scheduler_config,
- }
- return config
+@pytest.fixture()
+def deposit_config_path(tmp_path, monkeypatch, deposit_config):
+ conf_path = os.path.join(tmp_path, "deposit.yml")
+ with open(conf_path, "w") as f:
+ f.write(yaml.dump(deposit_config))
+ monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path)
+ return conf_path
- monkeypatch.setattr(SWHDefaultConfig, "parse_config_file", mock_parse_config)
+
+@pytest.fixture(autouse=True)
+def deposit_autoconfig(deposit_config_path, swh_scheduler_config):
+ """Enforce config for deposit classes inherited from APIConfig."""
scheduler = get_scheduler("local", swh_scheduler_config)
task_type = {
"type": "load-deposit",
"backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit",
- "description": "why does this have not-null constraint?",
+ "description": "Load deposit task",
}
scheduler.create_task_type(task_type)
@pytest.fixture(scope="session")
def django_db_setup(request, django_db_blocker, postgresql_proc):
    """Session fixture overriding pytest-django's ``django_db_setup``.

    Points Django's default database at the postgres server managed by the
    ``postgresql_proc`` fixture, then creates the test databases.
    """
    from django.conf import settings

    # Bug-prone original: a *set* of (key, value) tuples was passed to
    # dict.update, which only works by accident of the iterable-of-pairs
    # protocol (and iterates in arbitrary order). Use an explicit dict.
    settings.DATABASES["default"].update(
        {
            "ENGINE": "django.db.backends.postgresql",
            "NAME": "tests",
            "USER": postgresql_proc.user,  # noqa
            "HOST": postgresql_proc.host,  # noqa
            "PORT": postgresql_proc.port,  # noqa
        }
    )
    with django_db_blocker.unblock():
        setup_databases(
            verbosity=request.config.option.verbose, interactive=False, keepdb=False
        )
def execute_sql(sql):
    """Execute ``sql`` against the local "postgres" maintenance database.

    Runs in autocommit mode so statements that cannot run inside a
    transaction (e.g. CREATE/DROP DATABASE) are accepted.
    """
    conn = psycopg2.connect(database="postgres")
    try:
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        # close the cursor deterministically rather than leaving it to GC
        with conn.cursor() as cur:
            cur.execute(sql)
    finally:
        # psycopg2's ``with connection`` block only ends the transaction; it
        # does NOT close the connection — close it explicitly to avoid a leak.
        conn.close()
@pytest.fixture(autouse=True, scope="session")
def swh_proxy():
    """Session-wide guard: route all HTTP(S) traffic to an unreachable local
    proxy so any accidental outside connection fails fast.
    """
    unreachable = "http://localhost:999"
    for proxy_var in ("http_proxy", "https_proxy"):
        os.environ[proxy_var] = unreachable
def create_deposit_collection(collection_name: str):
    """Return the DepositCollection named ``collection_name``, creating and
    saving it first if it does not exist yet.
    """
    from swh.deposit.models import DepositCollection

    manager = DepositCollection._default_manager
    try:
        # EAFP: fetch the existing collection if there is one
        return manager.get(name=collection_name)
    except DepositCollection.DoesNotExist:
        new_collection = DepositCollection(name=collection_name)
        new_collection.save()
        return new_collection
def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]):
    # Factory producing a pytest fixture bound to a specific collection name.
    # The name is captured as a *default argument* so each generated fixture
    # keeps its own name (avoids the late-binding closure pitfall).
    @pytest.fixture
    def _deposit_collection(db, collection_name=collection_name):
        # `db` is pytest-django's database-access fixture.
        return create_deposit_collection(collection_name)

    return _deposit_collection


# Ready-made fixtures: the default test user's collection and a second one.
deposit_collection = deposit_collection_factory()
deposit_another_collection = deposit_collection_factory("another-collection")
@pytest.fixture
def deposit_user(db, deposit_collection):
    """Return the standard test deposit client, creating it on first use.

    The user is re-attached to ``deposit_collection`` and saved on every call
    so its collection list is always current.
    """
    from swh.deposit.models import DepositClient

    manager = DepositClient._default_manager
    try:
        user = manager.get(username=TEST_USER["username"])
    except DepositClient.DoesNotExist:
        user = manager.create_user(
            username=TEST_USER["username"],
            email=TEST_USER["email"],
            password=TEST_USER["password"],
            provider_url=TEST_USER["provider_url"],
            domain=TEST_USER["domain"],
        )
    # (re)bind the user to the collection under test, then persist
    user.collections = [deposit_collection.id]
    user.save()
    return user
@pytest.fixture
def client():
    """Override pytest-django's ``client``: deposit views use
    djangorestframework, so hand back DRF's APIClient instead.
    """
    api_client = APIClient()
    return api_client
@pytest.fixture
def authenticated_client(client, deposit_user):
    """Yield an API client carrying HTTP Basic credentials for
    ``deposit_user``; the client is logged out at teardown.

    Note: ``@pytest.yield_fixture`` is deprecated — plain ``@pytest.fixture``
    supports yield fixtures and is used everywhere else in this file.
    """
    # Basic auth token is "<username>:<password>", base64-encoded.
    raw_token = "%s:%s" % (deposit_user.username, TEST_USER["password"])
    token = base64.b64encode(raw_token.encode("utf-8"))
    authorization = "Basic %s" % token.decode("utf-8")
    client.credentials(HTTP_AUTHORIZATION=authorization)
    yield client
    client.logout()
@pytest.fixture
def sample_archive(tmp_path):
    """Build a one-file sample archive under ``tmp_path`` and return it."""
    # Older pytest versions hand back a path object where the helper expects
    # a plain string.
    root = str(tmp_path)
    return create_arborescence_archive(
        root, "archive1", "file1", b"some content in file"
    )
@pytest.fixture
def atom_dataset(datadir) -> Mapping[str, str]:
    """Load every atom file found under ``<datadir>/atom``.

    Returns:
        Dict mapping the file name (without extension) to its utf-8 decoded
        content.
    """
    atom_dir = os.path.join(datadir, "atom")
    contents = {}
    for entry in os.listdir(atom_dir):
        with open(os.path.join(atom_dir, entry), "rb") as f:
            text = f.read().decode("utf-8")
        # index by the file name up to the first dot
        contents[entry.split(".")[0]] = text
    return contents
def create_deposit(
    authenticated_client,
    collection_name: str,
    sample_archive,
    external_id: str,
    deposit_status=DEPOSIT_STATUS_DEPOSITED,
):
    """POST a binary deposit into ``collection_name``, then force its status.

    Returns the resulting Deposit model instance with ``status`` set to
    ``deposit_status``.
    """
    collection_url = reverse(COL_IRI, args=[collection_name])
    sword_headers = {
        "CONTENT_LENGTH": sample_archive["length"],
        "HTTP_SLUG": external_id,
        "HTTP_CONTENT_MD5": sample_archive["md5sum"],
        "HTTP_PACKAGING": "http://purl.org/net/sword/package/SimpleZip",
        "HTTP_IN_PROGRESS": "false",
        "HTTP_CONTENT_DISPOSITION": "attachment; filename=%s"
        % (sample_archive["name"]),
    }
    response = authenticated_client.post(
        collection_url,
        content_type="application/zip",  # as zip
        data=sample_archive["data"],
        **sword_headers,
    )
    assert response.status_code == status.HTTP_201_CREATED

    from swh.deposit.models import Deposit

    deposit = Deposit._default_manager.get(external_id=external_id)
    # The API leaves the deposit as "deposited"; rewrite the status when the
    # caller asked for a different lifecycle state.
    if deposit.status != deposit_status:
        deposit.status = deposit_status
        deposit.save()
    assert deposit.status == deposit_status
    return deposit
def create_binary_deposit(
    authenticated_client,
    collection_name: str,
    sample_archive,
    external_id: str,
    deposit_status: str = DEPOSIT_STATUS_DEPOSITED,
    atom_dataset: Mapping[str, bytes] = {},
):
    """Create a deposit carrying both an archive and atom metadata, then
    force its status to ``deposit_status``.
    """
    # Start from a partial deposit holding only the archive.
    deposit = create_deposit(
        authenticated_client,
        collection_name,
        sample_archive,
        external_id=external_id,
        deposit_status=DEPOSIT_STATUS_PARTIAL,
    )

    update_url = reverse(EDIT_SE_IRI, args=[collection_name, deposit.id])
    # NOTE(review): the atom templates are str while external_id is encoded
    # to bytes before %-formatting — confirm the fixtures expect this.
    response = authenticated_client.post(
        update_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8"),
        HTTP_SLUG=deposit.external_id,
        HTTP_IN_PROGRESS="true",
    )

    assert response.status_code == status.HTTP_201_CREATED
    assert deposit.status == DEPOSIT_STATUS_PARTIAL

    from swh.deposit.models import Deposit

    # Reload from the database, then force the requested final status.
    deposit = Deposit._default_manager.get(pk=deposit.id)
    if deposit.status != deposit_status:
        deposit.status = deposit_status
        deposit.save()
    assert deposit.status == deposit_status
    return deposit
def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED):
    """Return a pytest fixture creating a deposit in state ``deposit_status``."""

    @pytest.fixture()
    def _deposit(
        sample_archive,
        deposit_collection,
        authenticated_client,
        deposit_status=deposit_status,
    ):
        # One external id per status so the fixtures never collide in-db.
        return create_deposit(
            authenticated_client,
            deposit_collection.name,
            sample_archive,
            external_id="external-id-%s" % deposit_status,
            deposit_status=deposit_status,
        )

    return _deposit


# One ready-made fixture per deposit lifecycle state:
deposited_deposit = deposit_factory()
rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED)
partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL)
verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED)
completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS)
failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE)
@pytest.fixture
def partial_deposit_with_metadata(
    sample_archive, deposit_collection, authenticated_client, atom_dataset
):
    """Returns deposit with archive and metadata provided, status 'partial'
    """
    # Thin wrapper over create_binary_deposit with a fixed external id.
    return create_binary_deposit(
        authenticated_client,
        deposit_collection.name,
        sample_archive,
        external_id="external-id-partial",
        deposit_status=DEPOSIT_STATUS_PARTIAL,
        atom_dataset=atom_dataset,
    )
@pytest.fixture
def partial_deposit_only_metadata(
    deposit_collection, authenticated_client, atom_dataset
):
    """Deposit with only atom metadata posted, left in status 'partial'."""
    collection_url = reverse(COL_IRI, args=[deposit_collection.name])
    response = authenticated_client.post(
        collection_url,
        content_type="application/atom+xml;type=entry",
        data=atom_dataset["entry-data1"],
        HTTP_SLUG="external-id-partial",
        HTTP_IN_PROGRESS=True,
    )
    assert response.status_code == status.HTTP_201_CREATED

    # the deposit id comes back in the atom response body
    deposit_id = parse_xml(response.content)["deposit_id"]

    from swh.deposit.models import Deposit

    deposit = Deposit._default_manager.get(pk=deposit_id)
    assert deposit.status == DEPOSIT_STATUS_PARTIAL
    return deposit
@pytest.fixture
def complete_deposit(sample_archive, deposit_collection, authenticated_client):
"""Returns a completed deposit (load success)
"""
deposit = create_deposit(
authenticated_client,
deposit_collection.name,
sample_archive,
external_id="external-id-complete",
deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS,
)
origin = "https://hal.archives-ouvertes.fr/hal-01727745"
directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
revision_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10"
snapshot_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0"
- deposit.swh_id = persistent_identifier(DIRECTORY, directory_id)
- deposit.swh_id_context = persistent_identifier(
+ deposit.swh_id = swhid(DIRECTORY, directory_id)
+ deposit.swh_id_context = swhid(
DIRECTORY,
directory_id,
metadata={
"origin": origin,
- "visit": persistent_identifier(SNAPSHOT, snapshot_id),
- "anchor": persistent_identifier(REVISION, revision_id),
+ "visit": swhid(SNAPSHOT, snapshot_id),
+ "anchor": swhid(REVISION, revision_id),
"path": "/",
},
)
deposit.save()
return deposit
@pytest.fixture()
def tmp_path(tmp_path):
    # Override pytest's tmp_path: oldstable's pytest returns a path object
    # where downstream helpers expect a plain string.
    return str(tmp_path)  # issue with oldstable's pytest version
diff --git a/swh/deposit/tests/api/data/atom/codemeta-sample.xml b/swh/deposit/tests/data/atom/codemeta-sample.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/codemeta-sample.xml
rename to swh/deposit/tests/data/atom/codemeta-sample.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data-badly-formatted.xml b/swh/deposit/tests/data/atom/entry-data-badly-formatted.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data-badly-formatted.xml
rename to swh/deposit/tests/data/atom/entry-data-badly-formatted.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data-deposit-binary.xml b/swh/deposit/tests/data/atom/entry-data-deposit-binary.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data-deposit-binary.xml
rename to swh/deposit/tests/data/atom/entry-data-deposit-binary.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data-empty-body.xml b/swh/deposit/tests/data/atom/entry-data-empty-body.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data-empty-body.xml
rename to swh/deposit/tests/data/atom/entry-data-empty-body.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data-ko.xml b/swh/deposit/tests/data/atom/entry-data-ko.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data-ko.xml
rename to swh/deposit/tests/data/atom/entry-data-ko.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data-minimal.xml b/swh/deposit/tests/data/atom/entry-data-minimal.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data-minimal.xml
rename to swh/deposit/tests/data/atom/entry-data-minimal.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data-parsing-error-prone.xml b/swh/deposit/tests/data/atom/entry-data-parsing-error-prone.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data-parsing-error-prone.xml
rename to swh/deposit/tests/data/atom/entry-data-parsing-error-prone.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data0.xml b/swh/deposit/tests/data/atom/entry-data0.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data0.xml
rename to swh/deposit/tests/data/atom/entry-data0.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data1.xml b/swh/deposit/tests/data/atom/entry-data1.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data1.xml
rename to swh/deposit/tests/data/atom/entry-data1.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data2.xml b/swh/deposit/tests/data/atom/entry-data2.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data2.xml
rename to swh/deposit/tests/data/atom/entry-data2.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-data3.xml b/swh/deposit/tests/data/atom/entry-data3.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-data3.xml
rename to swh/deposit/tests/data/atom/entry-data3.xml
diff --git a/swh/deposit/tests/api/data/atom/entry-update-in-place.xml b/swh/deposit/tests/data/atom/entry-update-in-place.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/entry-update-in-place.xml
rename to swh/deposit/tests/data/atom/entry-update-in-place.xml
diff --git a/swh/deposit/tests/api/data/atom/error-with-decimal.xml b/swh/deposit/tests/data/atom/error-with-decimal.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/error-with-decimal.xml
rename to swh/deposit/tests/data/atom/error-with-decimal.xml
diff --git a/swh/deposit/tests/api/data/atom/metadata.xml b/swh/deposit/tests/data/atom/metadata.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/metadata.xml
rename to swh/deposit/tests/data/atom/metadata.xml
diff --git a/swh/deposit/tests/api/data/atom/tei-sample.xml b/swh/deposit/tests/data/atom/tei-sample.xml
similarity index 100%
rename from swh/deposit/tests/api/data/atom/tei-sample.xml
rename to swh/deposit/tests/data/atom/tei-sample.xml
diff --git a/swh/deposit/tests/loader/common.py b/swh/deposit/tests/loader/common.py
index 510830af..0ebbc603 100644
--- a/swh/deposit/tests/loader/common.py
+++ b/swh/deposit/tests/loader/common.py
@@ -1,138 +1,139 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
+from typing import Dict, Optional
-from typing import Dict
from swh.deposit.client import PrivateApiDepositClient
-
from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.model import SnapshotBranch, TargetType
+from swh.storage.algos.snapshot import snapshot_get_all_branches
# Configuration for the private deposit API client under test; the URL is a
# placeholder that is never contacted for real in these tests.
CLIENT_TEST_CONFIG = {
    "url": "http://nowhere:9000/",
    "auth": {},  # no authentication in test scenario
}
class SWHDepositTestClient(PrivateApiDepositClient):
    """Deposit test client to permit overriding the default request
    client.
    """

    def __init__(self, client, config):
        # `client` is the django test client used in place of `requests`.
        super().__init__(config=config)
        self.client = client

    def archive_get(self, archive_update_url, archive_path, log=None):
        """Stream the archive served at ``archive_update_url`` into
        ``archive_path``; return the path written."""
        r = self.client.get(archive_update_url)
        with open(archive_path, "wb") as f:
            for chunk in r.streaming_content:
                f.write(chunk)
        return archive_path

    def metadata_get(self, metadata_url, log=None):
        """Return the JSON-decoded body served at ``metadata_url``."""
        r = self.client.get(metadata_url)
        return json.loads(r.content.decode("utf-8"))

    def status_update(
        self,
        update_status_url,
        status,
        revision_id=None,
        directory_id=None,
        origin_url=None,
    ):
        """PUT a status update; optional ids are included only when truthy."""
        payload = {"status": status}
        if revision_id:
            payload["revision_id"] = revision_id
        if directory_id:
            payload["directory_id"] = directory_id
        if origin_url:
            payload["origin_url"] = origin_url
        self.client.put(
            update_status_url, content_type="application/json", data=json.dumps(payload)
        )

    def check(self, check_url):
        """Return the "status" field reported by the check endpoint."""
        r = self.client.get(check_url)
        data = json.loads(r.content.decode("utf-8"))
        return data["status"]
def get_stats(storage) -> Dict:
    """Normalize stat counters across storage implementations.

    Refreshes the storage's counters, then projects them onto a fixed set of
    keys; counters the backend does not provide come back as None.
    """
    storage.refresh_stat_counters()
    counters = storage.stat_counters()
    wanted = (
        "content",
        "directory",
        "origin",
        "origin_visit",
        "person",
        "release",
        "revision",
        "skipped_content",
        "snapshot",
    )
    return {key: counters.get(key) for key in wanted}
-def decode_target(target):
+def decode_target(branch: Optional[SnapshotBranch]) -> Optional[Dict]:
"""Test helper to ease readability in test
"""
- if not target:
- return target
- target_type = target["target_type"]
+ if not branch:
+ return None
+ target_type = branch.target_type
- if target_type == "alias":
- decoded_target = target["target"].decode("utf-8")
+ if target_type == TargetType.ALIAS:
+ decoded_target = branch.target.decode("utf-8")
else:
- decoded_target = hash_to_hex(target["target"])
+ decoded_target = hash_to_hex(branch.target)
return {"target": decoded_target, "target_type": target_type}
def check_snapshot(expected_snapshot, storage):
"""Check for snapshot match.
Provide the hashes as hexadecimal, the conversion is done
within the method.
Args:
expected_snapshot (dict): full snapshot with hex ids
storage (Storage): expected storage
"""
expected_snapshot_id = expected_snapshot["id"]
expected_branches = expected_snapshot["branches"]
- snap = storage.snapshot_get(hash_to_bytes(expected_snapshot_id))
+ snap = snapshot_get_all_branches(hash_to_bytes(expected_snapshot_id))
if snap is None:
# display known snapshots instead if possible
if hasattr(storage, "_snapshots"): # in-mem storage
from pprint import pprint
for snap_id, (_snap, _) in storage._snapshots.items():
snapd = _snap.to_dict()
snapd["id"] = hash_to_hex(snapd["id"])
branches = {
branch.decode("utf-8"): decode_target(target)
for branch, target in snapd["branches"].items()
}
snapd["branches"] = branches
pprint(snapd)
raise AssertionError("Snapshot is not found")
branches = {
- branch.decode("utf-8"): decode_target(target)
- for branch, target in snap["branches"].items()
+ branch.decode("utf-8"): decode_target(branch)
+ for branch_name, branch in snap["branches"].items()
}
assert expected_branches == branches
diff --git a/swh/deposit/tests/loader/conftest.py b/swh/deposit/tests/loader/conftest.py
index e340da91..260bd327 100644
--- a/swh/deposit/tests/loader/conftest.py
+++ b/swh/deposit/tests/loader/conftest.py
@@ -1,65 +1,37 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from functools import partial
import re
-import os
-import pytest
-import yaml
-from functools import partial
+import pytest
from swh.core.pytest_plugin import get_response_cb
-from swh.scheduler.tests.conftest import * # noqa
-from swh.storage.tests.conftest import * # noqa
from swh.deposit.loader.checker import DepositChecker
-@pytest.fixture(scope="session") # type: ignore # expected redefinition
-def celery_includes():
- return [
- "swh.deposit.loader.tasks",
- ]
-
-
@pytest.fixture
-def swh_config(tmp_path, swh_storage_postgresql, monkeypatch):
- storage_config = {
- "url": "https://deposit.softwareheritage.org/",
- "storage": {
- "cls": "local",
- "args": {
- "db": swh_storage_postgresql.dsn,
- "objstorage": {"cls": "memory", "args": {}},
- },
- },
+def deposit_config(tmp_path):
+ return {
+ "deposit": {
+ "url": "https://deposit.softwareheritage.org/1/private/",
+ "auth": {},
+ }
}
- conffile = os.path.join(tmp_path, "deposit.yml")
- with open(conffile, "w") as f:
- f.write(yaml.dump(storage_config))
- monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile)
- return conffile
-
@pytest.fixture
-def deposit_checker():
- return DepositChecker(
- config={
- "deposit": {
- "url": "https://deposit.softwareheritage.org/1/private/",
- "auth": {},
- }
- }
- )
+def deposit_checker(deposit_config_path):
+ return DepositChecker()
@pytest.fixture
def requests_mock_datadir(datadir, requests_mock_datadir):
    """Override default behavior to deal with put method
    """
    # swh.core's fixture only serves GET from datadir; register the same
    # response callback for PUT requests against any https:// url.
    cb = partial(get_response_cb, datadir=datadir)
    requests_mock_datadir.put(re.compile("https://"), body=cb)
    return requests_mock_datadir
diff --git a/swh/deposit/tests/loader/test_checker.py b/swh/deposit/tests/loader/test_checker.py
index c299b3bd..60d451ef 100644
--- a/swh/deposit/tests/loader/test_checker.py
+++ b/swh/deposit/tests/loader/test_checker.py
@@ -1,32 +1,32 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from unittest.mock import patch
-def test_check_deposit_ready(swh_config, requests_mock_datadir, deposit_checker):
+def test_checker_deposit_ready(requests_mock_datadir, deposit_checker):
"""Check on a valid 'deposited' deposit should result in 'verified'
"""
actual_result = deposit_checker.check(collection="test", deposit_id=1)
assert actual_result == {"status": "eventful"}
-def test_check_deposit_rejected(swh_config, requests_mock_datadir, deposit_checker):
+def test_checker_deposit_rejected(requests_mock_datadir, deposit_checker):
"""Check on invalid 'deposited' deposit should result in 'rejected'
"""
actual_result = deposit_checker.check(collection="test", deposit_id=2)
assert actual_result == {"status": "failed"}
@patch("swh.deposit.client.requests.get")
-def test_check_deposit_rejected_exception(mock_requests, swh_config, deposit_checker):
+def test_checker_deposit_rejected_exception(mock_requests, deposit_checker):
"""Check on invalid 'deposited' deposit should result in 'rejected'
"""
mock_requests.side_effect = ValueError("simulated problem when checking")
actual_result = deposit_checker.check(collection="test", deposit_id=3)
assert actual_result == {"status": "failed"}
diff --git a/swh/deposit/tests/loader/test_client.py b/swh/deposit/tests/loader/test_client.py
index 4f099d40..55edd2c7 100644
--- a/swh/deposit/tests/loader/test_client.py
+++ b/swh/deposit/tests/loader/test_client.py
@@ -1,247 +1,246 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import os
import json
-import pytest
-import unittest
-
+import os
from typing import Any, Callable, Optional
+import unittest
from urllib.parse import urlparse
-from swh.deposit.client import PrivateApiDepositClient
-from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE
+import pytest
+from swh.deposit.client import PrivateApiDepositClient
+from swh.deposit.config import DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS
# Client configuration pointing at a dummy base URL; all HTTP traffic in
# these tests is served by the requests_mock_datadir fixture.
CLIENT_TEST_CONFIG = {
    "url": "https://nowhere.org/",
    "auth": {},  # no authentication in test scenario
}
def build_expected_path(datadir, base_url: str, api_url: str) -> str:
    """Map an api url onto the on-disk file that serves it in tests.

    The mock layout is ``<datadir>/<scheme>_<host>/<api path with slashes
    replaced by underscores>``.
    """
    parsed = urlparse(base_url)
    host_dir = "{}_{}".format(parsed.scheme, parsed.hostname)
    trimmed = api_url
    # drop at most one trailing and one leading slash (as the mock expects)
    if trimmed.endswith("/"):
        trimmed = trimmed[:-1]
    if trimmed.startswith("/"):
        trimmed = trimmed[1:]
    return os.path.join(datadir, host_dir, trimmed.replace("/", "_"))
def test_build_expected_path(datadir):
    # smoke-check the api-url -> served-file path mapping
    actual_path = build_expected_path(datadir, "http://example.org", "/hello/you/")

    assert actual_path == os.path.join(datadir, "http_example.org", "hello_you")
def read_served_path(
    datadir,
    base_url: str,
    api_url: str,
    convert_fn: Optional[Callable[[str], Any]] = None,
) -> bytes:
    """Read the file backing ``api_url`` on the mock server.

    When ``convert_fn`` is given, the raw bytes are decoded as utf-8 and fed
    through it before being returned.
    """
    served_file = build_expected_path(datadir, base_url, api_url)
    with open(served_file, "rb") as f:
        payload = f.read()
    if convert_fn:
        payload = convert_fn(payload.decode("utf-8"))
    return payload
def test_read_served_path(datadir):
    # raw bytes read, no conversion
    actual_content = read_served_path(datadir, "http://example.org", "/hello/you/")

    assert actual_content == b"hello people\n"

    # with a converter, bytes are decoded then transformed (here: json)
    actual_content2 = read_served_path(
        datadir, "http://example.org", "/hello.json", convert_fn=json.loads
    )

    assert actual_content2 == {"a": [1, 3]}
# private api to retrieve archive
def test_archive_get(tmp_path, datadir, requests_mock_datadir):
    """Retrieving archive data through private api should stream data
    """
    api_url = "/1/private/test/1/raw/"
    client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG)
    expected_content = read_served_path(datadir, client.base_url, api_url)

    archive_path = os.path.join(tmp_path, "test.archive")
    archive_path = client.archive_get(api_url, archive_path)

    assert os.path.exists(archive_path) is True
    with open(archive_path, "rb") as f:
        actual_content = f.read()

    assert actual_content == expected_content
    assert client.base_url == CLIENT_TEST_CONFIG["url"]
    assert client.auth is None


def test_archive_get_auth(tmp_path, datadir, requests_mock_datadir):
    """Retrieving archive data with authentication configured should also
    stream data and keep the credentials on the client
    """
    api_url = "/1/private/test/1/raw/"
    config = CLIENT_TEST_CONFIG.copy()
    config["auth"] = {  # add authentication setup
        "username": "user",
        "password": "pass",
    }
    client = PrivateApiDepositClient(config)
    expected_content = read_served_path(datadir, client.base_url, api_url)

    archive_path = os.path.join(tmp_path, "test.archive")
    archive_path = client.archive_get(api_url, archive_path)

    assert os.path.exists(archive_path) is True
    with open(archive_path, "rb") as f:
        actual_content = f.read()

    assert actual_content == expected_content
    assert client.base_url == CLIENT_TEST_CONFIG["url"]
    assert client.auth == ("user", "pass")


def test_archive_get_ko(tmp_path, datadir, requests_mock_datadir):
    """Reading archive can fail for some reasons
    """
    # unknown deposit: the private api answers an error, surfaced as ValueError
    unknown_api_url = "/1/private/unknown/deposit-id/raw/"
    client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG)
    with pytest.raises(ValueError, match="Problem when retrieving deposit"):
        client.archive_get(unknown_api_url, "some/path")
# private api read metadata
def test_metadata_get(datadir, requests_mock_datadir):
    """Reading metadata through the private api should return decoded JSON
    """
    api_url = "/1/private/test/1/metadata"
    client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG)
    actual_metadata = client.metadata_get(api_url)

    # the client must return parsed json, not the raw string
    assert isinstance(actual_metadata, str) is False
    expected_content = read_served_path(
        datadir, client.base_url, api_url, convert_fn=json.loads
    )
    assert actual_metadata == expected_content


def test_metadata_get_ko(requests_mock_datadir):
    """Reading metadata can fail for some reasons
    """
    unknown_api_url = "/1/private/unknown/deposit-id/metadata/"
    client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG)
    with pytest.raises(ValueError, match="Problem when retrieving metadata"):
        client.metadata_get(unknown_api_url)
# private api check
def test_check(requests_mock_datadir):
    """When check ok, this should return the deposit's status
    """
    api_url = "/1/private/test/1/check"
    client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG)
    # "something" is the canned status served by the mock datadir
    r = client.check(api_url)
    assert r == "something"


def test_check_fails(requests_mock_datadir):
    """Checking deposit can fail for some reason
    """
    unknown_api_url = "/1/private/test/10/check"
    client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG)
    with pytest.raises(ValueError, match="Problem when checking deposit"):
        client.check(unknown_api_url)
# private api update status
class FakeRequestClientPut:
    """Test double recording the arguments of the last ``put`` call."""

    # last-seen positional and keyword arguments, inspected by the tests
    args = None
    kwargs = None

    def put(self, *args, **kwargs):
        # remember exactly what we were called with
        self.args = args
        self.kwargs = kwargs
class PrivateApiDepositClientStatusUpdateTest(unittest.TestCase):
    def test_status_update(self):
        """A status update with a revision id sends both fields as json
        """
        _client = FakeRequestClientPut()
        deposit_client = PrivateApiDepositClient(
            config=CLIENT_TEST_CONFIG, _client=_client
        )

        deposit_client.status_update(
            "/update/status",
            DEPOSIT_STATUS_LOAD_SUCCESS,
            revision_id="some-revision-id",
        )

        # the relative url is resolved against the configured base url
        self.assertEqual(_client.args, ("https://nowhere.org/update/status",))
        self.assertEqual(
            _client.kwargs,
            {
                "json": {
                    "status": DEPOSIT_STATUS_LOAD_SUCCESS,
                    "revision_id": "some-revision-id",
                }
            },
        )

    def test_status_update_with_no_revision_id(self):
        """A status update without a revision id only sends the status
        """
        _client = FakeRequestClientPut()
        deposit_client = PrivateApiDepositClient(
            config=CLIENT_TEST_CONFIG, _client=_client
        )

        deposit_client.status_update("/update/status/fail", DEPOSIT_STATUS_LOAD_FAILURE)

        self.assertEqual(_client.args, ("https://nowhere.org/update/status/fail",))
        self.assertEqual(
            _client.kwargs, {"json": {"status": DEPOSIT_STATUS_LOAD_FAILURE,}}
        )
diff --git a/swh/deposit/tests/loader/test_tasks.py b/swh/deposit/tests/loader/test_tasks.py
index c62fd45a..5f85ebcd 100644
--- a/swh/deposit/tests/loader/test_tasks.py
+++ b/swh/deposit/tests/loader/test_tasks.py
@@ -1,69 +1,75 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
@pytest.mark.db
-def test_deposit_check_eventful(mocker, swh_config, swh_app, celery_session_worker):
+def test_task_check_eventful(
+ mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker
+):
"""Successful check should make the check succeed
"""
client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check")
client.return_value = "verified"
collection = "collection"
deposit_id = 42
- res = swh_app.send_task(
+ res = swh_scheduler_celery_app.send_task(
"swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id]
)
assert res
res.wait()
assert res.successful()
assert res.result == {"status": "eventful"}
client.assert_called_once_with(f"/{collection}/{deposit_id}/check/")
@pytest.mark.db
-def test_deposit_check_failure(mocker, swh_config, swh_app, celery_session_worker):
+def test_task_check_failure(
+ mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker
+):
"""Unverified check status should make the check fail
"""
client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check")
client.return_value = "not-verified" # will make the status "failed"
collection = "collec"
deposit_id = 666
- res = swh_app.send_task(
+ res = swh_scheduler_celery_app.send_task(
"swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id]
)
assert res
res.wait()
assert res.successful()
assert res.result == {"status": "failed"}
client.assert_called_once_with(f"/{collection}/{deposit_id}/check/")
@pytest.mark.db
-def test_deposit_check_3(mocker, swh_config, swh_app, celery_session_worker):
+def test_task_check_3(
+ mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker
+):
"""Unexpected failures should fail the check
"""
client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check")
client.side_effect = ValueError("unexpected failure will make it fail")
collection = "another-collection"
deposit_id = 999
- res = swh_app.send_task(
+ res = swh_scheduler_celery_app.send_task(
"swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id]
)
assert res
res.wait()
assert res.successful()
assert res.result == {"status": "failed"}
client.assert_called_once_with(f"/{collection}/{deposit_id}/check/")
diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py
new file mode 100644
index 00000000..88fca573
--- /dev/null
+++ b/swh/deposit/tests/test_init.py
@@ -0,0 +1,10 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def test_version():
+ from swh.deposit.api import __version__
+
+ assert __version__ is not None
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
index 644d8f33..8be41c4c 100644
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -1,141 +1,141 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import pytest
-
from unittest.mock import patch
+import pytest
+
from swh.deposit import utils
def test_merge():
"""Calling utils.merge on dicts should merge without losing information
"""
d0 = {"author": "someone", "license": [["gpl2"]], "a": 1}
d1 = {
"author": ["author0", {"name": "author1"}],
"license": [["gpl3"]],
"b": {"1": "2"},
}
d2 = {"author": map(lambda x: x, ["else"]), "license": "mit", "b": {"2": "3",}}
d3 = {
"author": (v for v in ["no one"]),
}
actual_merge = utils.merge(d0, d1, d2, d3)
expected_merge = {
"a": 1,
"license": [["gpl2"], ["gpl3"], "mit"],
"author": ["someone", "author0", {"name": "author1"}, "else", "no one"],
"b": {"1": "2", "2": "3",},
}
assert actual_merge == expected_merge
def test_merge_2():
d0 = {"license": "gpl2", "runtime": {"os": "unix derivative"}}
d1 = {"license": "gpl3", "runtime": "GNU/Linux"}
expected = {
"license": ["gpl2", "gpl3"],
"runtime": [{"os": "unix derivative"}, "GNU/Linux"],
}
actual = utils.merge(d0, d1)
assert actual == expected
def test_merge_edge_cases():
    """Merging with an empty dict or with itself is the identity.

    """
    baseline = {
        "license": ["gpl2", "gpl3"],
        "runtime": [{"os": "unix derivative"}, "GNU/Linux"],
    }
    # merging against an empty dict changes nothing
    assert utils.merge(baseline, {}) == baseline
    # merging a dict with itself (repeatedly) changes nothing
    assert utils.merge(baseline, baseline, baseline) == baseline
def test_merge_one_dict():
    """Merge one dict should result in the same dict value

    """
    only = {"anything": "really"}
    assert utils.merge(only) == only
def test_merge_raise():
    """Calling utils.merge with any non-dict argument should raise

    """
    valid = {"author": "someone", "a": 1}
    invalid = ["not a dict"]
    # a non-dict argument raises regardless of its position
    for args in [(valid, invalid), (invalid, valid), (invalid,)]:
        with pytest.raises(ValueError):
            utils.merge(*args)
    assert utils.merge(valid) == valid
@patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x)
def test_normalize_date_0(mock_normalize):
"""When date is a list, choose the first date and normalize it
Note: We do not test swh.model.identifiers which is already tested
in swh.model
"""
actual_date = utils.normalize_date(["2017-10-12", "date1"])
expected_date = "2017-10-12 00:00:00+00:00"
assert str(actual_date) == expected_date
@patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x)
def test_normalize_date_1(mock_normalize):
"""Providing a date in a reasonable format, everything is fine
Note: We do not test swh.model.identifiers which is already tested
in swh.model
"""
actual_date = utils.normalize_date("2018-06-11 17:02:02")
expected_date = "2018-06-11 17:02:02+00:00"
assert str(actual_date) == expected_date
@patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x)
def test_normalize_date_doing_irrelevant_stuff(mock_normalize):
"""Providing a date with only the year results in a reasonable date
Note: We do not test swh.model.identifiers which is already tested
in swh.model
"""
actual_date = utils.normalize_date("2017")
expected_date = "2017-01-01 00:00:00+00:00"
assert str(actual_date) == expected_date
diff --git a/swh/deposit/urls.py b/swh/deposit/urls.py
index 384844c3..9f6ab0eb 100644
--- a/swh/deposit/urls.py
+++ b/swh/deposit/urls.py
@@ -1,31 +1,31 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""SWH's main deposit URL Configuration
"""
-from django.conf.urls import url, include
+from django.conf.urls import include, url
from django.shortcuts import render
from django.views.generic.base import RedirectView
from rest_framework.urlpatterns import format_suffix_patterns
# Permanently redirect /favicon.ico requests to the static SWH logo asset
favicon_view = RedirectView.as_view(
    url="/static/img/icons/swh-logo-32x32.png", permanent=True
)
def default_view(req):
    """Render the deposit homepage template."""
    return render(req, "homepage.html")
# URL routing: favicon redirect, public deposit API under /1/, private
# (internal) API under /1/private/, and the homepage at the root
urlpatterns = [
    url(r"^favicon\.ico$", favicon_view),
    url(r"^1/", include("swh.deposit.api.urls")),
    url(r"^1/private/", include("swh.deposit.api.private.urls")),
    url(r"^$", default_view, name="home"),
]

# Allow optional format suffixes (e.g. .json) on the patterns above
urlpatterns = format_suffix_patterns(urlpatterns)
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
index ee3711db..3b79293e 100644
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -1,83 +1,83 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import iso8601
-
from types import GeneratorType
+import iso8601
+
from swh.model.identifiers import normalize_timestamp
def merge(*dicts):
    """Merge dicts into a single dict without losing information.

    Values sharing a key are collected into a list (in argument order,
    skipping duplicates); two dict values sharing a key are merged
    recursively.

    Args:
        *dicts: variadic dict arguments to merge into one

    Returns:
        dict merged without losing information; input dicts (and the
        list values they hold) are left unmodified

    Raises:
        ValueError: if any argument is not a dict

    """

    def _extend(existing_val, value):
        """Append ``value`` (or its elements when it is a
        list/map/generator) to the list ``existing_val``, skipping
        values already present.

        """
        if isinstance(value, (list, map, GeneratorType)):
            vals = value
        else:
            vals = [value]
        for v in vals:
            if v in existing_val:
                continue
            existing_val.append(v)
        return existing_val

    d = {}
    for data in dicts:
        if not isinstance(data, dict):
            raise ValueError("dicts is supposed to be a variable arguments of dict")

        for key, value in data.items():
            existing_val = d.get(key)
            # NOTE(review): a falsy existing value (0, "", [], ...) is
            # replaced by the new one instead of merged; kept as-is for
            # backward compatibility
            if not existing_val:
                d[key] = value
                continue
            if isinstance(existing_val, (list, map, GeneratorType)):
                # Copy into a fresh list before extending: this avoids
                # mutating a list owned by one of the input dicts, and
                # makes map/generator values (which have no `append`)
                # mergeable instead of crashing.
                new_val = _extend(list(existing_val), value)
            elif isinstance(existing_val, dict):
                if isinstance(value, dict):
                    new_val = merge(existing_val, value)
                else:
                    new_val = _extend([existing_val], value)
            else:
                new_val = _extend([existing_val], value)
            d[key] = new_val
    return d
def normalize_date(date):
    """Normalize a date field as expected by swh workers.

    If ``date`` is a list, arbitrarily keep its first element. If the
    resulting value is a string, parse it with ``iso8601.parse_date``.
    Finally run the value through
    ``swh.model.identifiers.normalize_timestamp``.

    Returns
        The swh date object

    """
    candidate = date[0] if isinstance(date, list) else date
    if isinstance(candidate, str):
        candidate = iso8601.parse_date(candidate)
    return normalize_timestamp(candidate)
diff --git a/tox.ini b/tox.ini
index 00c7376d..625647a4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,44 +1,45 @@
[tox]
envlist=flake8,mypy,py3-django2
[testenv]
extras =
testing
deps =
# the dependency below is needed for now as a workaround for
# https://github.com/pypa/pip/issues/6239
- swh.core[http] >= 0.0.75
+ swh.core[http] >= 0.3
+ swh.scheduler[testing] >= 0.5.0
dev: pdbpp
pytest-cov
django2: Django>=2,<3
commands =
pytest \
!dev: --cov {envsitepackagesdir}/swh/deposit --cov-branch \
{envsitepackagesdir}/swh/deposit \
{posargs}
[testenv:black]
skip_install = true
deps =
black
commands =
{envpython} -m black --check swh
[testenv:flake8]
skip_install = true
deps =
flake8
commands =
{envpython} -m flake8 \
--exclude=.tox,.git,__pycache__,.tox,.eggs,*.egg,swh/deposit/migrations
[testenv:mypy]
setenv = DJANGO_SETTINGS_MODULE=swh.deposit.settings.testing
extras =
testing
deps =
mypy
django-stubs
djangorestframework-stubs
commands =
mypy swh
diff --git a/version.txt b/version.txt
deleted file mode 100644
index c5d8e33c..00000000
--- a/version.txt
+++ /dev/null
@@ -1 +0,0 @@
-v0.0.90-0-gc586ff17
\ No newline at end of file