diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..10e97abb --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +*.pyc +*.sw? +*~ +/.coverage +/.coverage.* +.eggs/ +__pycache__ +*.egg-info/ +version.txt +build/ +dist/ +/analysis.org +/swh/deposit/fixtures/private_data.yaml +/swh/deposit.json +/test.json +/swh/test +db.sqlite3 +/.noseids +*.tgz +*.zip +*.tar.gz +*.tar.bz2 +*.tar.lzma +.tox/ +.mypy_cache/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..ad76d740 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,49 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.4.0 + hooks: + - id: trailing-whitespace + - id: check-json + - id: check-yaml + +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.8.3 + hooks: + - id: flake8 + +- repo: https://github.com/codespell-project/codespell + rev: v1.16.0 + hooks: + - id: codespell + +- repo: local + hooks: + - id: mypy + name: mypy + entry: env DJANGO_SETTINGS_MODULE=swh.deposit.settings.testing mypy + args: [swh] + pass_filenames: false + language: system + types: [python] + +- repo: https://github.com/PyCQA/isort + rev: 5.5.2 + hooks: + - id: isort + +- repo: https://github.com/python/black + rev: 19.10b0 + hooks: + - id: black + +# unfortunately, we are far from being able to enable this... +# - repo: https://github.com/PyCQA/pydocstyle.git +# rev: 4.0.0 +# hooks: +# - id: pydocstyle +# name: pydocstyle +# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. +# entry: pydocstyle --convention=google +# language: python +# types: [python] + diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000..7a5c79d9 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,3 @@ +Copyright (C) 2015-2016 The Software Heritage developers + +See http://www.softwareheritage.org/ for more information. 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..0ad22b51 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,78 @@ +# Software Heritage Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as Software +Heritage contributors and maintainers pledge to make participation in our +project and our community a harassment-free experience for everyone, regardless +of age, body size, disability, ethnicity, sex characteristics, gender identity +and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at `conduct@softwareheritage.org`. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an +incident. Further details of specific enforcement policies may be posted +separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 00000000..7c3f9625 --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1 @@ +Ishan Bhanuka diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..94a9ed02 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. 
+ + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. 
+States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. 
+ + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. 
However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. 
+ + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. 
+ + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/Makefile.local b/Makefile.local new file mode 100644 index 00000000..882a723c --- /dev/null +++ b/Makefile.local @@ -0,0 +1,34 @@ +FLAKEFLAGS='--exclude=swh/deposit/manage.py,swh/deposit/settings.py,swh/deposit/migrations/' + +MANAGE=python3 -m swh.deposit.manage + +db-drop: + dropdb swh-deposit-dev || return 0 + +db-create: db-drop + createdb swh-deposit-dev + +db-prepare: + $(MANAGE) makemigrations + +db-migrate: + $(MANAGE) migrate + +db-load-data: + $(MANAGE) loaddata deposit_data + +db-load-private-data: db-load-data + $(MANAGE) loaddata ../private_data.yaml + +run-dev: + $(MANAGE) runserver + +run: + gunicorn3 -b 127.0.0.1:5006 swh.deposit.wsgi + +# Override default rule to make sure DJANGO env var is properly set. It +# *should* work without any override thanks to the mypy django-stubs plugin, +# but it currently doesn't; see +# https://github.com/typeddjango/django-stubs/issues/166 +check-mypy: + DJANGO_SETTINGS_MODULE=swh.deposit.settings.testing $(MYPY) $(MYPYFLAGS) swh diff --git a/PKG-INFO b/PKG-INFO index 0e524373..5b467b4c 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,37 +1,37 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 0.0.90 +Version: 0.1.0 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-deposit Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-deposit/ Description: # swh-deposit This is [Software Heritage](https://www.softwareheritage.org)'s [SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server implementation, as well as a simple client to upload deposits on the server. 
**S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository **D**eposit) is an interoperability standard for digital file deposit. This implementation will permit interaction between a client (a repository) and a server (SWH repository) to permit deposits of software source code archives and associated metadata. The documentation is at ./docs/README-specification.md Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: server diff --git a/bin/Makefile b/bin/Makefile new file mode 100644 index 00000000..7a8898ff --- /dev/null +++ b/bin/Makefile @@ -0,0 +1,45 @@ +DEPOSIT_ID=1 +ARCHIVE=../../swh-deposit.zip +ARCHIVE2=../../swh-model.zip +STATUS=--no-partial +PARTIAL_STATUS=--partial +UPDATE_STATUS='done' +ATOM_ENTRY=../../atom-entry.xml +EXTERNAL_ID='external-id' + +create-archives: + 7z a $(ARCHIVE) $(FOLDER) + 7z a $(ARCHIVE2) $(FOLDER2) + +new: + ./create_deposit.sh $(ARCHIVE) $(STATUS) + +new-complete: + ./create_deposit_with_metadata.sh $(ARCHIVE) $(ATOM_ENTRY) $(STATUS) $(EXTERNAL_ID) + +new-partial: + make new STATUS=$(PARTIAL_STATUS) ARCHIVE=$(ARCHIVE) + +update: + ./update-deposit-with-another-archive.sh $(DEPOSIT_ID) $(ARCHIVE2) $(STATUS) + +update-partial: + make update DEPOSIT_ID=$(DEPOSIT_ID) ARCHIVE2=$(ARCHIVE2) STATUS=$(PARTIAL_STATUS) + +replace: + ./replace-deposit-archive.sh $(ARCHIVE2) $(DEPOSIT_ID) + +download: + ./download-deposit-archive.sh $(DEPOSIT_ID) + +status: + ./status.sh $(DEPOSIT_ID) + +service-document: + ./service-document.sh + +home: + ./home.sh + +update-status: + ./update-status.sh $(DEPOSIT_ID) $(UPDATE_STATUS) diff --git a/bin/content.sh b/bin/content.sh new file mode 100755 index 
00000000..08012f94 --- /dev/null +++ b/bin/content.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +. ./default-setup + +DEPOSIT_ID=${1-1} + +curl -i -u "${CREDS}" ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/content/ diff --git a/bin/create_deposit.sh b/bin/create_deposit.sh new file mode 100755 index 00000000..41e4010b --- /dev/null +++ b/bin/create_deposit.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +. ./default-setup + +ARCHIVE=${1-'../../deposit.zip'} + +STATUS=${2-'--no-partial'} + +./swh-deposit \ + --username ${USER} \ + --password ${PASSWORD} \ + --collection ${COLLECTION} \ + --archive-deposit \ + --archive ${ARCHIVE} \ + ${STATUS} \ + --url ${SERVER}/1 diff --git a/bin/create_deposit_atom.sh b/bin/create_deposit_atom.sh new file mode 100755 index 00000000..db0c92d5 --- /dev/null +++ b/bin/create_deposit_atom.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +. ./default-setup + +ATOM=${1-'../../atom.xml'} +PROGRESS=${2-'false'} + +curl -i -u "$CREDS" \ + --data-binary @${ATOM} \ + -X POST \ + -H "In-Progress: ${PROGRESS}" \ + -H 'Content-Type: application/atom+xml;type=entry' \ + -H 'Slug: external-id' \ + -H 'Packaging: http://purl.org/net/sword/package/SimpleZip' \ + ${SERVER}/1/${COLLECTION}/ diff --git a/bin/create_deposit_with_metadata.sh b/bin/create_deposit_with_metadata.sh new file mode 100755 index 00000000..d93a85bc --- /dev/null +++ b/bin/create_deposit_with_metadata.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +. 
./default-setup + +ARCHIVE=${1-'../../swh-deposit.zip'} +ATOM_ENTRY=${2-'../../atom-entry.xml'} + +STATUS=${3-'--no-partial'} +EXTERNAL_ID=${4-'external-id'} + +./swh-deposit \ + --username ${USER} \ + --password ${PASSWORD} \ + --collection ${COLLECTION} \ + --archive-deposit \ + --archive ${ARCHIVE} \ + --metadata-deposit \ + --metadata ${ATOM_ENTRY} \ + --slug ${EXTERNAL_ID} \ + ${STATUS} \ + --url ${SERVER}/1 diff --git a/bin/default-setup b/bin/default-setup new file mode 100644 index 00000000..c27054f9 --- /dev/null +++ b/bin/default-setup @@ -0,0 +1,5 @@ +SERVER=http://127.0.0.1:5006 +USER='hal' +PASSWORD='hal' +COLLECTION=hal +CREDS="$USER:$PASSWORD" diff --git a/bin/download-deposit-archive.sh b/bin/download-deposit-archive.sh new file mode 100755 index 00000000..2b875e31 --- /dev/null +++ b/bin/download-deposit-archive.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +. ./default-setup + +DEPOSIT_ID=${1-1} + +curl ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/raw/ diff --git a/bin/home.sh b/bin/home.sh new file mode 100755 index 00000000..d3cf2df4 --- /dev/null +++ b/bin/home.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +. ./default-setup + +curl ${SERVER} +echo diff --git a/bin/replace-deposit-archive.sh b/bin/replace-deposit-archive.sh new file mode 100755 index 00000000..5b2b50d2 --- /dev/null +++ b/bin/replace-deposit-archive.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +. 
./default-setup + +ARCHIVE=${1-'../../swh-model.zip'} +NAME=$(basename ${ARCHIVE}) + +MD5=$(md5sum ${ARCHIVE} | cut -f 1 -d' ') + +DEPOSIT_ID=${2-1} + +curl -i -u "$CREDS" \ + -X PUT \ + --data-binary @${ARCHIVE} \ + -H "In-Progress: false" \ + -H "Content-MD5: ${MD5}" \ + -H "Content-Disposition: attachment; filename=${NAME}" \ + -H 'Slug: external-id' \ + -H 'Packaging: http://purl.org/net/sword/package/SimpleZip' \ + -H 'Content-type: application/zip' \ + ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/media/ diff --git a/bin/service-document.sh b/bin/service-document.sh new file mode 100755 index 00000000..f7818a11 --- /dev/null +++ b/bin/service-document.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +. ./default-setup + +curl -i -u "${CREDS}" ${SERVER}/1/servicedocument/ diff --git a/bin/status.sh b/bin/status.sh new file mode 100755 index 00000000..6d3662d1 --- /dev/null +++ b/bin/status.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +. ./default-setup + +DEPOSIT_ID=${1-1} + +./swh-deposit \ + --username ${USER} \ + --password ${PASSWORD} \ + --collection ${COLLECTION} \ + --status \ + --deposit-id ${DEPOSIT_ID} \ + --url ${SERVER}/1 diff --git a/bin/update-deposit-with-another-archive.sh b/bin/update-deposit-with-another-archive.sh new file mode 100755 index 00000000..644e7b80 --- /dev/null +++ b/bin/update-deposit-with-another-archive.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +. 
./default-setup + +DEPOSIT_ID=${1-1} +ARCHIVE=${2-'../../swh-core.zip'} + +NAME=$(basename ${ARCHIVE}) +MD5=$(md5sum ${ARCHIVE} | cut -f 1 -d' ') +PROGRESS=${3-'false'} + +curl -i -u "${CREDS}" \ + -X POST \ + --data-binary @${ARCHIVE} \ + -H "In-Progress: ${PROGRESS}" \ + -H "Content-MD5: ${MD5}" \ + -H "Content-Disposition: attachment; filename=${NAME}" \ + -H 'Slug: external-id-2' \ + -H 'Packaging: http://purl.org/net/sword/package/SimpleZip' \ + -H 'Content-type: application/zip' \ + ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/media/ diff --git a/bin/update-status.sh b/bin/update-status.sh new file mode 100755 index 00000000..c5925a41 --- /dev/null +++ b/bin/update-status.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +. ./default-setup + +DEPOSIT_ID=${1-1} +UPDATE_STATUS=${2-'done'} + +curl -i \ + -X PUT \ + -H 'Content-Type: application/json' \ + -d "{\"status\": \"${UPDATE_STATUS}\"}" \ + ${SERVER}/1/${COLLECTION}/${DEPOSIT_ID}/update/ diff --git a/conftest.py b/conftest.py new file mode 100644 index 00000000..16d82778 --- /dev/null +++ b/conftest.py @@ -0,0 +1,15 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.storage.pytest_plugin"] + + +@pytest.fixture(scope="session") +def swh_scheduler_celery_includes(swh_scheduler_celery_includes): + return swh_scheduler_celery_includes + [ + "swh.deposit.loader.tasks", + ] diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..e379dea1 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,4 @@ +_build/ +apidoc/ +*-stamp + diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..42355755 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,3 @@ +include ../../swh-docs/Makefile.sphinx + +APIDOC_EXCLUDES += 
../swh/*/settings/* diff --git a/swh/deposit/api/__init__.py b/docs/_static/.placeholder similarity index 100% copy from swh/deposit/api/__init__.py copy to docs/_static/.placeholder diff --git a/swh/deposit/api/__init__.py b/docs/_templates/.placeholder similarity index 100% copy from swh/deposit/api/__init__.py copy to docs/_templates/.placeholder diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..5a0b8f31 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,8 @@ +import os + +import django + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "swh.deposit.settings.development") +django.setup() + +from swh.docs.sphinx.conf import * # NoQA diff --git a/docs/dev-info.rst b/docs/dev-info.rst new file mode 100644 index 00000000..1d613d0c --- /dev/null +++ b/docs/dev-info.rst @@ -0,0 +1,176 @@ +Hacking on swh-deposit +====================== + +There are multiple modes to run and test the server locally: + +* development-like (automatic reloading when code changes) +* production-like (no reloading) +* integration tests (no side effects) + +Except for the tests which are mostly side effects free (except for the +database access), the other modes will need some configuration files (up to 2) +to run properly. + +Database +-------- + +swh-deposit uses a database to store the state of a deposit. The default +db is expected to be called swh-deposit-dev. + +To simplify the use, the following makefile targets can be used: + +schema +~~~~~~ + +.. code:: shell + + make db-create db-prepare db-migrate + +data +~~~~ + +Once the db is created, you need some data to be injected (request +types, client, collection, etc...): + +.. code:: shell + + make db-load-data db-load-private-data + +The private data are about having a user (``hal``) with a password +(``hal``) who can access a collection (``hal``). + +Add the following to ``../private_data.yaml``: + +.. 
code:: yaml + + - model: deposit.depositclient + fields: + user_ptr_id: 1 + collections: + - 1 + - model: auth.User + pk: 1 + fields: + first_name: hal + last_name: hal + username: hal + password: "pbkdf2_sha256$30000$8lxjoGc9PiBm$DO22vPUJCTM17zYogBgBg5zr/97lH4pw10Mqwh85yUM=" + - model: deposit.depositclient + fields: + user_ptr_id: 1 + collections: + - 1 + url: https://hal.inria.fr + +drop +~~~~ + +For information, you can drop the db: + +.. code:: shell + + make db-drop + +Development-like environment +---------------------------- + +Development-like environment needs one configuration file to work +properly. + +Configuration +~~~~~~~~~~~~~ + +**``{/etc/softwareheritage | ~/.config/swh | ~/.swh}``/deposit/server.yml**: + +.. code:: yaml + + # dev option for running the server locally + host: 127.0.0.1 + port: 5006 + + # production + authentication: + activated: true + white-list: + GET: + - / + + # 20 Mib max size + max_upload_size: 20971520 + +Run +~~~ + +Run the local server, using the default configuration file: + +.. code:: shell + + make run-dev + +Production-like environment +--------------------------- + +Production-like environment needs an additional section in the +configuration file to work properly. + +This is closer to what's actually running in production. + +Configuration +~~~~~~~~~~~~~ + +This expects the same file described in the previous chapter. Plus, an +additional private section file containing private information that is +not in the source code repository. + +**``{/etc/softwareheritage | ~/.config/swh | ~/.swh}``/deposit/private.yml**: + +.. code:: yaml + + private: + secret_key: production-local + db: + name: swh-deposit-dev + +A production configuration file would look like: + +.. code:: yaml + + private: + secret_key: production-secret-key + db: + name: swh-deposit-dev + host: db + port: 5467 + user: user + password: user-password + +Run +~~~ + +.. 
code:: shell + + make run + +Note: This expects the gunicorn3 package to be installed on the system + +Tests +----- + +To run the tests: + +.. code:: shell + + make test + +As explained, those tests are mostly side-effect free. The db part is +dealt with by django. The remaining part, which patches those side-effect +behaviors, is dealt with in the ``swh/deposit/tests/__init__.py`` module. + +Sum up +------ + +Prepare everything for your user to run: + +.. code:: shell + + make db-drop db-create db-prepare db-migrate db-load-private-data run-dev diff --git a/docs/endpoints/collection.rst b/docs/endpoints/collection.rst new file mode 100644 index 00000000..53219258 --- /dev/null +++ b/docs/endpoints/collection.rst @@ -0,0 +1,73 @@ +Create deposit +^^^^^^^^^^^^^^^ + +.. http:post:: /1// + + Create deposit in a collection. + + The client sends a deposit request to a specific collection with: + + * an archive holding the software source code (binary upload) + * an envelope with metadata describing information regarding a deposit (atom + entry deposit) + + Also known as: COL-IRI + + :param text : the client's credentials + :param text Content-Type: accepted mimetype + :param int Content-Length: tarball size + :param text Content-MD5: md5 checksum hex encoded of the tarball + :param text Content-Disposition: attachment; filename=[filename]; the filename + parameter must be text (ascii) + :param text Content-Disposition: for the metadata file set name parameter + to 'atom'. + :param bool In-progress: true if not final; false when final request. + :statuscode 201: success for deposit on POST + :statuscode 401: Unauthorized + :statuscode 404: access to an unknown collection + :statuscode 415: unsupported media type + +Sample request +~~~~~~~~~~~~~~~ +.. 
code:: shell + + curl -i -u hal: \ + -F "file=@../deposit.json;type=application/zip;filename=payload" \ + -F "atom=@../atom-entry.xml;type=application/atom+xml;charset=UTF-8" \ + -H 'In-Progress: false' \ + -H 'Slug: some-external-id' \ + -XPOST https://deposit.softwareheritage.org/1/hal/ + +Sample response +~~~~~~~~~~~~~~~ + +.. code:: shell + + HTTP/1.0 201 Created + Date: Tue, 26 Sep 2017 10:32:35 GMT + Server: WSGIServer/0.2 CPython/3.5.3 + Vary: Accept, Cookie + Allow: GET, POST, PUT, DELETE, HEAD, OPTIONS + Location: /1/hal/10/metadata/ + X-Frame-Options: SAMEORIGIN + Content-Type: application/xml + + + 10 + Sept. 26, 2017, 10:32 a.m. + None + deposited + + + + + + + + + + + http://purl.org/net/sword/package/SimpleZip + diff --git a/docs/endpoints/content.rst b/docs/endpoints/content.rst new file mode 100644 index 00000000..ef89d1e9 --- /dev/null +++ b/docs/endpoints/content.rst @@ -0,0 +1,14 @@ +Display content +^^^^^^^^^^^^^^^^ + +.. http:get:: /1///content/ + + Display information on the content's representation in the sword + server. + + + Also known as: CONT-FILE-IRI + + :param text : the client's credentials + :statuscode 200: no error + :statuscode 401: Unauthorized diff --git a/docs/endpoints/service-document.rst b/docs/endpoints/service-document.rst new file mode 100644 index 00000000..97a7af19 --- /dev/null +++ b/docs/endpoints/service-document.rst @@ -0,0 +1,48 @@ +Service document +^^^^^^^^^^^^^^^^^ + +.. http:get:: /1/servicedocument/ + + This is the starting endpoint for the client to discover its initial + collection. The answer to this query will describe: + + * the server's abilities + * connected client's collection information + + Also known as: SD-IRI - The Service Document IRI + + :param text : the client's credentials + :statuscode 200: no error + :statuscode 401: Unauthorized + + + +Sample response +~~~~~~~~~~~~~~~ + .. 
code:: xml + + + + + 2.0 + 20971520 + + + The Software Heritage (SWH) archive + + SWH Software Archive + application/zip + application/x-tar + Collection Policy + Software Heritage Archive + false + false + Collect, Preserve, Share + http://purl.org/net/sword/package/SimpleZip + https://deposit.softwareheritage.org/1/hal/ + + + diff --git a/docs/endpoints/status.rst b/docs/endpoints/status.rst new file mode 100644 index 00000000..ca773b0b --- /dev/null +++ b/docs/endpoints/status.rst @@ -0,0 +1,74 @@ +Retrieve status +^^^^^^^^^^^^^^^^ + +.. http:get:: /1/// + + Returns deposit's status. + + The different statuses: + + - **partial**: multipart deposit is still ongoing + - **deposited**: deposit completed, ready for checks + - **rejected**: deposit failed the checks + - **verified**: content and metadata verified, ready for loading + - **loading**: loading in-progress + - **done**: loading completed successfully + - **failed**: the deposit loading has failed + + Also known as STATE-IRI + + :param text : the client's credentials + :statuscode 201: with the deposit's status + :statuscode 401: Unauthorized + :statuscode 404: access to an unknown deposit + + +Rejected deposit +~~~~~~~~~~~~~~~~ + +A deposit may be rejected. In that case, the +`deposit_status_detail` entry will explain failed checks. + +Many reasons are possible; here are some: + +- Deposit without software archive (main goal of the deposit is to + deposit software source code) + +- Deposit with malformed software archive (i.e. archive within archive) + +- Deposit with invalid software archive (corrupted archive, although, + this one should happen during upload and not during checks) + +- Deposit with unsupported archive format + +- Deposit with missing metadata + + +Sample response +~~~~~~~~~~~~~~~ + + Successful deposit: + + .. 
code:: xml + + + 160 + done + The deposit has been successfully loaded into the Software Heritage archive + swh:1:dir:d83b7dda887dc790f7207608474650d4344b8df9 + swh:1:dir:d83b7dda887dc790f7207608474650d4344b8df9;origin=https://forge.softwareheritage.org/source/jesuisgpl/;visit=swh:1:snp:68c0d26104d47e278dd6be07ed61fafb561d0d20;anchor=swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb;path=/ + + + Rejected deposit: + + .. code:: xml + + + 148 + rejected + - At least one url field must be compatible with the client's domain name (codemeta:url) + diff --git a/docs/endpoints/update-media.rst b/docs/endpoints/update-media.rst new file mode 100644 index 00000000..de32634c --- /dev/null +++ b/docs/endpoints/update-media.rst @@ -0,0 +1,27 @@ +Update content +^^^^^^^^^^^^^^^ + +.. http:post:: /1///media/ + + Add archive(s) to a deposit. Only possible if the deposit's status + is partial. + +.. http:put:: /1///media/ + + Replace all content by submitting a new archive. Only possible if + the deposit's status is partial. + + + Also known as: *update iri* (EM-IRI) + + :param text : the client's credentials + :param text Content-Type: accepted mimetype + :param int Content-Length: tarball size + :param text Content-MD5: md5 checksum hex encoded of the tarball + :param text Content-Disposition: attachment; filename=[filename] ; the filename + parameter must be text (ascii) + :param bool In-progress: true if not final; false when final request. + :statuscode 204: success without payload on PUT + :statuscode 201: success for deposit on POST + :statuscode 401: Unauthorized + :statuscode 415: unsupported media type diff --git a/docs/endpoints/update-metadata.rst b/docs/endpoints/update-metadata.rst new file mode 100644 index 00000000..661d7516 --- /dev/null +++ b/docs/endpoints/update-metadata.rst @@ -0,0 +1,24 @@ +Update metadata +^^^^^^^^^^^^^^^^ + +.. http:post:: /1///metadata/ + + Add metadata to a deposit. Only possible if the deposit's status + is partial. + +.. 
http:put:: /1///metadata/ + + Replace all metadata by submitting a new metadata file. Only possible if + the deposit's status is partial. + + + Also known as: *update iri* (SE-IRI) + + :param text : the client's credentials + :param text Content-Disposition: attachment; filename=[filename] ; the filename + parameter must be text (ascii), with a name parameter set to 'atom'. + :param bool In-progress: true if not final; false when final request. + :statuscode 204: success without payload on PUT + :statuscode 201: success for deposit on POST + :statuscode 401: Unauthorized + :statuscode 415: unsupported media type diff --git a/docs/getting-started.rst b/docs/getting-started.rst new file mode 100644 index 00000000..6915e309 --- /dev/null +++ b/docs/getting-started.rst @@ -0,0 +1,284 @@ +Getting Started +=============== + +This is a guide for how to prepare and push a software deposit with +the `swh deposit` commands. + +The API is rooted at https://deposit.softwareheritage.org/1. + +For more details, see the `main documentation <./index.html>`__. + +Requirements +------------ + +You need to be referenced on SWH's client list to have: + +* credentials (needed for the basic authentication step) + + - in this document we reference ```` as the client's name and + ```` as its associated authentication password. + +* an associated collection_. + + +.. _collection: https://bitworking.org/projects/atom/rfc5023#rfc.section.8.3.3 + + +`Contact us for more information. +`__ + +Prepare a deposit +----------------- +* compress the files in a supported archive format: + + - zip: common zip archive (no multi-disk zip files). 
+ - tar: tar archive without compression or optionally any of the + following compression algorithms: gzip (`.tar.gz`, `.tgz`), bzip2 + (`.tar.bz2`), or lzma (`.tar.lzma`) + +* (Optional) prepare a metadata file (more details :ref:`deposit-metadata`): + + +Push deposit +------------ +You can push a deposit with: + +* a single deposit (archive + metadata): + + The user posts in one query a software + source code archive and associated metadata. + The deposit is directly marked with status ``deposited``. + +* a multisteps deposit: + + 1. Create an incomplete deposit (marked with status ``partial``) + 2. Add data to a deposit (in multiple requests if needed) + 3. Finalize deposit (the status becomes ``deposited``) + + +Single deposit +^^^^^^^^^^^^^^ + + +Once the files are ready for deposit, we want to do the actual deposit +in one shot, sending exactly one POST query: + +* 1 archive (content-type ``application/zip`` or ``application/x-tar``) +* 1 metadata file in atom xml format (``content-type: application/atom+xml;type=entry``) + +For this, we need to provide the: + +* arguments: ``--username 'name' --password 'pass'`` as credentials +* archive's path (example: ``--archive path/to/archive-name.tgz``) +* software's name (optional if a metadata filepath is specified and the + artifact's name is included in the metadata file). +* author's name (optional if a metadata filepath is specified and the authors + are included in the metadata file). This can be specified multiple times in + case of multiple authors. +* (optionally) metadata file's path ``--metadata + path/to/file.metadata.xml``. +* (optionally) ``--slug 'your-id'`` argument, a reference to a unique identifier + the client uses for the software object. If not provided, a UUID will be + generated by SWH. + +You can do this with the following command: + +minimal deposit + +.. 
code:: shell + + $ swh deposit upload --username name --password secret \ + --author "Jane Doe" \ + --author "John Doe" \ + --name 'je-suis-gpl' \ + --archive je-suis-gpl.tgz + +with client's external identifier (``slug``) + +.. code:: shell + + $ swh deposit upload --username name --password secret \ + --author "Jane Doe" \ + --name 'je-suis-gpl' \ + --archive je-suis-gpl.tgz \ + --slug je-suis-gpl + +to a specific client's collection + +.. code:: shell + + $ swh deposit upload --username name --password secret \ + --author "Jane Doe" \ + --name 'je-suis-gpl' \ + --archive je-suis-gpl.tgz \ + --collection 'second-collection' + + +You just posted a deposit to your collection on Software Heritage + + +If everything went well, the successful response will contain the +elements below: + +.. code:: shell + + { + 'deposit_status': 'deposited', + 'deposit_id': '7', + 'deposit_date': 'Jan. 29, 2018, 12:29 p.m.' + } + +Note: As the deposit is in ``deposited`` status, you can no longer +update the deposit after this query. It will be answered with a 403 +forbidden answer. + +If something went wrong, an equivalent response will be given with the +`error` and `detail` keys explaining the issue, e.g.: + +.. code:: shell + + { + 'error': 'Unknown collection name xyz', + 'detail': None, + 'deposit_status': None, + 'deposit_status_detail': None, + 'deposit_swh_id': None, + 'status': 404 + } + + + +multisteps deposit +^^^^^^^^^^^^^^^^^^^^^^^^^ +The steps to create a multisteps deposit: + +1. Create an incomplete deposit +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +First use the ``--partial`` argument to declare there is more to come + +.. code:: shell + + $ swh deposit upload --username name --password secret \ + --archive foo.tar.gz \ + --partial + + +2. Add content or metadata to the deposit +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Continue the deposit by using the ``--deposit-id`` argument given as a response +for the first step. 
You can continue adding content or metadata while you use +the ``--partial`` argument. + +To only add one new archive to the deposit: + +.. code:: shell + + $ swh deposit upload --username name --password secret \ + --archive add-foo.tar.gz \ + --deposit-id 42 \ + --partial + +To only add metadata to the deposit: + +.. code:: shell + + $ swh deposit upload --username name --password secret \ + --metadata add-foo.tar.gz.metadata.xml \ + --deposit-id 42 \ + --partial + +or: +.. code:: shell + + $ swh deposit upload --username name --password secret \ + --name 'add-foo' --author 'someone' \ + --deposit-id 42 \ + --partial + + +3. Finalize deposit +~~~~~~~~~~~~~~~~~~~ + +On your last addition (same command as before), by not declaring it +``--partial``, the deposit will be considered completed. Its status will be +changed to ``deposited`` + + +Update deposit +---------------- +* replace deposit: + + - only possible if the deposit status is ``partial`` and + ``--deposit-id `` is provided + + - by using the ``--replace`` flag + + - ``--metadata-deposit`` replaces associated existing metadata + - ``--archive-deposit`` replaces associated archive(s) + - by default, with no flag or both, you'll replace associated + metadata and archive(s): + +.. code:: shell + + $ swh deposit upload --username name --password secret \ + --deposit-id 11 \ + --archive updated-je-suis-gpl.tgz \ + --replace + +* update a loaded deposit with a new version: + + - by using the external-id with the ``--slug`` argument, you will + link the new deposit with its parent deposit: + +.. code:: shell + + $ swh deposit upload --username name --password secret \ + --archive je-suis-gpl-v2.tgz \ + --slug 'je-suis-gpl' \ + + + +Check the deposit's status +-------------------------- + +You can check the status of the deposit by using the ``--deposit-id`` argument: + +.. code:: shell + + $ swh deposit status --username name --password secret \ + --deposit-id 11 + +.. 
code:: json + + { + 'deposit_id': '11', + 'deposit_status': 'deposited', + 'deposit_swh_id': None, + 'deposit_status_detail': 'Deposit is ready for additional checks \ + (tarball ok, metadata, etc...)' + } + +The different statuses: + +- **partial**: multipart deposit is still ongoing +- **deposited**: deposit completed +- **rejected**: deposit failed the checks +- **verified**: content and metadata verified +- **loading**: loading in-progress +- **done**: loading completed successfully +- **failed**: the deposit loading has failed + +When the deposit has been loaded into the archive, the status will be +marked ``done``. The response will then also contain the +``deposit_swh_id`` and ``deposit_swh_id_context``. For example: + +.. code:: json + + { + 'deposit_id': '11', + 'deposit_status': 'done', + 'deposit_swh_id': 'swh:1:dir:d83b7dda887dc790f7207608474650d4344b8df9', + 'deposit_swh_id_context': 'swh:1:dir:d83b7dda887dc790f7207608474650d4344b8df9;origin=https://forge.softwareheritage.org/source/jesuisgpl/;visit=swh:1:snp:68c0d26104d47e278dd6be07ed61fafb561d0d20;anchor=swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb;path=/', + 'deposit_status_detail': 'The deposit has been successfully \ + loaded into the Software Heritage archive' + } diff --git a/docs/images/deposit-create-chart.png b/docs/images/deposit-create-chart.png new file mode 100644 index 00000000..97c4eb45 Binary files /dev/null and b/docs/images/deposit-create-chart.png differ diff --git a/docs/images/deposit-delete-chart.png b/docs/images/deposit-delete-chart.png new file mode 100644 index 00000000..d9ba8ec4 Binary files /dev/null and b/docs/images/deposit-delete-chart.png differ diff --git a/docs/images/deposit-update-chart.png b/docs/images/deposit-update-chart.png new file mode 100644 index 00000000..d84eb52f Binary files /dev/null and b/docs/images/deposit-update-chart.png differ diff --git a/docs/images/status.png b/docs/images/status.png new file mode 100644 index 00000000..00137fff Binary files /dev/null and b/docs/images/status.png
differ diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..486a1e41 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,28 @@ +.. _swh-deposit: + +Software Heritage - Deposit +=========================== + +Push-based deposit of software source code artifacts to the archive. + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + getting-started + spec-api + metadata + dev-info + sys-info + specs/specs + tests/tests_HAL.rst + + +Reference Documentation +----------------------- + +.. toctree:: + :maxdepth: 2 + + /apidoc/swh.deposit diff --git a/docs/metadata.rst b/docs/metadata.rst new file mode 100644 index 00000000..f8e1cda2 --- /dev/null +++ b/docs/metadata.rst @@ -0,0 +1,185 @@ +.. _deposit-metadata: + +Deposit metadata +================ + +When making a software deposit into the SWH archive, one can add +information describing the software artifact and the software project. + + +Metadata requirements +--------------------- + +- **the schema/vocabulary** used *MUST* be specified with a persistent url + (DublinCore, DOAP, CodeMeta, etc.) + + .. code:: xml + + + or + + or + + +- **the name** of the software deposit *MUST* be provided [atom:title, + codemeta:name, dcterms:title] + +- **the authors** of the software deposit *MUST* be provided + +- **the url** representing the location of the source *MAY* be provided under + the url tag. The url will be used for creating an origin object in the + archive. + + .. code:: xml + + www.url-example.com + +- **the external\_identifier** *MAY* be provided as an identifier + +- **the external\_identifier** *SHOULD* match the Slug external-identifier in + the header + +- **the description** of the software deposit *SHOULD* be provided + [codemeta:description]: short or long description of the software + +- **the license/s** of the software + deposit *SHOULD* be provided [codemeta:license] + +- other metadata *MAY* be added with terms defined by the schema in use. 
+ +Examples +-------- + +Using only Atom +~~~~~~~~~~~~~~~ + +.. code:: xml + + + + Awesome Compiler + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 1785io25c695 + 2017-10-07T15:17:08Z + some awesome author + + +Using Atom with CodeMeta +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: xml + + + + Awesome Compiler + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 1785io25c695 + 1785io25c695 + origin url + other identifier, DOI, ARK + Domain + + description + key-word 1 + key-word 2 + creation date + publication date + comment + + article name + article id + + + Collaboration/Projet + project name + id + + see also + Sponsor A + Sponsor B + Platform/OS + dependencies + Version + active + + license + url spdx + + .Net Framework 3.0 + Python2.3 + + author1 + Inria + UPMC + + + author2 + Inria + UPMC + + http://code.com + language 1 + language 2 + http://issuetracker.com + + +Using Atom with DublinCore and CodeMeta (multi-schema entry) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: xml + + + + Awesome Compiler + hal + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + %s + hal-01587361 + doi:10.5281/zenodo.438684 + The assignment problem + AffectationRO + author + [INFO] Computer Science [cs] + [INFO.INFO-RO] Computer Science [cs]/Operations Research [cs.RO] + SOFTWARE + Project in OR: The assignment problemA java implementation for the assignment problem first release + description fr + 2015-06-01 + 2017-10-19 + en + + + origin url + + 1.0.0 + key word + Comment + Référence interne + + link + Sponsor + + Platform/OS + dependencies + Ended + + license + url spdx + + + http://code.com + language 1 + language 2 + + +Note +---- +We aim at harmonizing the metadata from different origins and thus +metadata will be translated to the `CodeMeta +v.2 `__ vocabulary if +possible.
diff --git a/docs/spec-api.rst b/docs/spec-api.rst new file mode 100644 index 00000000..4a6b3cc2 --- /dev/null +++ b/docs/spec-api.rst @@ -0,0 +1,112 @@ +API Specification +================= + +This is `Software Heritage `__'s +`SWORD +2.0 `__ +Server implementation. + +**S.W.O.R.D** (**S**\ imple **W**\ eb-Service **O**\ ffering +**R**\ epository **D**\ eposit) is an interoperability standard for +digital file deposit. + +This implementation will permit interaction between a client (a repository) and +a server (SWH repository) to push deposits of software source code archives +with associated metadata. + +*Note:* + +* In the following document, we will use the ``archive`` or ``software source + code archive`` interchangeably. +* The supported archive formats are: + + * zip: common zip archive (no multi-disk zip files). + * tar: tar archive without compression or optionally any of the following + compression algorithm gzip (.tar.gz, .tgz), bzip2 (.tar.bz2) , or lzma + (.tar.lzma) + +Collection +---------- + +SWORD defines a ``collection`` concept. In SWH's case, this collection +refers to a group of deposits. A ``deposit`` is some form of software +source code archive(s) associated with metadata. +By default the client's collection will have the client's name. + +Limitations +----------- +* upload limitation of 100Mib +* no mediation + +API overview +------------ + +API access is over HTTPS. + +The API is protected through basic authentication. + + +Endpoints +--------- + +The API endpoints are rooted at https://deposit.softwareheritage.org/1/. + +Data is sent and received as XML (as specified in the SWORD 2.0 +specification). + +.. include:: endpoints/service-document.rst + +.. include:: endpoints/collection.rst + +.. include:: endpoints/update-media.rst + +.. include:: endpoints/update-metadata.rst + +.. include:: endpoints/status.rst + +.. 
include:: endpoints/content.rst + + +Possible errors: +---------------- + +* common errors: + + * 401 (unauthenticated) if a client does not provide credentials or provides + wrong ones + * 403 (forbidden) if a client tries to access a collection it does not own + * 404 (not found) if a client tries to access an unknown collection + * 404 (not found) if a client tries to access an unknown deposit + * 415 (unsupported media type) if a wrong media type is provided to the + endpoint + +* archive/binary deposit: + + * 403 (forbidden) if the length of the archive exceeds the max size + configured + * 412 (precondition failed) if the length or hash provided does not match the + reality of the archive. + * 415 (unsupported media type) if a wrong media type is provided + +* multipart deposit: + + * 412 (precondition failed) if the md5 hash provided does not match the reality of + the archive + * 415 (unsupported media type) if a wrong media type is provided + +* Atom entry deposit: + + * 400 (bad request) if the request's body is empty (for creation only) + + + + +Sources +------- + +* `SWORD v2 specification + `__ +* `arxiv documentation `__ +* `Dataverse example `__ +* `SWORD used on HAL `__ +* `xml examples for CCSD `__ diff --git a/docs/specs/blueprint.rst b/docs/specs/blueprint.rst new file mode 100644 index 00000000..fb762efd --- /dev/null +++ b/docs/specs/blueprint.rst @@ -0,0 +1,114 @@ +Use cases +--------- + + +Deposit creation +~~~~~~~~~~~~~~~~ + +From client's deposit repository server to SWH's repository server: + +1. The client requests the server's abilities and its associated collection + (GET query to the *SD/service document uri*) + +2. The server answers the client with the service document which gives the + *collection uri* (also known as *COL/collection IRI*). + +3. The client sends a deposit (optionally a zip archive, some metadata or both) + through the *collection uri*. + + This can be done in: + + * one POST request (metadata + archive).
+ * one POST request (metadata or archive) + other PUT or POST request to the + *update uris* (*edit-media iri* or *edit iri*) + + a. Server validates the client's input or returns detailed error if any + + b. Server stores information received (metadata or software archive source + code or both) + +4. The server notifies the client it acknowledged the client's request. An + ``http 201 Created`` response with a deposit receipt in the response body is + sent back. That deposit receipt will hold the necessary information to + eventually complete the deposit later on if it was incomplete (also known as + status ``partial``). + +Schema representation +^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +.. figure:: ../images/deposit-create-chart.png + :alt: + + +Updating an existing deposit +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +5. Client updates existing deposit through the *update uris* (one or more POST + or PUT requests to either the *edit-media iri* or *edit iri*). + + 1. Server validates the client's input or returns detailed error if any + + 2. Server stores information received (metadata or software archive source + code or both) + + This would be the case for example if the client initially posted a + ``partial`` deposit (e.g. only metadata with no archive, or an archive + without metadata, or a split archive because the initial one exceeded + the limit size imposed by swh repository deposit) + +Schema representation +^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +.. figure:: ../images/deposit-update-chart.png + :alt: + +Deleting deposit (or associated archive, or associated metadata) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +6. Deposit deletion is possible as long as the deposit is still in ``partial`` + state. + + 1. Server validates the client's input or returns detailed error if any + 2. Server actually deletes the information according to the request + +Schema representation +^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +..
figure:: ../images/deposit-delete-chart.png + :alt: + +Client asks for operation status +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +7. Operation status can be read through a GET query to the *state iri*. + +Server: Triggering deposit checks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once the status ``deposited`` is reached for a deposit, checks for the +associated archive(s) and metadata will be triggered. If those checks +fail, the status is changed to ``rejected`` and nothing more happens +there. Otherwise, the status is changed to ``verified``. + +Server: Triggering deposit load +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once the status ``verified`` is reached for a deposit, loading the +deposit with its associated metadata will be triggered. + +The loading will result in a status update, either ``done`` or ``failed`` +(depending on the loading's status). + +This is described in the `loading document <./spec-loading.html>`__. diff --git a/docs/specs/metadata_example.xml b/docs/specs/metadata_example.xml new file mode 100644 index 00000000..e30cf4b3 --- /dev/null +++ b/docs/specs/metadata_example.xml @@ -0,0 +1,31 @@ + + + + HAL + hal@ccsd.cnrs.fr + + hal + hal-01243573 + The assignment problem + https://hal.archives-ouvertes.fr/hal-01243573 + other identifier, DOI, ARK + Domain + description + + author1 + Inria + UPMC + + + author2 + Inria + UPMC + + + + + + + diff --git a/docs/specs/spec-loading.rst b/docs/specs/spec-loading.rst new file mode 100644 index 00000000..f8aaab0a --- /dev/null +++ b/docs/specs/spec-loading.rst @@ -0,0 +1,450 @@ +Loading specification +===================== + +An important part of the deposit specifications is the loading procedure where +a deposit is ingested into the Software Heritage archive, using +the tarball loader and the complete process of software artifacts creation +in the archive. + +Tarball Loading +--------------- + +The ``swh-loader-tar`` module is already able to inject tarballs in swh +with very limited metadata (mainly the origin).
+ +The loading of the deposit will use the deposit's associated data: + +* the metadata +* the archive(s) + + +Artifacts creation +------------------ + +Deposit to artifacts mapping +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a global view of the deposit ingestion + ++------------------------------------+-----------------------------------------+ +| swh artifact | representation in deposit | ++====================================+=========================================+ +| origin | https://hal.inria.fr/hal-id | ++------------------------------------+-----------------------------------------+ +| origin_metadata | aggregated metadata | ++------------------------------------+-----------------------------------------+ +| snapshot | reception of all occurrences (branches) | ++------------------------------------+-----------------------------------------+ +| branches | master & | +| | branch (optional): tag to release | ++------------------------------------+-----------------------------------------+ +| release | (optional) synthetic release created | +| | from metadata | ++------------------------------------+-----------------------------------------+ +| revision | synthetic revision pointing to | +| | the expanded submitted tarball | ++------------------------------------+-----------------------------------------+ +| directory | root directory of the expanded submitted| +| | tarball | ++------------------------------------+-----------------------------------------+ + + +Origin artifact +~~~~~~~~~~~~~~~ + +We create an origin URL by concatenating the client URI and the value of the +Slug header of the initial POST request of the deposit. + +.. code-block:: json + + { + "origin": { + "id": 89283768, + "origin_visits_url": "/api/1/origin/89283768/visits/", + "type": "deposit", + "url": "https://hal.archives-ouvertes.fr/hal-02140606" + } + } + +Visits +~~~~~~ + +We identify with a visit each deposit push of the same external_id. 
+Here in the example below, two snapshots are identified by two different visits. + +.. code-block:: json + + { + "visits": [ + { + "date": "2019-06-03T09:28:10.223007+00:00", + "origin": 89283768, + "origin_visit_url": "/api/1/origin/89283768/visit/2/", + "snapshot": "a3773941561cc557853898773a19c07cfe2efc5a", + "snapshot_url": "/api/1/snapshot/a3773941561cc557853898773a19c07cfe2efc5a/", + "status": "full", + "type": "deposit", + "visit": 2 + }, + { + "date": "2019-05-27T12:23:31.037273+00:00", + "origin": 89283768, + "origin_visit_url": "/api/1/origin/89283768/visit/1/", + "snapshot": "43fdb8291f1bf6962211c370e394f6abb1cbe01d", + "snapshot_url": "/api/1/snapshot/43fdb8291f1bf6962211c370e394f6abb1cbe01d/", + "status": "full", + "type": "deposit", + "visit": 1 + } + ] + } + +Snapshot artifact +~~~~~~~~~~~~~~~~~ + +The snapshot represents one deposit push. The ``HEAD`` branch points to a +synthetic revision. + + .. code-block:: json + + { + "snapshot": { + "branches": { + "HEAD": { + "target": "396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52", + "target_type": "revision", + "target_url": "/api/1/revision/396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52/" + } + }, + "id": "a3773941561cc557853898773a19c07cfe2efc5a", + "next_branch": null + } + } + +Note that previous versions of the deposit-loader named the branch ``master`` +instead, and created release branches under certain conditions. + +Release artifact +~~~~~~~~~~~~~~~~ + +.. warning:: + + This part of the specification is not implemented yet, only releases are + currently being created. + +The content is deposited with a set of descriptive metadata in the CodeMeta +vocabulary. 
The following CodeMeta terms imply that the +artifact is a release: + +- `releaseNotes` +- `softwareVersion` + +If present, a release artifact will be created with the mapping below: + ++-------------------+-----------------------------------+-----------------+----------------+ +| SWH release field | Description | CodeMeta term | Fallback value | ++===================+===================================+=================+================+ +| target | revision containing all metadata | X |X | ++-------------------+-----------------------------------+-----------------+----------------+ +| target_type | revision | X |X | ++-------------------+-----------------------------------+-----------------+----------------+ +| name | release or tag name (mandatory) | softwareVersion | X | ++-------------------+-----------------------------------+-----------------+----------------+ +| message | message associated with release | releaseNotes | X | ++-------------------+-----------------------------------+-----------------+----------------+ +| date | release date = publication date | datePublished | deposit_date | ++-------------------+-----------------------------------+-----------------+----------------+ +| author | deposit client | author | client | ++-------------------+-----------------------------------+-----------------+----------------+ + + +..
code-block:: json + + { + "release": { + "author": { + "email": "hal@ccsd.cnrs.fr", + "fullname": "HAL ", + "name": "HAL" + }, + "author_url": "/api/1/person/x/", + "date": "2019-05-27T16:28:33+02:00", + "id": "a9f3396f372ed4a51d75e15ca16c1c2df1fc5c97", + "message": "AffectationRO Version 1.1 - added new feature\n", + "name": "1.1", + "synthetic": true, + "target": "396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52", + "target_type": "revision", + "target_url": "/api/1/revision/396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52/" + } + } + + +Revision artifact +~~~~~~~~~~~~~~~~~ + +The metadata sent with the deposit is stored outside the revision, +and does not affect the hash computation. +It contains the same fields as any revision object; in particular: + ++-------------------+-----------------------------------------+ +| SWH revision field| Description | ++===================+=========================================+ +| message | synthetic message, containing the name | +| | of the deposit client and an internal | +| | identifier of the deposit. For example: | +| | ``hal: Deposit 817 in collection hal`` | ++-------------------+-----------------------------------------+ +| author | synthetic author (SWH itself, for now) | ++-------------------+-----------------------------------------+ +| committer | same as the author (for now) | ++-------------------+-----------------------------------------+ +| date | see below | ++-------------------+-----------------------------------------+ +| committer_date | see below | ++-------------------+-----------------------------------------+ + +The date mapping +^^^^^^^^^^^^^^^^ + +A deposit may contain 4 different dates concerning the software artifacts. + +The deposit's revision will reflect the most accurate point in time available. 
+Here are all dates that can be available in a deposit: + ++----------------+---------------------------------+------------------------------------------------+ +| dates | location | Description | ++================+=================================+================================================+ +| reception_date | On SWORD reception (automatic) | the deposit was received at this ts | ++----------------+---------------------------------+------------------------------------------------+ +| complete_date | On SWH ingestion (automatic) | the ingestion was completed by SWH at this ts | ++----------------+---------------------------------+------------------------------------------------+ +| dateCreated | metadata in codeMeta (optional) | the software artifact was created at this ts | ++----------------+---------------------------------+------------------------------------------------+ +| datePublished | metadata in codeMeta (optional) | the software was published (contributed in HAL)| ++----------------+---------------------------------+------------------------------------------------+ + +A visit targeting a snapshot contains one date: + ++-------------------+----------------------------------------------+----------------+ +| SWH visit field | Description | value | ++===================+==============================================+================+ +| date | the origin pushed the deposit at this date | reception_date | ++-------------------+----------------------------------------------+----------------+ + +A revision contains two dates: + ++-------------------+-----------------------------------------+----------------+----------------+ +| SWH revision field| Description | CodeMeta term | Fallback value | ++===================+=========================================+================+================+ +| date | date of software artifact modification | dateCreated | reception_date | 
++-------------------+-----------------------------------------+----------------+----------------+ +| committer_date | date of the commit in VCS | datePublished | reception_date | ++-------------------+-----------------------------------------+----------------+----------------+ + + +A release contains one date: + ++-------------------+----------------------------------+----------------+-----------------+ +| SWH release field |Description | CodeMeta term | Fallback value | ++===================+==================================+================+=================+ +| date |release date = publication date | datePublished | reception_date | ++-------------------+----------------------------------+----------------+-----------------+ + + +.. code-block:: json + + { + "revision": { + "author": { + "email": "robot@softwareheritage.org", + "fullname": "Software Heritage", + "id": 18233048, + "name": "Software Heritage" + }, + "author_url": "/api/1/person/18233048/", + "committer": { + "email": "robot@softwareheritage.org", + "fullname": "Software Heritage", + "id": 18233048, + "name": "Software Heritage" + }, + "committer_date": "2019-05-27T16:28:33+02:00", + "committer_url": "/api/1/person/18233048/", + "date": "2012-01-01T00:00:00+00:00", + "directory": "fb13b51abbcfd13de85d9ba8d070a23679576cd7", + "directory_url": "/api/1/directory/fb13b51abbcfd13de85d9ba8d070a23679576cd7/", + "history_url": "/api/1/revision/396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52/log/", + "id": "396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52", + "merge": false, + "message": "hal: Deposit 282 in collection hal", + "metadata": { + "@xmlns": "http://www.w3.org/2005/Atom", + "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", + "author": { + "email": "hal@ccsd.cnrs.fr", + "name": "HAL" + }, + "client": "hal", + "codemeta:applicationCategory": "info", + "codemeta:author": { + "codemeta:name": "Morane Gruenpeter" + }, + "codemeta:codeRepository": "www.code-repository.com", + "codemeta:contributor": 
"Morane Gruenpeter", + "codemeta:dateCreated": "2012", + "codemeta:datePublished": "2019-05-27T16:28:33+02:00", + "codemeta:description": "description\\_en test v2", + "codemeta:developmentStatus": "Inactif", + "codemeta:keywords": "mot_cle_en,mot_cle_2_en,mot_cle_fr", + "codemeta:license": [ + { + "codemeta:name": "MIT License" + }, + { + "codemeta:name": "CeCILL Free Software License Agreement v1.1" + } + ], + "codemeta:name": "Test\\_20190527\\_01", + "codemeta:operatingSystem": "OS", + "codemeta:programmingLanguage": "Java", + "codemeta:referencePublication": null, + "codemeta:relatedLink": null, + "codemeta:releaseNotes": "releaseNote", + "codemeta:runtimePlatform": "outil", + "codemeta:softwareVersion": "1.0.1", + "codemeta:url": "https://hal.archives-ouvertes.fr/hal-02140606", + "codemeta:version": "2", + "external_identifier": "hal-02140606", + "id": "hal-02140606", + "original_artifact": [ + { + "archive_type": "zip", + "blake2s256": "96be3ddedfcee9669ad9c42b0bb3a706daf23824d04311c63505a4d8db02df00", + "length": 193072, + "name": "archive.zip", + "sha1": "5b6ecc9d5bb113ff69fc275dcc9b0d993a8194f1", + "sha1_git": "bd10e4d3ede17162692d7e211e08e87e67994488", + "sha256": "3e2ce93384251ce6d6da7b8f2a061a8ebdaf8a28b8d8513223ca79ded8a10948" + } + ] + }, + "parents": [ + { + "id": "a9fdc3937d2b704b915852a64de2ab1b4b481003", + "url": "/api/1/revision/a9fdc3937d2b704b915852a64de2ab1b4b481003/" + } + ], + "synthetic": true, + "type": "tar", + "url": "/api/1/revision/396b1ff29f7c75a0a3cc36f30e24ff7bae70bb52/" + } + } + +Directory artifact +~~~~~~~~~~~~~~~~~~ + +The directory artifact is the archive(s)' raw content deposited. + +.. 
code-block:: json + + { + "directory": [ + { + "dir_id": "fb13b51abbcfd13de85d9ba8d070a23679576cd7", + "length": null, + "name": "AffectationRO", + "perms": 16384, + "target": "fbc418f9ac2c39e8566b04da5dc24b14e65b23b1", + "target_url": "/api/1/directory/fbc418f9ac2c39e8566b04da5dc24b14e65b23b1/", + "type": "dir" + } + ] + } + + +Questions raised concerning loading +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- A deposit has one origin, yet an origin can have multiple deposits? + +No, an origin can have multiple requests for the same deposit. Which +should end up in one single deposit (when the client pushes its final +request saying deposit 'done' through the header In-Progress). + +Only update of existing 'partial' deposit is permitted. Other than that, +the deposit 'update' operation. + +To create a new version of a software (already deposited), the client +must prior to this create a new deposit. + +Illustration First deposit loading: + +HAL's deposit 01535619 = SWH's deposit **01535619-1** + +:: + + + 1 origin with url:https://hal.inria.fr/medihal-01535619 + + + 1 synthetic revision + + + 1 directory + +HAL's update on deposit 01535619 = SWH's deposit **01535619-2** + +(\*with HAL updates can only be on the metadata and a new version is +required if the content changes) + +:: + + + 1 origin with url:https://hal.inria.fr/medihal-01535619 + + + new synthetic revision (with new metadata) + + + same directory + +HAL's deposit 01535619-v2 = SWH's deposit **01535619-v2-1** + +:: + + + same origin + + + new revision + + + new directory + + +Scheduling loading +~~~~~~~~~~~~~~~~~~ + +All ``archive`` and ``metadata`` deposit requests should be aggregated before +loading. + +The loading should be scheduled via the scheduler's api. + +Only ``deposited`` deposit are concerned by the loading. 
+ +When the loading is done and successful, the deposit entry is updated: + + - ``status`` is updated to ``done`` + - ``swh-id`` is populated with the resulting :ref:`SWHID + ` + - ``complete_date`` is updated to the loading's finished time + +When the loading has failed, the deposit entry is updated: + - ``status`` is updated to ``failed`` + - ``swh-id`` and ``complete_date`` remain as is + +*Note:* As a further improvement, we may prefer having a retry policy with +graceful delays for further scheduling. + +Metadata loading +~~~~~~~~~~~~~~~~ + +- the metadata received with the deposit are kept in a dedicated table + ``raw_extrinsic_metadata``, distinct from the ``revision`` and ``origin`` + tables. + +- ``authority`` is computed from the deposit client information, and ``fetcher`` + is the deposit loader. + diff --git a/docs/specs/spec-meta-deposit.rst b/docs/specs/spec-meta-deposit.rst new file mode 100644 index 00000000..6ef3c30a --- /dev/null +++ b/docs/specs/spec-meta-deposit.rst @@ -0,0 +1,118 @@ +The metadata-deposit +==================== + +Goal +---- +A client wishes to deposit only metadata about an origin or object in the +Software Heritage archive. + +The metadata-deposit is a special deposit where no content is +provided and the data transferred to Software Heritage is only +the metadata about an object in the archive. + +Requirements +------------ +The scope of the metadata-deposit is different from that of the +sparse-deposit. While a sparse-deposit creates a revision with referenced +directories and content files, the metadata-deposit references any of the +following: + +- origin +- snapshot +- release +- revision +- directory +- content + + +A complete metadata example +--------------------------- +The reference element is included in the metadata xml atomEntry under the +swh namespace: + +TODO: publish schema at https://www.softwareheritage.org/schema/2018/deposit + +..
code:: xml + + + + + HAL + hal@ccsd.cnrs.fr + + hal + hal-01243573 + The assignment problem + https://hal.archives-ouvertes.fr/hal-01243573 + other identifier, DOI, ARK + Domain + description + + author1 + Inria + UPMC + + + author2 + Inria + UPMC + + + + + + + + +References +^^^^^^^^^^ + +Origins +======= + +The metadata may be on an origin, identified by the origin's URL: + +.. code:: xml + + + + + + + +Graph objects +============= + +It may also reference an object in the `SWH graph `: contents, +directories, revisions, releases, and snapshots: + +.. code:: xml + + + + + + + +The value of the ``swhid`` attribute must be a `SWHID `, +with any context qualifiers in this list: + +* ``origin`` +* ``visit`` +* ``anchor`` +* ``path`` + +and they should be provided whenever relevant, especially ``origin``. + +Other qualifiers are not allowed (for example, ``line`` isn't because SWH +cannot store metadata at a finer level than entire contents). + + +Loading procedure +------------------ + +In this case, the metadata-deposit will be injected as a metadata entry of +the relevant object, with the information about the contributor of the deposit. +Contrary to the complete and sparse deposit, there will be no object creation. diff --git a/docs/specs/spec-sparse-deposit.rst b/docs/specs/spec-sparse-deposit.rst new file mode 100644 index 00000000..461694fa --- /dev/null +++ b/docs/specs/spec-sparse-deposit.rst @@ -0,0 +1,102 @@ +The sparse-deposit +================== + +Goal +---- +A client wishes to transfer a tarball for which part of the content is +already in the SWH archive. + +Requirements +------------ +To do so, a list of paths with targets must be provided in the metadata and +the paths to the missing directories/content should not be included +in the tarball. The list will be referred to +as the manifest list using the entry name 'bindings' in the metadata. 
+ ++----------------------+-------------------------------------+ +| path | swh-id | ++======================+=====================================+ +| path/to/file.txt | swh:1:cnt:aaaaaaaaaaaaaaaaaaaaa... | ++----------------------+-------------------------------------+ +| path/to/dir/ | swh:1:dir:aaaaaaaaaaaaaaaaaaaaa... | ++----------------------+-------------------------------------+ + +Note: the *name* of the file or the directory is given by the path and is not +part of the identified object. + +TODO: see if a trailing "/" is mandatory for implementation. + +A concrete example +------------------ +The manifest list is included in the metadata xml atomEntry under the +swh namespace: + +TODO: publish schema at https://www.softwareheritage.org/schema/2018/deposit + +.. code:: xml + + + + + HALit mandatory to have a trailing "/", + hal@ccsd.cnrs.fr + + hal + hal-01243573 + The assignment problem + https://hal.archives-ouvertes.fr/hal-01243573 + other identifier, DOI, ARK + Domain + description + + author1 + Inria + UPMC + + + author2 + Inria + UPMC + + + + + + + + + + + + +Deposit verification +-------------------- + +After checking the integrity of the deposit content and +metadata, the following checks should be added: + +1. validate the manifest list structure with a correct swh-id for each path (syntax check on the swh-id format) +2. verify that the path name corresponds to the object type +3. locate the identifiers in the SWH archive + +Each failing check should return a different error with the deposit +and result in a 'rejected' deposit. 
+ +Loading procedure +------------------ +The injection procedure should include: + +- load the tarball new data +- create new objects using the path name and create links from the path to the + SWH object using the identifier +- calculate identifier of the new objects at each level +- return final swh-id of the new revision + +Invariant: the same content should yield the same swh-id, +that's why a complete deposit with all the content and +a sparse-deposit with the correct links will result +in the same root directory swh-id. +The same is expected with the revision swh-id if the metadata provided is +identical. diff --git a/docs/specs/spec-technical.rst b/docs/specs/spec-technical.rst new file mode 100644 index 00000000..d1111b8a --- /dev/null +++ b/docs/specs/spec-technical.rst @@ -0,0 +1,100 @@ +Technical specifications +======================== + +Requirements +~~~~~~~~~~~~ + +* one dedicated database to store the deposit's state - swh-deposit +* one dedicated temporary storage to store archives before loading +* one client to test the communication with SWORD protocol + +Deposit reception schema +~~~~~~~~~~~~~~~~~~~~~~~~ + +* SWORD imposes the use of basic authentication, so we need a way to + authenticate client. 
Also, a client can access collections: + + **deposit\_client** table: + + - id (bigint): Client's identifier + - username (str): Client's username + - password (pass): Client's encrypted password + - collections ([id]): List of collections the client can access + +* Collections group deposits together: + + **deposit\_collection** table: + + - id (bigint): Collection's identifier + - name (str): Collection's human readable name + +* A deposit is the main object the repository is all about: + + **deposit** table: + + - id (bigint): deposit's identifier + - reception\_date (date): First deposit's reception date + - complete\_date (date): Date when the deposit is deemed complete and ready + for loading + - collection (id): The collection the deposit belongs to + - external id (text): client's internal identifier (e.g. hal's id, etc...). + - client\_id (id) : Client which did the deposit + - swh\_id (str) : swh identifier result once the loading is complete + - status (enum): The deposit's current status + +- As mentioned, a deposit can have a status, whose possible values are: + + .. 
code:: text + + 'partial', -- the deposit is new or partially received since it + -- can be done in multiple requests + 'expired', -- deposit has been there too long and is now deemed + -- ready to be garbage collected + 'deposited' -- deposit complete, it is ready to be checked to ensure data consistency + 'verified', -- deposit is fully received, checked, and ready for loading + 'loading', -- loading is ongoing on swh's side + 'done', -- loading is successful + 'failed' -- loading is a failure + +* A deposit is stateful and can be made in multiple requests: + + **deposit\_request** table: + + - id (bigint): identifier + - type (id): deposit request's type (possible values: 'archive', 'metadata') + - deposit\_id (id): deposit whose request belongs to + - metadata: metadata associated to the request + - date (date): date of the requests + + Information sent along a request are stored in a ``deposit_request`` row. + + They can be either of type ``metadata`` (atom entry, multipart's atom entry + part) or of type ``archive`` (binary upload, multipart's binary upload part). + + When the deposit is complete (status ``deposited``), those ``metadata`` and + ``archive`` deposit requests will be read and aggregated. They will then be + sent as parameters to the loading routine. + + During loading, some of those metadata are kept in the ``origin_metadata`` + table and some other are stored in the ``revision`` table (see `metadata + loading <#metadata-loading>`__). + + The only update actions occurring on the deposit table are in regards of: + + - status changes (see figure below): + + - ``partial`` -> {``expired``/``deposited``}, + - ``deposited`` -> {``rejected``/``verified``}, + - ``verified`` -> ``loading`` + - ``loading`` -> {``done``/``failed``} + + - ``complete_date`` when the deposit is + finalized (when the status is changed to ``deposited``) + - ``swh-id`` is populated once we have the loading result + +.. raw:: html + + + +.. 
figure:: ../images/status.png + :alt: diff --git a/docs/specs/specs.rst b/docs/specs/specs.rst new file mode 100644 index 00000000..8abdb491 --- /dev/null +++ b/docs/specs/specs.rst @@ -0,0 +1,14 @@ +.. _swh-deposit-specs: + +Blueprint Specifications +========================= + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + blueprint.rst + spec-loading.rst + spec-technical.rst + spec-sparse-deposit.rst + spec-meta-deposit.rst diff --git a/docs/specs/swh.xsd b/docs/specs/swh.xsd new file mode 100644 index 00000000..a082f4d5 --- /dev/null +++ b/docs/specs/swh.xsd @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/sys-info.rst b/docs/sys-info.rst new file mode 100644 index 00000000..12374a6e --- /dev/null +++ b/docs/sys-info.rst @@ -0,0 +1,95 @@ +Deployment of the swh-deposit +============================= + +As usual, the debian packaged is created and uploaded to the swh debian +repository. Once the package is installed, we need to do a few things in +regards to the database. + +Prepare the database setup (existence, connection, etc...). +----------------------------------------------------------- + +This is defined through the packaged ``swh.deposit.settings.production`` +module and the expected **/etc/softwareheritage/deposit/server.yml**. + +As usual, the expected configuration files are deployed through our +puppet manifest (cf. puppet-environment/swh-site, +puppet-environment/swh-role, puppet-environment/swh-profile) + +Environment (production) +------------------------ + +`SWH_CONFIG_FILENAME` must be defined and target the deposit's server +configuration file. So either prefix the following commands or export the +environment variable in your shell session. + +.. code:: shell + + export SWH_CONFIG_FILENAME=/etc/softwareheritage/deposit/server.yml + +Migrate/bootstrap the db schema +------------------------------- + +.. 
code:: shell + + sudo django-admin migrate --settings=swh.deposit.settings.production + +Load minimum defaults data +-------------------------- + +.. code:: shell + + sudo django-admin loaddata \ + --settings=swh.deposit.settings.production deposit_data + +This adds the minimal 'hal' collection + +Note: swh.deposit.fixtures.deposit\_data is packaged + +Add client and collection +------------------------- + +.. code:: shell + + swh deposit admin \ + --config-file /etc/softwareheritage/deposit/server.yml \ + --platform production \ + user create \ + --collection \ + --username \ + --password + +This adds a user ```` which can access the collection +````. The password will be used for the authentication +access to the deposit api. + +Note: + - If the collection does not exist, it is created alongside + - The password is plain text but stored encrypted (so yes, for now + we know the user's password) + - For production platform, you must either set an `SWH_CONFIG_FILENAME` + environment variable or pass alongside the `--config-file` parameter + +Reschedule a deposit +--------------------- + +.. code:: shell + + swh deposit admin \ + --config-file /etc/softwareheritage/deposit/server.yml \ + --platform production \ + deposit reschedule \ + --deposit-id + +This will: + +- check the deposit's status to something reasonable (failed or done). That + means that the checks have passed alright but something went wrong during the + loading (failed: loading failed, done: loading ok, still for some reasons as + in bugs, we need to reschedule it) +- reset the deposit's status to 'verified' (prior to any loading but after the + checks which are fine) and removes the different archives' identifiers + (swh-id, ...) 
+- trigger back the loading task through the scheduler + + + diff --git a/docs/tests/tests_HAL.rst b/docs/tests/tests_HAL.rst new file mode 100644 index 00000000..0a1eeb4c --- /dev/null +++ b/docs/tests/tests_HAL.rst @@ -0,0 +1,67 @@ +Tests scenarios for client +========================== + +Scenarios for HAL- on HAL's platform +------------------------------------ + +The same procedure is used for all tests: + +Software Author: + +#. prepare content +#. fill out form +#. submit + +HAL moderator: + +#. review content submitted +#. check metadata fields on HAL +#. validate submission + +SWH side: + +1. check content in SWH: + + - directory was created + - revision was created + - release was created when releaseNotes and softwareVersion was included (new feature!) + - origin corresponds to HAL url + +2. check metadata fields on SWH (in revision) +3. check directory +4. check swh-id on HAL +5. check browsability when entering SWH artifact from HAL +6. check vault artifact recreation +7. access deposit's origin from SWH + ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| scenario | test case | data | result | exceptions or specific checks | ++=============+===========================================+==========+=========+=========================================+ +| submit code | content: .tar.gz | .zip | success | | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| submit code | content: .zip | .tar.gz | success | | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| submit code | content: no content | empty | fail | blocked on HAL | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| submit code | content: double compression (.zip in .zip)| .zip x 2 | fail | status 
`failed` on SWH | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| submit code | all metadata-single entry | metadata | success | check that all metadata is transmitted | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| submit code | multiple entries | metadata | success | languages / authors / descriptions | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| new version | new content- same metadata | content | success | check new swh-id in SWH and HAL | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| new version | same content- new metadata | metadata | ? | dead angle- doesn't arrives to SWH | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| new version | new content-new metadata | C & M | success | check artifacts history in revisions | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ +| submit code | deposit on another hal platform | C & M | success | | ++-------------+-------------------------------------------+----------+---------+-----------------------------------------+ + +Past known bugs: + +- v2 problem, where swh-id from first version is kept in the second version + instead of the new swh-id. +- when deposit workers are down- error 500 is returned on HAL without real + explanation (because there is no error on SWH- deposit status + stays `deposited`). 
diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..b450c720 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,43 @@ +[mypy] +namespace_packages = True +warn_unused_ignores = True + + +# support for django magic: https://github.com/typeddjango/django-stubs +plugins = mypy_django_plugin.main + +[mypy.plugins.django-stubs] +django_settings_module = swh.deposit.settings.testing + + +# 3rd party libraries without stubs (yet) + +[mypy-celery.*] +ignore_missing_imports = True + +[mypy-iso8601.*] +ignore_missing_imports = True + +[mypy-pkg_resources.*] +ignore_missing_imports = True + +[mypy-psycopg2.*] +ignore_missing_imports = True + +[mypy-pytest.*] +ignore_missing_imports = True + +[mypy-tenacity.*] +ignore_missing_imports = True + +[mypy-rest_framework.*] +ignore_missing_imports = True + +[mypy-xmltodict.*] +ignore_missing_imports = True + +[mypy-swh.loader.tar.*] +ignore_missing_imports = True + +[mypy-swh.storage.*] +ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index b5413f6c..69b8f4dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,11 @@ [tool.black] target-version = ['py37'] + +[tool.isort] +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true +line_length = 88 +force_sort_within_sections = true diff --git a/pytest.ini b/pytest.ini index bfe57267..020ea949 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,8 +1,9 @@ [pytest] -addopts = -p no:flask -norecursedirs = docs +# Remove the pytest_swh_* entries when they stop getting imported automatically +addopts = -p no:flask -p no:pytest_swh_scheduler -p no:pytest_swh_storage +norecursedirs = docs .* DJANGO_SETTINGS_MODULE = swh.deposit.settings.testing markers = db: execute tests using a postgresql database fs: execute tests using the filesystem diff --git a/requirements-server.txt b/requirements-server.txt index d2631e2c..5a906819 100644 --- a/requirements-server.txt +++ 
b/requirements-server.txt @@ -1,2 +1,3 @@ Django < 3 djangorestframework +setuptools diff --git a/requirements-swh-server.txt b/requirements-swh-server.txt index 86a85993..5e81fabe 100644 --- a/requirements-swh-server.txt +++ b/requirements-swh-server.txt @@ -1,4 +1,4 @@ swh.core[http] swh.loader.core >= 0.0.71 swh.scheduler >= 0.0.39 -swh.model >= 0.1.0 +swh.model >= 0.3.8 diff --git a/requirements-swh.txt b/requirements-swh.txt index c1af7e51..9bc67248 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1 +1 @@ -swh.core >= 0.0.75 +swh.core[http] >= 0.3 diff --git a/resources/deposit/server.yml b/resources/deposit/server.yml new file mode 100644 index 00000000..b7dbba1f --- /dev/null +++ b/resources/deposit/server.yml @@ -0,0 +1,2 @@ +# 200 Mib max size +max_upload_size: 209715200 diff --git a/setup.py b/setup.py index 7926e240..566feef9 100755 --- a/setup.py +++ b/setup.py @@ -1,79 +1,79 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from setuptools import setup, find_packages - -from os import path from io import open +from os import path + +from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(*names): requirements = [] for name in names: if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.deposit", description="Software Heritage Deposit Server", long_description=long_description, 
long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/source/swh-deposit/", packages=find_packages(), install_requires=parse_requirements(None, "swh"), tests_require=parse_requirements("test"), - setup_requires=["vcversioner"], + setup_requires=["setuptools-scm"], + use_scm_version=True, extras_require={ "testing": parse_requirements("test", "server", "swh-server"), "server": parse_requirements("server", "swh-server"), }, - vcversioner={}, include_package_data=True, entry_points=""" [console_scripts] swh-deposit=swh.deposit.cli:main [swh.cli.subcommands] - deposit=swh.deposit.cli:deposit + deposit=swh.deposit.cli [swh.workers] deposit.worker=swh.deposit.loader:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-deposit", "Documentation": "https://docs.softwareheritage.org/devel/swh-deposit/", }, ) diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO index 0e524373..5b467b4c 100644 --- a/swh.deposit.egg-info/PKG-INFO +++ b/swh.deposit.egg-info/PKG-INFO @@ -1,37 +1,37 @@ Metadata-Version: 2.1 Name: swh.deposit -Version: 0.0.90 +Version: 0.1.0 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, 
https://forge.softwareheritage.org/source/swh-deposit Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-deposit/ Description: # swh-deposit This is [Software Heritage](https://www.softwareheritage.org)'s [SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server implementation, as well as a simple client to upload deposits on the server. **S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository **D**eposit) is an interoperability standard for digital file deposit. This implementation will permit interaction between a client (a repository) and a server (SWH repository) to permit deposits of software source code archives and associated metadata. The documentation is at ./docs/README-specification.md Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: server diff --git a/swh.deposit.egg-info/SOURCES.txt b/swh.deposit.egg-info/SOURCES.txt index 6d6b7ec9..063a2c97 100644 --- a/swh.deposit.egg-info/SOURCES.txt +++ b/swh.deposit.egg-info/SOURCES.txt @@ -1,177 +1,215 @@ +.gitignore +.pre-commit-config.yaml +AUTHORS +CODE_OF_CONDUCT.md +CONTRIBUTORS +LICENSE MANIFEST.in Makefile +Makefile.local README.md +conftest.py +mypy.ini pyproject.toml pytest.ini requirements-server.txt requirements-swh-server.txt requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini -version.txt +bin/Makefile +bin/content.sh +bin/create_deposit.sh +bin/create_deposit_atom.sh +bin/create_deposit_with_metadata.sh +bin/default-setup +bin/download-deposit-archive.sh +bin/home.sh +bin/replace-deposit-archive.sh +bin/service-document.sh +bin/status.sh 
+bin/update-deposit-with-another-archive.sh +bin/update-status.sh +docs/.gitignore +docs/Makefile +docs/conf.py +docs/dev-info.rst +docs/getting-started.rst +docs/index.rst +docs/metadata.rst +docs/spec-api.rst +docs/sys-info.rst +docs/_static/.placeholder +docs/_templates/.placeholder +docs/endpoints/collection.rst +docs/endpoints/content.rst +docs/endpoints/service-document.rst +docs/endpoints/status.rst +docs/endpoints/update-media.rst +docs/endpoints/update-metadata.rst +docs/images/deposit-create-chart.png +docs/images/deposit-delete-chart.png +docs/images/deposit-update-chart.png +docs/images/status.png +docs/specs/blueprint.rst +docs/specs/metadata_example.xml +docs/specs/spec-loading.rst +docs/specs/spec-meta-deposit.rst +docs/specs/spec-sparse-deposit.rst +docs/specs/spec-technical.rst +docs/specs/specs.rst +docs/specs/swh.xsd +docs/tests/tests_HAL.rst +resources/deposit/server.yml swh/__init__.py swh.deposit.egg-info/PKG-INFO swh.deposit.egg-info/SOURCES.txt swh.deposit.egg-info/dependency_links.txt swh.deposit.egg-info/entry_points.txt swh.deposit.egg-info/requires.txt swh.deposit.egg-info/top_level.txt swh/deposit/__init__.py swh/deposit/apps.py swh/deposit/auth.py swh/deposit/client.py swh/deposit/config.py swh/deposit/errors.py swh/deposit/exception.py swh/deposit/gunicorn_config.py swh/deposit/manage.py swh/deposit/models.py swh/deposit/parsers.py swh/deposit/py.typed swh/deposit/urls.py swh/deposit/utils.py swh/deposit/api/__init__.py swh/deposit/api/common.py swh/deposit/api/converters.py swh/deposit/api/deposit.py swh/deposit/api/deposit_content.py swh/deposit/api/deposit_status.py swh/deposit/api/deposit_update.py swh/deposit/api/service_document.py swh/deposit/api/urls.py swh/deposit/api/private/__init__.py swh/deposit/api/private/deposit_check.py swh/deposit/api/private/deposit_list.py swh/deposit/api/private/deposit_read.py swh/deposit/api/private/deposit_update_status.py swh/deposit/api/private/urls.py swh/deposit/cli/__init__.py 
swh/deposit/cli/admin.py swh/deposit/cli/client.py swh/deposit/fixtures/__init__.py swh/deposit/fixtures/deposit_data.yaml swh/deposit/loader/__init__.py swh/deposit/loader/checker.py swh/deposit/loader/tasks.py swh/deposit/migrations/0001_initial.py swh/deposit/migrations/0002_depositrequest_archive.py swh/deposit/migrations/0003_temporaryarchive.py swh/deposit/migrations/0004_delete_temporaryarchive.py swh/deposit/migrations/0005_auto_20171019_1436.py swh/deposit/migrations/0006_depositclient_url.py swh/deposit/migrations/0007_auto_20171129_1609.py swh/deposit/migrations/0008_auto_20171130_1513.py swh/deposit/migrations/0009_deposit_parent.py swh/deposit/migrations/0010_auto_20180110_0953.py swh/deposit/migrations/0011_auto_20180115_1510.py swh/deposit/migrations/0012_deposit_status_detail.py swh/deposit/migrations/0013_depositrequest_raw_metadata.py swh/deposit/migrations/0014_auto_20180720_1221.py swh/deposit/migrations/0015_depositrequest_typemigration.py swh/deposit/migrations/0016_auto_20190507_1408.py swh/deposit/migrations/0017_auto_20190925_0906.py swh/deposit/migrations/0018_migrate_swhids.py swh/deposit/migrations/0019_auto_20200519_1035.py swh/deposit/migrations/__init__.py swh/deposit/settings/__init__.py swh/deposit/settings/common.py swh/deposit/settings/development.py swh/deposit/settings/production.py swh/deposit/settings/testing.py swh/deposit/static/robots.txt swh/deposit/static/css/bootstrap-responsive.min.css swh/deposit/static/css/style.css swh/deposit/static/img/arrow-up-small.png swh/deposit/static/img/swh-logo-deposit.png swh/deposit/static/img/swh-logo-deposit.svg swh/deposit/static/img/icons/swh-logo-32x32.png swh/deposit/static/img/icons/swh-logo-deposit-180x180.png swh/deposit/static/img/icons/swh-logo-deposit-192x192.png swh/deposit/static/img/icons/swh-logo-deposit-270x270.png swh/deposit/templates/__init__.py swh/deposit/templates/api.html swh/deposit/templates/homepage.html swh/deposit/templates/layout.html 
swh/deposit/templates/deposit/__init__.py swh/deposit/templates/deposit/content.xml swh/deposit/templates/deposit/deposit_receipt.xml swh/deposit/templates/deposit/error.xml swh/deposit/templates/deposit/service_document.xml swh/deposit/templates/deposit/status.xml swh/deposit/templates/rest_framework/api.html swh/deposit/tests/__init__.py swh/deposit/tests/common.py swh/deposit/tests/conftest.py swh/deposit/tests/test_common.py swh/deposit/tests/test_gunicorn_config.py +swh/deposit/tests/test_init.py swh/deposit/tests/test_utils.py swh/deposit/tests/api/__init__.py swh/deposit/tests/api/conftest.py swh/deposit/tests/api/test_converters.py swh/deposit/tests/api/test_deposit.py swh/deposit/tests/api/test_deposit_atom.py swh/deposit/tests/api/test_deposit_binary.py swh/deposit/tests/api/test_deposit_delete.py swh/deposit/tests/api/test_deposit_list.py swh/deposit/tests/api/test_deposit_multipart.py swh/deposit/tests/api/test_deposit_private_check.py swh/deposit/tests/api/test_deposit_private_read_archive.py swh/deposit/tests/api/test_deposit_private_read_metadata.py swh/deposit/tests/api/test_deposit_private_update_status.py swh/deposit/tests/api/test_deposit_schedule.py swh/deposit/tests/api/test_deposit_status.py swh/deposit/tests/api/test_deposit_update.py swh/deposit/tests/api/test_exception.py swh/deposit/tests/api/test_parser.py swh/deposit/tests/api/test_service_document.py -swh/deposit/tests/api/data/atom/codemeta-sample.xml -swh/deposit/tests/api/data/atom/entry-data-badly-formatted.xml -swh/deposit/tests/api/data/atom/entry-data-deposit-binary.xml -swh/deposit/tests/api/data/atom/entry-data-empty-body.xml -swh/deposit/tests/api/data/atom/entry-data-ko.xml -swh/deposit/tests/api/data/atom/entry-data-minimal.xml -swh/deposit/tests/api/data/atom/entry-data-parsing-error-prone.xml -swh/deposit/tests/api/data/atom/entry-data0.xml -swh/deposit/tests/api/data/atom/entry-data1.xml -swh/deposit/tests/api/data/atom/entry-data2.xml 
-swh/deposit/tests/api/data/atom/entry-data3.xml -swh/deposit/tests/api/data/atom/entry-update-in-place.xml -swh/deposit/tests/api/data/atom/error-with-decimal.xml -swh/deposit/tests/api/data/atom/metadata.xml -swh/deposit/tests/api/data/atom/tei-sample.xml swh/deposit/tests/cli/__init__.py swh/deposit/tests/cli/test_client.py -swh/deposit/tests/cli/data/atom/codemeta-sample.xml -swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml -swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml -swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml -swh/deposit/tests/cli/data/atom/entry-data-ko.xml -swh/deposit/tests/cli/data/atom/entry-data-minimal.xml -swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml -swh/deposit/tests/cli/data/atom/entry-data0.xml -swh/deposit/tests/cli/data/atom/entry-data1.xml -swh/deposit/tests/cli/data/atom/entry-data2.xml -swh/deposit/tests/cli/data/atom/entry-data3.xml -swh/deposit/tests/cli/data/atom/entry-update-in-place.xml -swh/deposit/tests/cli/data/atom/error-with-decimal.xml -swh/deposit/tests/cli/data/atom/metadata.xml -swh/deposit/tests/cli/data/atom/tei-sample.xml +swh/deposit/tests/data/atom/codemeta-sample.xml +swh/deposit/tests/data/atom/entry-data-badly-formatted.xml +swh/deposit/tests/data/atom/entry-data-deposit-binary.xml +swh/deposit/tests/data/atom/entry-data-empty-body.xml +swh/deposit/tests/data/atom/entry-data-ko.xml +swh/deposit/tests/data/atom/entry-data-minimal.xml +swh/deposit/tests/data/atom/entry-data-parsing-error-prone.xml +swh/deposit/tests/data/atom/entry-data0.xml +swh/deposit/tests/data/atom/entry-data1.xml +swh/deposit/tests/data/atom/entry-data2.xml +swh/deposit/tests/data/atom/entry-data3.xml +swh/deposit/tests/data/atom/entry-update-in-place.xml +swh/deposit/tests/data/atom/error-with-decimal.xml +swh/deposit/tests/data/atom/metadata.xml +swh/deposit/tests/data/atom/tei-sample.xml swh/deposit/tests/loader/__init__.py swh/deposit/tests/loader/common.py 
swh/deposit/tests/loader/conftest.py swh/deposit/tests/loader/test_checker.py swh/deposit/tests/loader/test_client.py swh/deposit/tests/loader/test_tasks.py swh/deposit/tests/loader/data/http_example.org/hello.json swh/deposit/tests/loader/data/http_example.org/hello_you swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_1_check swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_2_check swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_meta swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_raw swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_update swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_check swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_metadata swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_raw \ No newline at end of file diff --git a/swh.deposit.egg-info/entry_points.txt b/swh.deposit.egg-info/entry_points.txt index dbdecaea..d627b0ce 100644 --- a/swh.deposit.egg-info/entry_points.txt +++ b/swh.deposit.egg-info/entry_points.txt @@ -1,8 +1,8 @@ [console_scripts] swh-deposit=swh.deposit.cli:main [swh.cli.subcommands] - deposit=swh.deposit.cli:deposit + deposit=swh.deposit.cli [swh.workers] deposit.worker=swh.deposit.loader:register \ No newline at end of file diff --git a/swh.deposit.egg-info/requires.txt b/swh.deposit.egg-info/requires.txt index afc37349..88063b40 100644 --- a/swh.deposit.egg-info/requires.txt +++ b/swh.deposit.egg-info/requires.txt @@ -1,30 +1,32 @@ vcversioner click xmltodict iso8601 requests -swh.core>=0.0.75 +swh.core[http]>=0.3 [server] Django<3 djangorestframework +setuptools swh.core[http] swh.loader.core>=0.0.71 swh.scheduler>=0.0.39 -swh.model>=0.1.0 +swh.model>=0.3.8 [testing] pytest pytest-django pytest-mock swh.scheduler[testing] swh.loader.core[testing] pytest-postgresql>=2.1.0 requests_mock django-stubs Django<3 
djangorestframework +setuptools swh.core[http] swh.loader.core>=0.0.71 swh.scheduler>=0.0.39 -swh.model>=0.1.0 +swh.model>=0.3.8 diff --git a/swh/deposit/api/__init__.py b/swh/deposit/api/__init__.py index e69de29b..65c78f3d 100644 --- a/swh/deposit/api/__init__.py +++ b/swh/deposit/api/__init__.py @@ -0,0 +1,11 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pkg_resources + +try: + __version__ = pkg_resources.get_distribution("swh.deposit").version +except pkg_resources.DistributionNotFound: + __version__ = "devel" diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py index c8e5248a..6bed49c5 100644 --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -1,962 +1,1033 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from abc import ABCMeta, abstractmethod +import datetime import hashlib +import json +from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union -from typing import Sequence, Type - -from abc import ABCMeta, abstractmethod -from django.urls import reverse -from django.http import HttpResponse +from django.http import FileResponse, HttpResponse from django.shortcuts import render +from django.urls import reverse from django.utils import timezone from rest_framework import status from rest_framework.authentication import BaseAuthentication, BasicAuthentication from rest_framework.permissions import BasePermission, IsAuthenticated +from rest_framework.request import Request from rest_framework.views import APIView from swh.model import hashutil from 
swh.scheduler.utils import create_oneshot_task_dict from ..config import ( - SWHDefaultConfig, + ARCHIVE_KEY, + ARCHIVE_TYPE, + CONT_FILE_IRI, + DEPOSIT_STATUS_DEPOSITED, + DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_PARTIAL, EDIT_SE_IRI, EM_IRI, - CONT_FILE_IRI, - ARCHIVE_KEY, METADATA_KEY, + METADATA_TYPE, RAW_METADATA_KEY, STATE_IRI, - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_PARTIAL, - DEPOSIT_STATUS_LOAD_SUCCESS, - ARCHIVE_TYPE, - METADATA_TYPE, + APIConfig, ) from ..errors import ( - MAX_UPLOAD_SIZE_EXCEEDED, BAD_REQUEST, - ERROR_CONTENT, CHECKSUM_MISMATCH, - make_error_dict, - MEDIATION_NOT_ALLOWED, - make_error_response_from_dict, + ERROR_CONTENT, FORBIDDEN, - NOT_FOUND, - make_error_response, + MAX_UPLOAD_SIZE_EXCEEDED, + MEDIATION_NOT_ALLOWED, METHOD_NOT_ALLOWED, - ParserError, + NOT_FOUND, PARSING_ERROR, + ParserError, + make_error_dict, + make_error_response, + make_error_response_from_dict, ) -from ..models import Deposit, DepositRequest, DepositCollection, DepositClient +from ..models import Deposit, DepositClient, DepositCollection, DepositRequest from ..parsers import parse_xml - ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] -class SWHAPIView(APIView): +class AuthenticatedAPIView(APIView): """Mixin intended as a based API view to enforce the basic authentication check """ authentication_classes: Sequence[Type[BaseAuthentication]] = (BasicAuthentication,) permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,) -class SWHBaseDeposit(SWHDefaultConfig, SWHAPIView, metaclass=ABCMeta): +class APIBase(APIConfig, AuthenticatedAPIView, metaclass=ABCMeta): """Base deposit request class sharing multiple common behaviors. """ - def _read_headers(self, request): + def _read_headers(self, request: Request) -> Dict[str, Any]: """Read and unify the necessary headers from the request (those are not stored in the same location or not properly formatted). 
Args: request (Request): Input request Returns: Dictionary with the following keys (some associated values may be None): - content-type - content-length - in-progress - content-disposition - packaging - slug - on-behalf-of """ meta = request._request.META content_type = request.content_type content_length = meta.get("CONTENT_LENGTH") if content_length and isinstance(content_length, str): content_length = int(content_length) # final deposit if not provided in_progress = meta.get("HTTP_IN_PROGRESS", False) content_disposition = meta.get("HTTP_CONTENT_DISPOSITION") if isinstance(in_progress, str): in_progress = in_progress.lower() == "true" content_md5sum = meta.get("HTTP_CONTENT_MD5") if content_md5sum: content_md5sum = bytes.fromhex(content_md5sum) packaging = meta.get("HTTP_PACKAGING") slug = meta.get("HTTP_SLUG") on_behalf_of = meta.get("HTTP_ON_BEHALF_OF") metadata_relevant = meta.get("HTTP_METADATA_RELEVANT") return { "content-type": content_type, "content-length": content_length, "in-progress": in_progress, "content-disposition": content_disposition, "content-md5sum": content_md5sum, "packaging": packaging, "slug": slug, "on-behalf-of": on_behalf_of, "metadata-relevant": metadata_relevant, } - def _compute_md5(self, filehandler): + def _compute_md5(self, filehandler) -> bytes: """Compute uploaded file's md5 sum. Args: filehandler (InMemoryUploadedFile): the file to compute the md5 hash Returns: the md5 checksum (str) """ h = hashlib.md5() for chunk in filehandler: h.update(chunk) return h.digest() def _deposit_put( - self, request, deposit_id=None, in_progress=False, external_id=None - ): + self, + request: Request, + deposit_id: Optional[int] = None, + in_progress: bool = False, + external_id: Optional[str] = None, + ) -> Deposit: """Save/Update a deposit in db. 
Args: - deposit_id (int): deposit identifier - in_progress (dict): The deposit's status - external_id (str): The external identifier to associate to - the deposit + request: request data + deposit_id: deposit identifier + in_progress: deposit status + external_id: external identifier to associate to the deposit Returns: The Deposit instance saved or updated. """ + complete_date: Optional[datetime.datetime] = None + deposit_parent: Optional[Deposit] = None + if in_progress is False: complete_date = timezone.now() status_type = DEPOSIT_STATUS_DEPOSITED else: - complete_date = None status_type = DEPOSIT_STATUS_PARTIAL if not deposit_id: try: - # find a deposit parent (same external id, status load - # to success) + # find a deposit parent (same external id, status load to success) deposit_parent = ( Deposit.objects.filter( external_id=external_id, status=DEPOSIT_STATUS_LOAD_SUCCESS ) .order_by("-id")[0:1] .get() ) # noqa except Deposit.DoesNotExist: - deposit_parent = None + # then no parent for that deposit, deposit_parent already None + pass + assert external_id is not None deposit = Deposit( collection=self._collection, external_id=external_id, complete_date=complete_date, status=status_type, client=self._client, parent=deposit_parent, ) else: deposit = Deposit.objects.get(pk=deposit_id) # update metadata deposit.complete_date = complete_date deposit.status = status_type if self.config["checks"]: deposit.save() # needed to have a deposit id scheduler = self.scheduler if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id: task = create_oneshot_task_dict( "check-deposit", collection=deposit.collection.name, deposit_id=deposit.id, ) check_task_id = scheduler.create_tasks([task])[0]["id"] deposit.check_task_id = check_task_id deposit.save() return deposit def _deposit_request_put( self, - deposit, - deposit_request_data, - replace_metadata=False, - replace_archives=False, - ): + deposit: Deposit, + deposit_request_data: Dict[str, Any], + 
replace_metadata: bool = False, + replace_archives: bool = False, + ) -> None: """Save a deposit request with metadata attached to a deposit. Args: - deposit (Deposit): The deposit concerned by the request - deposit_request_data (dict): The dictionary with at most 2 deposit - request types (archive, metadata) to associate to the deposit - replace_metadata (bool): Flag defining if we add or update + deposit: The deposit concerned by the request + deposit_request_data: The dictionary with at most 2 deposit + request types (archive, metadata) to associate to the deposit + replace_metadata: Flag defining if we add or update existing metadata to the deposit - replace_archives (bool): Flag defining if we add or update + replace_archives: Flag defining if we add or update archives to existing deposit Returns: None """ if replace_metadata: DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete() if replace_archives: DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() deposit_request = None archive_file = deposit_request_data.get(ARCHIVE_KEY) if archive_file: deposit_request = DepositRequest( type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file ) deposit_request.save() metadata = deposit_request_data.get(METADATA_KEY) if metadata: - raw_metadata = deposit_request_data.get(RAW_METADATA_KEY) + raw_metadata = deposit_request_data[RAW_METADATA_KEY] deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, metadata=metadata, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() assert deposit_request is not None - def _delete_archives(self, collection_name, deposit_id): - """Delete archives reference from the deposit id. + def _delete_archives(self, collection_name: str, deposit_id: int) -> Dict: + """Delete archive references from the deposit id. 
""" try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( - NOT_FOUND, "The deposit %s does not exist" % deposit_id + NOT_FOUND, f"The deposit {deposit_id} does not exist" ) DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete() return {} - def _delete_deposit(self, collection_name, deposit_id): + def _delete_deposit(self, collection_name: str, deposit_id: int) -> Dict: """Delete deposit reference. Args: - collection_name (str): Client's name - deposit_id (id): The deposit to delete + collection_name: Client's collection + deposit_id: The deposit to delete Returns Empty dict when ok. Dict with error key to describe the failure. """ try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( - NOT_FOUND, "The deposit %s does not exist" % deposit_id + NOT_FOUND, f"The deposit {deposit_id} does not exist" ) if deposit.collection.name != collection_name: summary = "Cannot delete a deposit from another collection" description = "Deposit %s does not belong to the collection %s" % ( deposit_id, collection_name, ) return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) DepositRequest.objects.filter(deposit=deposit).delete() deposit.delete() return {} - def _check_preconditions_on(self, filehandler, md5sum, content_length=None): + def _check_preconditions_on( + self, filehandler, md5sum: str, content_length: Optional[int] = None + ) -> Optional[Dict]: """Check preconditions on provided file are respected. That is the length and/or the md5sum hash match the file's content. Args: filehandler (InMemoryUploadedFile): The file to check - md5sum (hex str): md5 hash expected from the file's content - content_length (int): the expected length if provided. + md5sum: md5 hash expected from the file's content + content_length: the expected length if provided. 
Returns: Either none if no error or a dictionary with a key error detailing the problem. """ + max_upload_size = self.config["max_upload_size"] if content_length: - if content_length > self.config["max_upload_size"]: + if content_length > max_upload_size: return make_error_dict( MAX_UPLOAD_SIZE_EXCEEDED, - "Upload size limit exceeded (max %s bytes)." - % self.config["max_upload_size"], - "Please consider sending the archive in " "multiple steps.", + f"Upload size limit exceeded (max {max_upload_size} bytes)." + "Please consider sending the archive in multiple steps.", ) length = filehandler.size if length != content_length: return make_error_dict( status.HTTP_412_PRECONDITION_FAILED, "Wrong length" ) if md5sum: _md5sum = self._compute_md5(filehandler) if _md5sum != md5sum: return make_error_dict( CHECKSUM_MISMATCH, "Wrong md5 hash", - "The checksum sent %s and the actual checksum " - "%s does not match." - % (hashutil.hash_to_hex(md5sum), hashutil.hash_to_hex(_md5sum)), + f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual " + f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.", ) return None def _binary_upload( self, - request, - headers, - collection_name, - deposit_id=None, - replace_metadata=False, - replace_archives=False, - ): + request: Request, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + replace_metadata: bool = False, + replace_archives: bool = False, + ) -> Dict[str, Any]: """Binary upload routine. Other than such a request, a 415 response is returned. Args: request (Request): the request holding information to parse and inject in db headers (dict): request headers formatted collection_name (str): the associated client deposit_id (id): deposit identifier if provided replace_metadata (bool): 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. 
replace_archives (bool): 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 413 (request entity too large) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided """ content_length = headers["content-length"] if not content_length: return make_error_dict( BAD_REQUEST, "CONTENT_LENGTH header is mandatory", - "For archive deposit, the " "CONTENT_LENGTH header must be sent.", + "For archive deposit, the CONTENT_LENGTH header must be sent.", ) content_disposition = headers["content-disposition"] if not content_disposition: return make_error_dict( BAD_REQUEST, "CONTENT_DISPOSITION header is mandatory", - "For archive deposit, the " "CONTENT_DISPOSITION header must be sent.", + "For archive deposit, the CONTENT_DISPOSITION header must be sent.", ) packaging = headers["packaging"] if packaging and packaging not in ACCEPT_PACKAGINGS: return make_error_dict( BAD_REQUEST, - "Only packaging %s is supported" % ACCEPT_PACKAGINGS, - "The packaging provided %s is not supported" % packaging, + f"Only packaging {ACCEPT_PACKAGINGS} is supported", + f"The packaging provided {packaging} is not supported", ) filehandler = request.FILES["file"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"], content_length ) if precondition_status_response: return precondition_status_response external_id = headers["slug"] # actual storage 
of data archive_metadata = filehandler deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {ARCHIVE_KEY: archive_metadata}, replace_metadata=replace_metadata, replace_archives=replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "status": deposit.status, "archive": filehandler.name, } - def _read_metadata(self, metadata_stream): + def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]: """Given a metadata stream, reads the metadata and returns both the parsed and the raw metadata. """ raw_metadata = metadata_stream.read() metadata = parse_xml(raw_metadata) return raw_metadata, metadata def _multipart_upload( self, - request, - headers, - collection_name, - deposit_id=None, - replace_metadata=False, - replace_archives=False, - ): + request: Request, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + replace_metadata: bool = False, + replace_archives: bool = False, + ) -> Dict: """Multipart upload supported with exactly: - 1 archive (zip) - 1 atom entry Other than such a request, a 415 response is returned. Args: request (Request): the request holding information to parse and inject in db - headers (dict): request headers formatted - collection_name (str): the associated client - deposit_id (id): deposit identifier if provided - replace_metadata (bool): 'Update or add' request to existing + headers: request headers formatted + collection_name: the associated client + deposit_id: deposit identifier if provided + replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. - replace_archives (bool): 'Update or add' request to existing + replace_archives: 'Update or add' request to existing deposit. 
If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id (int): Deposit identifier - deposit_date (date): Deposit date - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 413 (request entity too large) if the length of the archive exceeds the max size configured - 415 (unsupported media type) if a wrong media type is provided """ external_id = headers["slug"] content_types_present = set() - data = { + data: Dict[str, Optional[Any]] = { "application/zip": None, # expected either zip "application/x-tar": None, # or x-tar "application/atom+xml": None, } for key, value in request.FILES.items(): fh = value - if fh.content_type in content_types_present: + content_type = fh.content_type + if content_type in content_types_present: return make_error_dict( ERROR_CONTENT, "Only 1 application/zip (or application/x-tar) archive " "and 1 atom+xml entry is supported (as per sword2.0 " "specification)", "You provided more than 1 application/(zip|x-tar) " "or more than 1 application/atom+xml content-disposition " "header in the multipart deposit", ) - content_types_present.add(fh.content_type) - data[fh.content_type] = fh + content_types_present.add(content_type) + assert content_type is not None + data[content_type] = fh if len(content_types_present) != 2: return make_error_dict( ERROR_CONTENT, "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for multipart " "deposit", "You need to provide only 1 application/(zip|x-tar) " "and 1 application/atom+xml content-disposition header " "in the multipart deposit", ) filehandler = data["application/zip"] if not 
filehandler: filehandler = data["application/x-tar"] precondition_status_response = self._check_preconditions_on( filehandler, headers["content-md5sum"] ) if precondition_status_response: return precondition_status_response try: raw_metadata, metadata = self._read_metadata(data["application/atom+xml"]) except ParserError: return make_error_dict( PARSING_ERROR, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) # actual storage of data deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) deposit_request_data = { ARCHIVE_KEY: filehandler, METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( deposit, deposit_request_data, replace_metadata, replace_archives ) + assert filehandler is not None return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": filehandler.name, "status": deposit.status, } def _atom_entry( self, - request, - headers, - collection_name, - deposit_id=None, - replace_metadata=False, - replace_archives=False, - ): + request: Request, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + replace_metadata: bool = False, + replace_archives: bool = False, + ) -> Dict[str, Any]: """Atom entry deposit. Args: request (Request): the request holding information to parse and inject in db - headers (dict): request headers formatted - collection_name (str): the associated client - deposit_id (id): deposit identifier if provided - replace_metadata (bool): 'Update or add' request to existing + headers: request headers formatted + collection_name: the associated client + deposit_id: deposit identifier if provided + replace_metadata: 'Update or add' request to existing deposit. If False (default), this adds new metadata request to existing ones. Otherwise, this will replace existing metadata. 
- replace_archives (bool): 'Update or add' request to existing + replace_archives: 'Update or add' request to existing deposit. If False (default), this adds new archive request to existing ones. Otherwise, this will replace existing archives. ones. Returns: In the optimal case a dict with the following keys: - deposit_id: deposit id associated to the deposit - deposit_date: date of the deposit - archive: None (no archive is provided here) Otherwise, a dictionary with the key error and the associated failures, either: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ try: raw_metadata, metadata = self._read_metadata(request.data) except ParserError: return make_error_dict( BAD_REQUEST, "Malformed xml metadata", "The xml received is malformed. " "Please ensure your metadata file is correctly formatted.", ) if not metadata: return make_error_dict( BAD_REQUEST, "Empty body request is not supported", "Atom entry deposit is supposed to send for metadata. " "If the body is empty, there is no metadata.", ) external_id = metadata.get("external_identifier", headers["slug"]) + # TODO: Determine if we are in the metadata-only deposit case. If it is, then + # save deposit and deposit request typed 'metadata' and send metadata to the + # metadata storage. Otherwise, do as existing deposit. 
+ deposit = self._deposit_put( request, deposit_id=deposit_id, in_progress=headers["in-progress"], external_id=external_id, ) self._deposit_request_put( deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) return { "deposit_id": deposit.id, "deposit_date": deposit.reception_date, "archive": None, "status": deposit.status, } - def _empty_post(self, request, headers, collection_name, deposit_id): + def _empty_post( + self, request: Request, headers: Dict, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Empty post to finalize an empty deposit. Args: - request (Request): the request holding information to parse + request: the request holding information to parse and inject in db - headers (dict): request headers formatted - collection_name (str): the associated client - deposit_id (id): deposit identifier + headers: request headers formatted + collection_name: the associated client + deposit_id: deposit identifier Returns: Dictionary of result with the deposit's id, the date it was completed and no archive. """ deposit = Deposit.objects.get(pk=deposit_id) deposit.complete_date = timezone.now() deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return { "deposit_id": deposit_id, "deposit_date": deposit.complete_date, "status": deposit.status, "archive": None, } - def _make_iris(self, request, collection_name, deposit_id): + def _make_iris( + self, request: Request, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Define the IRI endpoints Args: request (Request): The initial request collection_name (str): client/collection's name deposit_id (id): Deposit identifier Returns: Dictionary of keys with the iris' urls. 
""" args = [collection_name, deposit_id] return { iri: request.build_absolute_uri(reverse(iri, args=args)) for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI] } - def additional_checks(self, request, headers, collection_name, deposit_id=None): + def additional_checks( + self, + request: Request, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Dict[str, Any]: """Permit the child class to enrich additional checks. Returns: dict with 'error' detailing the problem. """ return {} - def checks(self, request, collection_name, deposit_id=None): + def checks( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> Dict[str, Any]: try: self._collection = DepositCollection.objects.get(name=collection_name) except DepositCollection.DoesNotExist: return make_error_dict( - NOT_FOUND, "Unknown collection name %s" % collection_name + NOT_FOUND, f"Unknown collection name {collection_name}" ) + assert self._collection is not None username = request.user.username if username: # unauthenticated request can have the username empty try: - self._client = DepositClient.objects.get(username=username) + self._client: DepositClient = DepositClient.objects.get( # type: ignore + username=username + ) except DepositClient.DoesNotExist: - return make_error_dict(NOT_FOUND, "Unknown client name %s" % username) + return make_error_dict(NOT_FOUND, f"Unknown client name {username}") - if self._collection.id not in self._client.collections: + collection_id = self._collection.id + collections = self._client.collections + assert collections is not None + if collection_id not in collections: return make_error_dict( FORBIDDEN, - "Client %s cannot access collection %s" - % (username, collection_name), + f"Client {username} cannot access collection {collection_name}", ) if deposit_id: try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( - NOT_FOUND, "Deposit with id %s does 
not exist" % deposit_id + NOT_FOUND, f"Deposit with id {deposit_id} does not exist" ) checks = self.restrict_access(request, deposit) if checks: return checks headers = self._read_headers(request) if headers["on-behalf-of"]: return make_error_dict(MEDIATION_NOT_ALLOWED, "Mediation is not supported.") checks = self.additional_checks(request, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} - def restrict_access(self, request, deposit=None): + def restrict_access( + self, request: Request, deposit: Optional[Deposit] = None + ) -> Dict[str, Any]: if deposit: if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL: summary = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) - description = "This deposit has status '%s'" % deposit.status + description = f"This deposit has status '{deposit.status}'" return make_error_dict( BAD_REQUEST, summary=summary, verbose_description=description ) + return {} - def _basic_not_allowed_method(self, request, method): + def _basic_not_allowed_method(self, request: Request, method: str): return make_error_response( request, METHOD_NOT_ALLOWED, - "%s method is not supported on this endpoint" % method, + f"{method} method is not supported on this endpoint", ) - def get(self, request, *args, **kwargs): + def get( + self, request: Request, collection_name: str, deposit_id: int + ) -> Union[HttpResponse, FileResponse]: return self._basic_not_allowed_method(request, "GET") - def post(self, request, *args, **kwargs): + def post( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> HttpResponse: return self._basic_not_allowed_method(request, "POST") - def put(self, request, *args, **kwargs): + def put( + self, request: Request, collection_name: str, deposit_id: int + ) -> HttpResponse: return self._basic_not_allowed_method(request, "PUT") - def delete(self, request, *args, **kwargs): + def delete( + self, request: 
Request, collection_name: str, deposit_id: Optional[int] = None + ) -> HttpResponse: return self._basic_not_allowed_method(request, "DELETE") -class SWHGetDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): +class APIGet(APIBase, metaclass=ABCMeta): """Mixin for class to support GET method. """ - def get(self, request, collection_name, deposit_id, format=None): + def get( + self, request: Request, collection_name: str, deposit_id: int + ) -> Union[HttpResponse, FileResponse]: """Endpoint to create/add resources to deposit. Returns: 200 response when no error during routine occurred 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) r = self.process_get(request, collection_name, deposit_id) - if isinstance(r, tuple): - status, content, content_type = r - return HttpResponse(content, status=status, content_type=content_type) - - return r + status, content, content_type = r + if content_type == "swh/generator": + with content as path: + return FileResponse( + open(path, "rb"), status=status, content_type="application/zip" + ) + if content_type == "application/json": + return HttpResponse( + json.dumps(content), status=status, content_type=content_type + ) + return HttpResponse(content, status=status, content_type=content_type) @abstractmethod - def process_get(self, request, collection_name, deposit_id): + def process_get( + self, request: Request, collection_name: str, deposit_id: int + ) -> Tuple[int, Any, str]: """Routine to deal with the deposit's get processing. Returns: Tuple status, stream of content, content-type """ pass -class SWHPostDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): +class APIPost(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. 
""" - def post(self, request, collection_name, deposit_id=None, format=None): + def post( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> HttpResponse: """Endpoint to create/add resources to deposit. Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] _status, _iri_key, data = self.process_post( request, headers, collection_name, deposit_id ) error = data.get("error") if error: return make_error_response_from_dict(request, error) data["packagings"] = ACCEPT_PACKAGINGS iris = self._make_iris(request, collection_name, data["deposit_id"]) data.update(iris) response = render( request, "deposit/deposit_receipt.xml", context=data, content_type="application/xml", status=_status, ) - response._headers["location"] = "Location", data[_iri_key] + response._headers["location"] = "Location", data[_iri_key] # type: ignore return response @abstractmethod - def process_post(self, request, headers, collection_name, deposit_id=None): + def process_post( + self, + request, + headers: Dict, + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Tuple[int, str, Dict]: """Routine to deal with the deposit's processing. Returns Tuple of: - response status code (200, 201, etc...) - key iri (EM_IRI, EDIT_SE_IRI, etc...) - dictionary of the processing result """ pass -class SWHPutDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): +class APIPut(APIBase, metaclass=ABCMeta): """Mixin for class to support PUT method. """ - def put(self, request, collection_name, deposit_id, format=None): + def put( + self, request: Request, collection_name: str, deposit_id: int + ) -> HttpResponse: """Endpoint to update deposit resources. 
Returns: 204 response when no error during routine occurred. 400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) headers = checks["headers"] data = self.process_put(request, headers, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod - def process_put(self, request, headers, collection_name, deposit_id): + def process_put( + self, request: Request, headers: Dict, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Routine to deal with updating a deposit in some way. Returns dictionary of the processing result """ pass -class SWHDeleteDepositAPI(SWHBaseDeposit, metaclass=ABCMeta): +class APIDelete(APIBase, metaclass=ABCMeta): """Mixin for class to support DELETE method. """ - def delete(self, request, collection_name, deposit_id): + def delete( + self, request: Request, collection_name: str, deposit_id: Optional[int] = None + ) -> HttpResponse: """Endpoint to delete some deposit's resources (archives, deposit). Returns: 204 response when no error during routine occurred. 
400 if the deposit does not belong to the collection 404 if the deposit or the collection does not exist """ checks = self.checks(request, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(request, checks["error"]) + assert deposit_id is not None data = self.process_delete(request, collection_name, deposit_id) error = data.get("error") if error: return make_error_response_from_dict(request, error) return HttpResponse(status=status.HTTP_204_NO_CONTENT) @abstractmethod - def process_delete(self, request, collection_name, deposit_id): + def process_delete( + self, request: Request, collection_name: str, deposit_id: int + ) -> Dict: """Routine to delete a resource. This is mostly not allowed except for the - EM_IRI (cf. .api.deposit_update.SWHUpdateArchiveDeposit) + EM_IRI (cf. .api.deposit_update.APIUpdateArchive) """ - pass + return {} diff --git a/swh/deposit/api/deposit.py b/swh/deposit/api/deposit.py index c5258081..8cc4455c 100644 --- a/swh/deposit/api/deposit.py +++ b/swh/deposit/api/deposit.py @@ -1,95 +1,112 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Any, Dict, Optional, Tuple + from rest_framework import status -from .common import SWHPostDepositAPI, ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import EDIT_SE_IRI -from ..errors import make_error_dict, BAD_REQUEST -from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser -from ..parsers import SWHAtomEntryParser -from ..parsers import SWHMultiPartParser +from ..errors import BAD_REQUEST, make_error_dict +from ..parsers import ( + SWHAtomEntryParser, + SWHFileUploadTarParser, + SWHFileUploadZipParser, + SWHMultiPartParser, +) +from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, 
APIPost -class SWHDeposit(SWHPostDepositAPI): +class APIPostDeposit(APIPost): """Deposit request class defining api endpoints for sword deposit. What's known as 'Col IRI' in the sword specification. HTTP verbs supported: POST """ parser_classes = ( SWHMultiPartParser, SWHFileUploadZipParser, SWHFileUploadTarParser, SWHAtomEntryParser, ) - def additional_checks(self, req, headers, collection_name, deposit_id=None): + def additional_checks( + self, + req, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Dict[str, Any]: slug = headers["slug"] if not slug: msg = "Missing SLUG header in request" verbose_description = "Provide in the SLUG header one identifier, for example the url pointing to the resource you are depositing." # noqa return make_error_dict(BAD_REQUEST, msg, verbose_description) return {} - def process_post(self, req, headers, collection_name, deposit_id=None): + def process_post( + self, + req, + headers: Dict[str, Any], + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Tuple[int, str, Dict[str, Any]]: """Create a first deposit as: - archive deposit (1 zip) - multipart (1 zip + 1 atom entry) - atom entry Args: req (Request): the request holding the information to parse and inject in db collection_name (str): the associated client Returns: An http response (HttpResponse) according to the situation. If everything is ok, a 201 response (created) with a deposit receipt. Otherwise, depending on the upload, the following errors can be returned: - archive deposit: - 400 (bad request) if the request is not providing an external identifier - 403 (forbidden) if the length of the archive exceeds the max size configured - 412 (precondition failed) if the length or hash provided mismatch the reality of the archive. 
- 415 (unsupported media type) if a wrong media type is provided - multipart deposit: - 400 (bad request) if the request is not providing an external identifier - 412 (precondition failed) if the potentially md5 hash provided mismatch the reality of the archive - 415 (unsupported media type) if a wrong media type is provided - Atom entry deposit: - 400 (bad request) if the request is not providing an external identifier - 400 (bad request) if the request's body is empty - 415 (unsupported media type) if a wrong media type is provided """ assert deposit_id is None if req.content_type in ACCEPT_ARCHIVE_CONTENT_TYPES: data = self._binary_upload(req, headers, collection_name) elif req.content_type.startswith("multipart/"): data = self._multipart_upload(req, headers, collection_name) else: data = self._atom_entry(req, headers, collection_name) return status.HTTP_201_CREATED, EDIT_SE_IRI, data diff --git a/swh/deposit/api/deposit_content.py b/swh/deposit/api/deposit_content.py index a5065ae6..fbab2fe4 100644 --- a/swh/deposit/api/deposit_content.py +++ b/swh/deposit/api/deposit_content.py @@ -1,47 +1,47 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from django.http import HttpResponse from django.shortcuts import render from rest_framework import status -from .common import SWHBaseDeposit -from ..errors import NOT_FOUND, make_error_response -from ..errors import make_error_response_from_dict +from ..errors import NOT_FOUND, make_error_response, make_error_response_from_dict from ..models import DEPOSIT_STATUS_DETAIL, Deposit, DepositRequest +from .common import APIBase -class SWHDepositContent(SWHBaseDeposit): - def get(self, req, collection_name, deposit_id, format=None): +class APIContent(APIBase): + def 
get(self, req, collection_name: str, deposit_id: int) -> HttpResponse: checks = self.checks(req, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(req, checks["error"]) try: deposit = Deposit.objects.get(pk=deposit_id) if deposit.collection.name != collection_name: raise Deposit.DoesNotExist except Deposit.DoesNotExist: return make_error_response( req, NOT_FOUND, "deposit %s does not belong to collection %s" % (deposit_id, collection_name), ) requests = DepositRequest.objects.filter(deposit=deposit) context = { "deposit_id": deposit.id, "status": deposit.status, "status_detail": DEPOSIT_STATUS_DETAIL[deposit.status], "requests": requests, } return render( req, "deposit/content.xml", context=context, content_type="application/xml", status=status.HTTP_200_OK, ) diff --git a/swh/deposit/api/deposit_status.py b/swh/deposit/api/deposit_status.py index 18c26556..9c87db9c 100644 --- a/swh/deposit/api/deposit_status.py +++ b/swh/deposit/api/deposit_status.py @@ -1,65 +1,65 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from django.http import HttpResponse from django.shortcuts import render from rest_framework import status -from .common import SWHBaseDeposit -from .converters import convert_status_detail -from ..errors import NOT_FOUND, make_error_response -from ..errors import make_error_response_from_dict +from ..errors import NOT_FOUND, make_error_response, make_error_response_from_dict from ..models import DEPOSIT_STATUS_DETAIL, Deposit +from .common import APIBase +from .converters import convert_status_detail -class SWHDepositStatus(SWHBaseDeposit): +class APIStatus(APIBase): """Deposit status. What's known as 'State IRI' in the sword specification. 
HTTP verbs supported: GET """ - def get(self, req, collection_name, deposit_id, format=None): + def get(self, req, collection_name: str, deposit_id: int) -> HttpResponse: checks = self.checks(req, collection_name, deposit_id) if "error" in checks: return make_error_response_from_dict(req, checks["error"]) try: deposit = Deposit.objects.get(pk=deposit_id) if deposit.collection.name != collection_name: raise Deposit.DoesNotExist except Deposit.DoesNotExist: return make_error_response( req, NOT_FOUND, "deposit %s does not belong to collection %s" % (deposit_id, collection_name), ) status_detail = convert_status_detail(deposit.status_detail) if not status_detail: status_detail = DEPOSIT_STATUS_DETAIL[deposit.status] context = { "deposit_id": deposit.id, "status_detail": status_detail, } keys = ( "status", "swh_id", "swh_id_context", "external_id", ) for k in keys: context[k] = getattr(deposit, k, None) return render( req, "deposit/status.xml", context=context, content_type="application/xml", status=status.HTTP_200_OK, ) diff --git a/swh/deposit/api/deposit_update.py b/swh/deposit/api/deposit_update.py index 21707415..ded1bf5f 100644 --- a/swh/deposit/api/deposit_update.py +++ b/swh/deposit/api/deposit_update.py @@ -1,169 +1,185 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Any, Dict, Optional, Tuple + from rest_framework import status -from .common import SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI -from .common import ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI -from ..errors import make_error_dict, BAD_REQUEST -from ..parsers import SWHFileUploadZipParser, SWHFileUploadTarParser -from ..parsers import SWHAtomEntryParser -from 
..parsers import SWHMultiPartParser +from ..errors import BAD_REQUEST, make_error_dict +from ..parsers import ( + SWHAtomEntryParser, + SWHFileUploadTarParser, + SWHFileUploadZipParser, + SWHMultiPartParser, +) +from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut -class SWHUpdateArchiveDeposit(SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI): +class APIUpdateArchive(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'EM IRI' in the sword specification. HTTP verbs supported: PUT, POST, DELETE """ parser_classes = ( SWHFileUploadZipParser, SWHFileUploadTarParser, ) - def process_put(self, req, headers, collection_name, deposit_id): + def process_put( + self, req, headers, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Replace existing content for the existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_binary # noqa Returns: 204 No content """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) return make_error_dict(BAD_REQUEST, msg) return self._binary_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True ) - def process_post(self, req, headers, collection_name, deposit_id): + def process_post( + self, req, headers: Dict, collection_name: str, deposit_id: Optional[int] = None + ) -> Tuple[int, str, Dict]: """Add new content to the existing deposit. 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_mediaresource # noqa Returns: 201 Created Headers: Location: [Cont-File-IRI] Body: [optional Deposit Receipt] """ if req.content_type not in ACCEPT_ARCHIVE_CONTENT_TYPES: msg = "Packaging format supported is restricted to %s" % ( ", ".join(ACCEPT_ARCHIVE_CONTENT_TYPES) ) - return "unused", "unused", make_error_dict(BAD_REQUEST, msg) + unused = 0 + return unused, "unused", make_error_dict(BAD_REQUEST, msg) return ( status.HTTP_201_CREATED, CONT_FILE_IRI, self._binary_upload(req, headers, collection_name, deposit_id), ) - def process_delete(self, req, collection_name, deposit_id): + def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict: """Delete content (archives) from existing deposit. source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deletingcontent # noqa Returns: 204 Created """ return self._delete_archives(collection_name, deposit_id) -class SWHUpdateMetadataDeposit( - SWHPostDepositAPI, SWHPutDepositAPI, SWHDeleteDepositAPI -): +class APIUpdateMetadata(APIPost, APIPut, APIDelete): """Deposit request class defining api endpoints for sword deposit. What's known as 'Edit IRI' (and SE IRI) in the sword specification. HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE """ parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) - def process_put(self, req, headers, collection_name, deposit_id): + def process_put( + self, req, headers: Dict, collection_name: str, deposit_id: int + ) -> Dict[str, Any]: """Replace existing deposit's metadata/archive with new ones. 
source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa Returns: 204 No content """ if req.content_type.startswith("multipart/"): return self._multipart_upload( req, headers, collection_name, deposit_id=deposit_id, replace_archives=True, replace_metadata=True, ) return self._atom_entry( req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True ) - def process_post(self, req, headers, collection_name, deposit_id): + def process_post( + self, + request, + headers: Dict, + collection_name: str, + deposit_id: Optional[int] = None, + ) -> Tuple[int, str, Dict]: """Add new metadata/archive to existing deposit. source: - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_metadata # noqa - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_addingcontent_multipart # noqa This also deals with an empty post corner case to finalize a deposit. Returns: In optimal case for a multipart and atom-entry update, a 201 Created response. The body response will hold a deposit. And the response headers will contain an entry 'Location' with the EM-IRI. For the empty post case, this returns a 200. 
""" - if req.content_type.startswith("multipart/"): + assert deposit_id is not None + if request.content_type.startswith("multipart/"): return ( status.HTTP_201_CREATED, EM_IRI, self._multipart_upload( - req, headers, collection_name, deposit_id=deposit_id + request, headers, collection_name, deposit_id=deposit_id ), ) # check for final empty post # source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html # #continueddeposit_complete if headers["content-length"] == 0 and headers["in-progress"] is False: - data = self._empty_post(req, headers, collection_name, deposit_id) + data = self._empty_post(request, headers, collection_name, deposit_id) return (status.HTTP_200_OK, EDIT_SE_IRI, data) return ( status.HTTP_201_CREATED, EM_IRI, - self._atom_entry(req, headers, collection_name, deposit_id=deposit_id), + self._atom_entry(request, headers, collection_name, deposit_id=deposit_id), ) - def process_delete(self, req, collection_name, deposit_id): + def process_delete(self, req, collection_name: str, deposit_id: int) -> Dict: """Delete the container (deposit). 
source: http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_deleteconteiner # noqa """ return self._delete_deposit(collection_name, deposit_id) diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py index db3e2f5a..4a9aaaa8 100644 --- a/swh/deposit/api/private/__init__.py +++ b/swh/deposit/api/private/__init__.py @@ -1,109 +1,96 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.deposit import utils - -from ...config import METADATA_TYPE, SWHDefaultConfig -from ...models import DepositRequest, Deposit - from rest_framework.permissions import AllowAny -from swh.deposit.api.common import SWHAPIView -from swh.deposit.errors import make_error_dict, NOT_FOUND +from swh.deposit import utils +from swh.deposit.api.common import AuthenticatedAPIView +from swh.deposit.errors import NOT_FOUND, make_error_dict + +from ...config import METADATA_TYPE, APIConfig +from ...models import Deposit, DepositRequest class DepositReadMixin: """Deposit Read mixin """ def _deposit_requests(self, deposit, request_type): """Given a deposit, yields its associated deposit_request Args: deposit (Deposit): Deposit to list requests for request_type (str): 'archive' or 'metadata' Yields: deposit requests of type request_type associated to the deposit """ if isinstance(deposit, int): deposit = Deposit.objects.get(pk=deposit) deposit_requests = DepositRequest.objects.filter( type=request_type, deposit=deposit ).order_by("id") for deposit_request in deposit_requests: yield deposit_request def _metadata_get(self, deposit): """Given a deposit, aggregate all metadata requests. Args: deposit (Deposit): The deposit instance to extract metadata from. 
Returns: metadata dict from the deposit. """ metadata = ( m.metadata for m in self._deposit_requests(deposit, request_type=METADATA_TYPE) ) return utils.merge(*metadata) -class SWHPrivateAPIView(SWHDefaultConfig, SWHAPIView): +class APIPrivateView(APIConfig, AuthenticatedAPIView): """Mixin intended as private api (so no authentication) based API view (for the private ones). """ authentication_classes = () permission_classes = (AllowAny,) def checks(self, req, collection_name, deposit_id=None): """Override default checks implementation to allow empty collection. """ if deposit_id: try: Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: return make_error_dict( NOT_FOUND, "Deposit with id %s does not exist" % deposit_id ) headers = self._read_headers(req) checks = self.additional_checks(req, headers, collection_name, deposit_id) if "error" in checks: return checks return {"headers": headers} def get( - self, - request, - collection_name=None, - deposit_id=None, - format=None, - *args, - **kwargs, + self, request, collection_name=None, deposit_id=None, *args, **kwargs, ): - return super().get(request, collection_name, deposit_id, format) + return super().get(request, collection_name, deposit_id) def put( - self, - request, - collection_name=None, - deposit_id=None, - format=None, - *args, - **kwargs, + self, request, collection_name=None, deposit_id=None, *args, **kwargs, ): - return super().put(request, collection_name, deposit_id, format) + return super().put(request, collection_name, deposit_id) diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py index 76924560..d2afd5e7 100644 --- a/swh/deposit/api/private/deposit_check.py +++ b/swh/deposit/api/private/deposit_check.py @@ -1,230 +1,234 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public 
License version 3, or any later version # See top-level LICENSE file for more information -import json +from itertools import chain import re +from shutil import get_unpack_formats import tarfile +from typing import Dict, Optional, Tuple import zipfile -from itertools import chain -from shutil import get_unpack_formats - from rest_framework import status from swh.scheduler.utils import create_oneshot_task_dict -from . import DepositReadMixin, SWHPrivateAPIView -from ..common import SWHGetDepositAPI -from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED -from ...config import ARCHIVE_TYPE -from ...models import Deposit +from . import APIPrivateView, DepositReadMixin +from ...config import ARCHIVE_TYPE, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED +from ...models import Deposit, DepositRequest +from ..common import APIGet MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" ALTERNATE_FIELDS_MISSING = "Mandatory alternate fields are missing" MANDATORY_ARCHIVE_UNREADABLE = ( "At least one of its associated archives is not readable" # noqa ) MANDATORY_ARCHIVE_INVALID = ( "Mandatory archive is invalid (i.e contains only one archive)" # noqa ) MANDATORY_ARCHIVE_UNSUPPORTED = "Mandatory archive type is not supported" MANDATORY_ARCHIVE_MISSING = "Deposit without archive is rejected" ARCHIVE_EXTENSIONS = [ "zip", "tar", "tar.gz", "xz", "tar.xz", "bz2", "tar.bz2", "Z", "tar.Z", "tgz", "7z", ] PATTERN_ARCHIVE_EXTENSION = re.compile(r".*\.(%s)$" % "|".join(ARCHIVE_EXTENSIONS)) def known_archive_format(filename): return any( filename.endswith(t) for t in chain(*(x[1] for x in get_unpack_formats())) ) -class SWHChecksDeposit(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin): +class APIChecks(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. 
""" - def _check_deposit_archives(self, deposit): + def _check_deposit_archives(self, deposit: Deposit) -> Tuple[bool, Optional[Dict]]: """Given a deposit, check each deposit request of type archive. Args: The deposit to check archives for Returns tuple (status, error_detail): True, None if all archives are ok, (False, ) otherwise. """ requests = list(self._deposit_requests(deposit, request_type=ARCHIVE_TYPE)) if len(requests) == 0: # no associated archive is refused return False, {"archive": [{"summary": MANDATORY_ARCHIVE_MISSING,}]} errors = [] for archive_request in requests: check, error_message = self._check_archive(archive_request) if not check: errors.append( {"summary": error_message, "fields": [archive_request.id]} ) if not errors: return True, None return False, {"archive": errors} - def _check_archive(self, archive_request): + def _check_archive( + self, archive_request: DepositRequest + ) -> Tuple[bool, Optional[str]]: """Check that a deposit associated archive is ok: - readable - supported archive format - valid content: the archive does not contain a single archive file If any of those checks are not ok, return the corresponding failing check. Args: archive_path (DepositRequest): Archive to check Returns: (True, None) if archive is check compliant, (False, ) otherwise. 
""" archive_path = archive_request.archive.path if not known_archive_format(archive_path): return False, MANDATORY_ARCHIVE_UNSUPPORTED try: if zipfile.is_zipfile(archive_path): - with zipfile.ZipFile(archive_path) as f: - files = f.namelist() + with zipfile.ZipFile(archive_path) as zipfile_: + files = zipfile_.namelist() elif tarfile.is_tarfile(archive_path): - with tarfile.open(archive_path) as f: - files = f.getnames() + with tarfile.open(archive_path) as tarfile_: + files = tarfile_.getnames() else: return False, MANDATORY_ARCHIVE_UNSUPPORTED except Exception: return False, MANDATORY_ARCHIVE_UNREADABLE if len(files) > 1: return True, None element = files[0] if PATTERN_ARCHIVE_EXTENSION.match(element): # archive in archive! return False, MANDATORY_ARCHIVE_INVALID return True, None - def _check_metadata(self, metadata): + def _check_metadata(self, metadata: Dict) -> Tuple[bool, Optional[Dict]]: """Check to execute on all metadata for mandatory field presence. Args: metadata (dict): Metadata dictionary to check for mandatory fields Returns: tuple (status, error_detail): True, None if metadata are ok (False, ) otherwise. 
""" required_fields = { "author": False, } alternate_fields = { ("name", "title"): False, # alternate field, at least one # of them must be present } for field, value in metadata.items(): for name in required_fields: if name in field: required_fields[name] = True for possible_names in alternate_fields: for possible_name in possible_names: if possible_name in field: alternate_fields[possible_names] = True continue mandatory_result = [k for k, v in required_fields.items() if not v] optional_result = [" or ".join(k) for k, v in alternate_fields.items() if not v] if mandatory_result == [] and optional_result == []: return True, None detail = [] if mandatory_result != []: detail.append( {"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result} ) if optional_result != []: detail.append( {"summary": ALTERNATE_FIELDS_MISSING, "fields": optional_result,} ) return False, {"metadata": detail} - def process_get(self, req, collection_name, deposit_id): + def process_get( + self, req, collection_name: str, deposit_id: int + ) -> Tuple[int, Dict, str]: """Build a unique tarball from the multiple received and stream that content to the client. 
Args: req (Request): collection_name (str): Collection owning the deposit deposit_id (id): Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ deposit = Deposit.objects.get(pk=deposit_id) metadata = self._metadata_get(deposit) - problems = {} + problems: Dict = {} # will check each deposit's associated request (both of type # archive and metadata) for errors archives_status, error_detail = self._check_deposit_archives(deposit) if not archives_status: + assert error_detail is not None problems.update(error_detail) metadata_status, error_detail = self._check_metadata(metadata) if not metadata_status: + assert error_detail is not None problems.update(error_detail) deposit_status = archives_status and metadata_status # if any problems arose, the deposit is rejected if not deposit_status: deposit.status = DEPOSIT_STATUS_REJECTED deposit.status_detail = problems response = { "status": deposit.status, "details": deposit.status_detail, } else: deposit.status = DEPOSIT_STATUS_VERIFIED response = { "status": deposit.status, } if not deposit.load_task_id and self.config["checks"]: url = deposit.origin_url task = create_oneshot_task_dict( "load-deposit", url=url, deposit_id=deposit.id, retries_left=3 ) load_task_id = self.scheduler.create_tasks([task])[0]["id"] deposit.load_task_id = load_task_id deposit.save() - return status.HTTP_200_OK, json.dumps(response), "application/json" + return status.HTTP_200_OK, response, "application/json" diff --git a/swh/deposit/api/private/deposit_list.py b/swh/deposit/api/private/deposit_list.py index c63a14df..a5c81c12 100644 --- a/swh/deposit/api/private/deposit_list.py +++ b/swh/deposit/api/private/deposit_list.py @@ -1,66 +1,66 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from rest_framework 
import serializers from rest_framework.fields import _UnvalidatedField from rest_framework.generics import ListAPIView from rest_framework.pagination import PageNumberPagination -from rest_framework import serializers -from . import SWHPrivateAPIView -from ..converters import convert_status_detail +from . import APIPrivateView from ...models import Deposit +from ..converters import convert_status_detail class DefaultPagination(PageNumberPagination): page_size = 100 page_size_query_param = "page_size" class StatusDetailField(_UnvalidatedField): """status_detail field is a dict, we want a simple message instead. So, we reuse the convert_status_detail from deposit_status endpoint to that effect. """ def to_representation(self, value): return convert_status_detail(value) class DepositSerializer(serializers.ModelSerializer): status_detail = StatusDetailField() class Meta: model = Deposit fields = "__all__" -class DepositList(ListAPIView, SWHPrivateAPIView): +class APIList(ListAPIView, APIPrivateView): """Deposit request class to list the deposit's status per page. 
HTTP verbs supported: GET """ serializer_class = DepositSerializer pagination_class = DefaultPagination def get_queryset(self): params = self.request.query_params exclude_like = params.get("exclude") if exclude_like: # sql injection: A priori, nothing to worry about, django does it for # queryset # https://docs.djangoproject.com/en/3.0/topics/security/#sql-injection-protection # noqa # https://docs.djangoproject.com/en/2.2/topics/security/#sql-injection-protection # noqa deposits = ( Deposit.objects.all() .exclude(external_id__startswith=exclude_like) .order_by("id") ) else: deposits = Deposit.objects.all().order_by("id") return deposits diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py index a387fc7f..51b6636e 100644 --- a/swh/deposit/api/private/deposit_read.py +++ b/swh/deposit/api/private/deposit_read.py @@ -1,218 +1,195 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json +from contextlib import contextmanager import os import shutil import tempfile +from typing import Any, Dict, Tuple -from contextlib import contextmanager -from django.http import FileResponse from rest_framework import status from swh.core import tarball -from swh.model import identifiers +from swh.deposit.api import __version__ from swh.deposit.utils import normalize_date +from swh.model import identifiers -from . import DepositReadMixin, SWHPrivateAPIView -from ...config import SWH_PERSON, ARCHIVE_TYPE -from ..common import SWHGetDepositAPI +from . 
import APIPrivateView, DepositReadMixin +from ...config import ARCHIVE_TYPE, SWH_PERSON from ...models import Deposit +from ..common import APIGet @contextmanager def aggregate_tarballs(extraction_dir, archive_paths): """Aggregate multiple tarballs into one and returns this new archive's path. Args: extraction_dir (path): Path to use for the tarballs computation archive_paths ([str]): Deposit's archive paths Returns: Tuple (directory to clean up, archive path (aggregated or not)) """ # rebuild one zip archive from (possibly) multiple ones os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix="swh.deposit-", dir=extraction_dir) # root folder to build an aggregated tarball aggregated_tarball_rootdir = os.path.join(dir_path, "aggregate") os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True) # uncompress in a temporary location all archives for archive_path in archive_paths: tarball.uncompress(archive_path, aggregated_tarball_rootdir) # Aggregate into one big tarball the multiple smaller ones temp_tarpath = shutil.make_archive( aggregated_tarball_rootdir, "zip", aggregated_tarball_rootdir ) # can already clean up temporary directory shutil.rmtree(aggregated_tarball_rootdir) try: yield temp_tarpath finally: shutil.rmtree(dir_path) -class SWHDepositReadArchives(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin): +class APIReadArchives(APIPrivateView, APIGet, DepositReadMixin): """Dedicated class to read a deposit's raw archives content. Only GET is supported. 
""" - ADDITIONAL_CONFIG = { - "extraction_dir": ("str", "/tmp/swh-deposit/archive/"), - } - def __init__(self): super().__init__() self.extraction_dir = self.config["extraction_dir"] if not os.path.exists(self.extraction_dir): os.makedirs(self.extraction_dir) - def process_get(self, request, collection_name, deposit_id): + def process_get( + self, request, collection_name: str, deposit_id: int + ) -> Tuple[int, Any, str]: """Build a unique tarball from the multiple received and stream that content to the client. Args: request (Request): - collection_name (str): Collection owning the deposit - deposit_id (id): Deposit concerned by the reading + collection_name: Collection owning the deposit + deposit_id: Deposit concerned by the reading Returns: Tuple status, stream of content, content-type """ archive_paths = [ r.archive.path for r in self._deposit_requests(deposit_id, request_type=ARCHIVE_TYPE) ] - with aggregate_tarballs(self.extraction_dir, archive_paths) as path: - return FileResponse( - open(path, "rb"), - status=status.HTTP_200_OK, - content_type="application/zip", - ) + return ( + status.HTTP_200_OK, + aggregate_tarballs(self.extraction_dir, archive_paths), + "swh/generator", + ) -class SWHDepositReadMetadata(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin): +class APIReadMetadata(APIPrivateView, APIGet, DepositReadMixin): """Class in charge of aggregating metadata on a deposit. 
- """ - - ADDITIONAL_CONFIG = { - "provider": ( - "dict", - { - # 'provider_name': '', # those are not set since read from the - # 'provider_url': '', # deposit's client - "provider_type": "deposit_client", - "metadata": {}, - }, - ), - "tool": ( - "dict", - { - "name": "swh-deposit", - "version": "0.0.1", - "configuration": {"sword_version": "2"}, - }, - ), - } + """ def __init__(self): super().__init__() self.provider = self.config["provider"] - self.tool = self.config["tool"] + self.tool = { + "name": "swh-deposit", + "version": __version__, + "configuration": {"sword_version": "2"}, + } def _normalize_dates(self, deposit, metadata): """Normalize the date to use as a tuple of author date, committer date from the incoming metadata. Args: deposit (Deposit): Deposit model representation metadata (Dict): Metadata dict representation Returns: Tuple of author date, committer date. Those dates are swh normalized. """ commit_date = metadata.get("codemeta:datePublished") author_date = metadata.get("codemeta:dateCreated") if author_date and commit_date: pass elif commit_date: author_date = commit_date elif author_date: commit_date = author_date else: author_date = deposit.complete_date commit_date = deposit.complete_date return (normalize_date(author_date), normalize_date(commit_date)) def metadata_read(self, deposit): """Read and aggregate multiple data on deposit into one unified data dictionary. Args: deposit (Deposit): Deposit concerned by the data aggregation. Returns: Dictionary of data representing the deposit to inject in swh. 
""" metadata = self._metadata_get(deposit) # Read information metadata data = {"origin": {"type": "deposit", "url": deposit.origin_url,}} # metadata provider self.provider["provider_name"] = deposit.client.last_name self.provider["provider_url"] = deposit.client.provider_url author_date, commit_date = self._normalize_dates(deposit, metadata) if deposit.parent: swh_persistent_id = deposit.parent.swh_id - persistent_identifier = identifiers.parse_persistent_identifier( - swh_persistent_id - ) - parent_revision = persistent_identifier.object_id + swhid = identifiers.parse_swhid(swh_persistent_id) + parent_revision = swhid.object_id parents = [parent_revision] else: parents = [] data["origin_metadata"] = { "provider": self.provider, "tool": self.tool, "metadata": metadata, } data["deposit"] = { "id": deposit.id, "client": deposit.client.username, "collection": deposit.collection.name, "author": SWH_PERSON, "author_date": author_date, "committer": SWH_PERSON, "committer_date": commit_date, "revision_parents": parents, } return data - def process_get(self, request, collection_name, deposit_id): + def process_get( + self, request, collection_name: str, deposit_id: int + ) -> Tuple[int, Dict, str]: deposit = Deposit.objects.get(pk=deposit_id) data = self.metadata_read(deposit) - d = {} - if data: - d = json.dumps(data) - - return status.HTTP_200_OK, d, "application/json" + return status.HTTP_200_OK, data if data else {}, "application/json" diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index 67fa99f0..af6bcb6c 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,105 +1,107 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from 
rest_framework.parsers import JSONParser +from typing import Dict -from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT +from rest_framework.parsers import JSONParser -from . import SWHPrivateAPIView -from ..common import SWHPutDepositAPI -from ...errors import make_error_dict, BAD_REQUEST -from ...models import Deposit, DEPOSIT_STATUS_DETAIL -from ...models import DEPOSIT_STATUS_LOAD_SUCCESS +from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid +from . import APIPrivateView +from ...errors import BAD_REQUEST, make_error_dict +from ...models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS, Deposit +from ..common import APIPut MANDATORY_KEYS = ["origin_url", "revision_id", "directory_id", "snapshot_id"] -class SWHUpdateStatusDeposit(SWHPrivateAPIView, SWHPutDepositAPI): +class APIUpdateStatus(APIPrivateView, APIPut): """Deposit request class to update the deposit's status. HTTP verbs supported: PUT """ parser_classes = (JSONParser,) def additional_checks(self, request, headers, collection_name, deposit_id=None): """Enrich existing checks to the default ones. 
New checks: - Ensure the status is provided - Ensure it exists - no missing information on load success update """ data = request.data status = data.get("status") if not status: msg = "The status key is mandatory with possible values %s" % list( DEPOSIT_STATUS_DETAIL.keys() ) return make_error_dict(BAD_REQUEST, msg) if status not in DEPOSIT_STATUS_DETAIL: msg = "Possible status in %s" % list(DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) if status == DEPOSIT_STATUS_LOAD_SUCCESS: missing_keys = [] for key in MANDATORY_KEYS: value = data.get(key) if value is None: missing_keys.append(key) if missing_keys: msg = ( f"Updating deposit status to {status}" f" requires information {','.join(missing_keys)}" ) return make_error_dict(BAD_REQUEST, msg) return {} - def process_put(self, request, headers, collection_name, deposit_id): + def process_put( + self, request, headers: Dict, collection_name: str, deposit_id: int + ) -> Dict: """Update the deposit with status and SWHIDs Returns: 204 No content 400 Bad request if checks fail """ data = request.data deposit = Deposit.objects.get(pk=deposit_id) status = data["status"] deposit.status = status if status == DEPOSIT_STATUS_LOAD_SUCCESS: origin_url = data["origin_url"] directory_id = data["directory_id"] revision_id = data["revision_id"] - dir_id = persistent_identifier(DIRECTORY, directory_id) - snp_id = persistent_identifier(SNAPSHOT, data["snapshot_id"]) - rev_id = persistent_identifier(REVISION, revision_id) + dir_id = swhid(DIRECTORY, directory_id) + snp_id = swhid(SNAPSHOT, data["snapshot_id"]) + rev_id = swhid(REVISION, revision_id) deposit.swh_id = dir_id # new id with contextual information - deposit.swh_id_context = persistent_identifier( + deposit.swh_id_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin_url, "visit": snp_id, "anchor": rev_id, "path": "/", }, ) else: # rejected deposit.status = status deposit.save() return {} diff --git a/swh/deposit/api/private/urls.py 
b/swh/deposit/api/private/urls.py index 19330bbf..e48290d6 100644 --- a/swh/deposit/api/private/urls.py +++ b/swh/deposit/api/private/urls.py @@ -1,79 +1,78 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.conf.urls import url from ...config import ( - PRIVATE_GET_RAW_CONTENT, - PRIVATE_PUT_DEPOSIT, - PRIVATE_GET_DEPOSIT_METADATA, PRIVATE_CHECK_DEPOSIT, + PRIVATE_GET_DEPOSIT_METADATA, + PRIVATE_GET_RAW_CONTENT, PRIVATE_LIST_DEPOSITS, + PRIVATE_PUT_DEPOSIT, ) -from .deposit_read import SWHDepositReadArchives -from .deposit_read import SWHDepositReadMetadata -from .deposit_update_status import SWHUpdateStatusDeposit -from .deposit_check import SWHChecksDeposit -from .deposit_list import DepositList +from .deposit_check import APIChecks +from .deposit_list import APIList +from .deposit_read import APIReadArchives, APIReadMetadata +from .deposit_update_status import APIUpdateStatus urlpatterns = [ # Retrieve deposit's raw archives' content # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/raw/$", - SWHDepositReadArchives.as_view(), + APIReadArchives.as_view(), name=PRIVATE_GET_RAW_CONTENT, ), # Update deposit's status # -> PUT url( r"^(?P[^/]+)/(?P[^/]+)/update/$", - SWHUpdateStatusDeposit.as_view(), + APIUpdateStatus.as_view(), name=PRIVATE_PUT_DEPOSIT, ), # Retrieve metadata information on a specific deposit # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/meta/$", - SWHDepositReadMetadata.as_view(), + APIReadMetadata.as_view(), name=PRIVATE_GET_DEPOSIT_METADATA, ), # Check archive and metadata information on a specific deposit # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/check/$", - SWHChecksDeposit.as_view(), + APIChecks.as_view(), name=PRIVATE_CHECK_DEPOSIT, ), # Retrieve deposit's raw archives' content # -> GET url( 
r"^(?P[^/]+)/raw/$", - SWHDepositReadArchives.as_view(), + APIReadArchives.as_view(), name=PRIVATE_GET_RAW_CONTENT + "-nc", ), # Update deposit's status # -> PUT url( r"^(?P[^/]+)/update/$", - SWHUpdateStatusDeposit.as_view(), + APIUpdateStatus.as_view(), name=PRIVATE_PUT_DEPOSIT + "-nc", ), # Retrieve metadata information on a specific deposit # -> GET url( r"^(?P[^/]+)/meta/$", - SWHDepositReadMetadata.as_view(), + APIReadMetadata.as_view(), name=PRIVATE_GET_DEPOSIT_METADATA + "-nc", ), # Check archive and metadata information on a specific deposit # -> GET url( r"^(?P[^/]+)/check/$", - SWHChecksDeposit.as_view(), + APIChecks.as_view(), name=PRIVATE_CHECK_DEPOSIT + "-nc", ), - url(r"^deposits/$", DepositList.as_view(), name=PRIVATE_LIST_DEPOSITS), + url(r"^deposits/$", APIList.as_view(), name=PRIVATE_LIST_DEPOSITS), ] diff --git a/swh/deposit/api/service_document.py b/swh/deposit/api/service_document.py index 6aa3899c..a36cb304 100644 --- a/swh/deposit/api/service_document.py +++ b/swh/deposit/api/service_document.py @@ -1,34 +1,33 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import render from django.urls import reverse -from .common import SWHBaseDeposit, ACCEPT_PACKAGINGS -from .common import ACCEPT_ARCHIVE_CONTENT_TYPES from ..config import COL_IRI from ..models import DepositClient, DepositCollection +from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, ACCEPT_PACKAGINGS, APIBase -class SWHServiceDocument(SWHBaseDeposit): +class APIServiceDocument(APIBase): def get(self, req, *args, **kwargs): client = DepositClient.objects.get(username=req.user) collections = {} for col_id in client.collections: col = DepositCollection.objects.get(pk=col_id) col_uri = 
req.build_absolute_uri(reverse(COL_IRI, args=[col.name])) collections[col.name] = col_uri context = { "max_upload_size": self.config["max_upload_size"], "accept_packagings": ACCEPT_PACKAGINGS, "accept_content_types": ACCEPT_ARCHIVE_CONTENT_TYPES, "collections": collections, } return render( req, "deposit/service_document.xml", context, content_type="application/xml" ) diff --git a/swh/deposit/api/urls.py b/swh/deposit/api/urls.py index 236db5aa..e7a686af 100644 --- a/swh/deposit/api/urls.py +++ b/swh/deposit/api/urls.py @@ -1,70 +1,68 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """SWH's deposit api URL Configuration """ from django.conf.urls import url from django.shortcuts import render -from ..config import EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI -from ..config import SD_IRI, COL_IRI, STATE_IRI -from .deposit import SWHDeposit -from .deposit_status import SWHDepositStatus -from .deposit_update import SWHUpdateMetadataDeposit -from .deposit_update import SWHUpdateArchiveDeposit -from .deposit_content import SWHDepositContent -from .service_document import SWHServiceDocument +from ..config import COL_IRI, CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI, SD_IRI, STATE_IRI +from .deposit import APIPostDeposit +from .deposit_content import APIContent +from .deposit_status import APIStatus +from .deposit_update import APIUpdateArchive, APIUpdateMetadata +from .service_document import APIServiceDocument def api_view(req): return render(req, "api.html") # PUBLIC API urlpatterns = [ # simple view on the api url(r"^$", api_view, name="api"), # SD IRI - Service Document IRI # -> GET - url(r"^servicedocument/", SWHServiceDocument.as_view(), name=SD_IRI), + url(r"^servicedocument/", APIServiceDocument.as_view(), name=SD_IRI), # 
Col IRI - Collection IRI # -> POST - url(r"^(?P[^/]+)/$", SWHDeposit.as_view(), name=COL_IRI), + url(r"^(?P[^/]+)/$", APIPostDeposit.as_view(), name=COL_IRI), # EM IRI - Atom Edit Media IRI (update archive IRI) # -> PUT (update-in-place existing archive) # -> POST (add new archive) url( r"^(?P[^/]+)/(?P[^/]+)/media/$", - SWHUpdateArchiveDeposit.as_view(), + APIUpdateArchive.as_view(), name=EM_IRI, ), # Edit IRI - Atom Entry Edit IRI (update metadata IRI) # SE IRI - Sword Edit IRI ;; possibly same as Edit IRI # -> PUT (update in place) # -> POST (add new metadata) url( r"^(?P[^/]+)/(?P[^/]+)/metadata/$", - SWHUpdateMetadataDeposit.as_view(), + APIUpdateMetadata.as_view(), name=EDIT_SE_IRI, ), # State IRI # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/status/$", - SWHDepositStatus.as_view(), + APIStatus.as_view(), name=STATE_IRI, ), # Cont/File IRI # -> GET url( r"^(?P[^/]+)/(?P[^/]+)/content/$", - SWHDepositContent.as_view(), + APIContent.as_view(), name=CONT_FILE_IRI, ), # specification is not clear about # FILE-IRI, we assume it's the same as # the CONT-IRI one ] diff --git a/swh/deposit/cli/__init__.py b/swh/deposit/cli/__init__.py index cc6e00dc..b13d0e5b 100644 --- a/swh/deposit/cli/__init__.py +++ b/swh/deposit/cli/__init__.py @@ -1,40 +1,43 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import click import logging -from swh.core.cli import CONTEXT_SETTINGS +# WARNING: do not import unnecessary things here to keep cli startup time under +# control +import click + +from swh.core.cli import CONTEXT_SETTINGS, swh as swh_cli_group logger = logging.getLogger(__name__) -@click.group(context_settings=CONTEXT_SETTINGS) +@swh_cli_group.group(context_settings=CONTEXT_SETTINGS) @click.pass_context def deposit(ctx): """Deposit main command """ ctx.ensure_object(dict) log_level = 
ctx.obj.get("log_level", logging.INFO) logger.setLevel(log_level) def main(): logging.basicConfig() return deposit(auto_envvar_prefix="SWH_DEPOSIT") # These import statements MUST be executed after defining the 'deposit' group # since the subcommands in these are defined using this 'deposit' group. from . import client # noqa try: from . import admin # noqa except ImportError: # server part is optional logger.debug("admin subcommand not loaded") if __name__ == "__main__": main() diff --git a/swh/deposit/cli/admin.py b/swh/deposit/cli/admin.py index 6b387940..a56581de 100644 --- a/swh/deposit/cli/admin.py +++ b/swh/deposit/cli/admin.py @@ -1,271 +1,275 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +# WARNING: do not import unnecessary things here to keep cli startup time under +# control import click -from swh.deposit.config import setup_django_for from swh.deposit.cli import deposit @deposit.group("admin") @click.option( "--config-file", "-C", default=None, type=click.Path(exists=True, dir_okay=False,), help="Optional extra configuration file.", ) @click.option( "--platform", default="development", type=click.Choice(["development", "production"]), help="development or production platform", ) @click.pass_context def admin(ctx, config_file, platform): """Server administration tasks (manipulate user or collections)""" + from swh.deposit.config import setup_django_for + # configuration happens here setup_django_for(platform, config_file=config_file) @admin.group("user") @click.pass_context def user(ctx): """Manipulate user.""" # configuration happens here pass def _create_collection(name): """Create the collection with name if it does not exist. 
Args: name (str): collection's name Returns: collection (DepositCollection): the existing collection object (created or not) """ # to avoid loading too early django namespaces from swh.deposit.models import DepositCollection try: collection = DepositCollection.objects.get(name=name) click.echo("Collection %s exists, nothing to do." % name) except DepositCollection.DoesNotExist: click.echo("Create new collection %s" % name) collection = DepositCollection.objects.create(name=name) click.echo("Collection %s created" % name) return collection @user.command("create") @click.option("--username", required=True, help="User's name") @click.option("--password", required=True, help="Desired user's password (plain).") @click.option("--firstname", default="", help="User's first name") @click.option("--lastname", default="", help="User's last name") @click.option("--email", default="", help="User's email") @click.option("--collection", help="User's collection") @click.option("--provider-url", default="", help="Provider URL") @click.option("--domain", default="", help="The domain") @click.pass_context def user_create( ctx, username, password, firstname, lastname, email, collection, provider_url, domain, ): """Create a user with some needed information (password, collection) If the collection does not exist, the collection is then created alongside. The password is stored encrypted using django's utilities. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient # If collection is not provided, fallback to username if not collection: collection = username click.echo("collection: %s" % collection) # create the collection if it does not exist collection = _create_collection(collection) # user create/update try: user = DepositClient.objects.get(username=username) click.echo("User %s exists, updating information." 
% user) user.set_password(password) except DepositClient.DoesNotExist: click.echo("Create new user %s" % username) user = DepositClient.objects.create_user(username=username, password=password) user.collections = [collection.id] user.first_name = firstname user.last_name = lastname user.email = email user.is_active = True user.provider_url = provider_url user.domain = domain user.save() click.echo("Information registered for user %s" % user) @user.command("list") @click.pass_context def user_list(ctx): """List existing users. This entrypoint is not paginated yet as there is not a lot of entry. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient users = DepositClient.objects.all() if not users: output = "Empty user list" else: output = "\n".join((user.username for user in users)) click.echo(output) @user.command("exists") @click.argument("username", required=True) @click.pass_context def user_exists(ctx, username): """Check if user exists. """ # to avoid loading too early django namespaces from swh.deposit.models import DepositClient try: DepositClient.objects.get(username=username) click.echo("User %s exists." % username) ctx.exit(0) except DepositClient.DoesNotExist: click.echo("User %s does not exist." % username) ctx.exit(1) @admin.group("collection") @click.pass_context def collection(ctx): """Manipulate collections.""" pass @collection.command("create") @click.option("--name", required=True, help="Collection's name") @click.pass_context def collection_create(ctx, name): _create_collection(name) @collection.command("list") @click.pass_context def collection_list(ctx): """List existing collections. This entrypoint is not paginated yet as there is not a lot of entry. 
""" # to avoid loading too early django namespaces from swh.deposit.models import DepositCollection collections = DepositCollection.objects.all() if not collections: output = "Empty collection list" else: output = "\n".join((col.name for col in collections)) click.echo(output) @admin.group("deposit") @click.pass_context def adm_deposit(ctx): """Manipulate deposit.""" pass @adm_deposit.command("reschedule") @click.option("--deposit-id", required=True, help="Deposit identifier") @click.pass_context def adm_deposit_reschedule(ctx, deposit_id): """Reschedule the deposit loading This will: - check the deposit's status to something reasonable (failed or done). That means that the checks have passed alright but something went wrong during the loading (failed: loading failed, done: loading ok, still for some reasons as in bugs, we need to reschedule it) - reset the deposit's status to 'verified' (prior to any loading but after the checks which are fine) and removes the different archives' identifiers (swh-id, ...) - trigger back the loading task through the scheduler """ # to avoid loading too early django namespaces from datetime import datetime - from swh.deposit.models import Deposit + from swh.deposit.config import ( - DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, + DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_VERIFIED, - SWHDefaultConfig, + APIConfig, ) + from swh.deposit.models import Deposit try: deposit = Deposit.objects.get(pk=deposit_id) except Deposit.DoesNotExist: click.echo("Deposit %s does not exist." % deposit_id) ctx.exit(1) # Check the deposit is in a reasonable state accepted_statuses = [DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE] if deposit.status == DEPOSIT_STATUS_VERIFIED: click.echo("Deposit %s's status already set for rescheduling." % (deposit_id)) ctx.exit(0) if deposit.status not in accepted_statuses: click.echo( "Deposit %s's status be one of %s." 
% (deposit_id, ", ".join(accepted_statuses)) ) ctx.exit(1) task_id = deposit.load_task_id if not task_id: click.echo( "Deposit %s cannot be rescheduled. It misses the " "associated task." % deposit_id ) ctx.exit(1) # Reset the deposit's state deposit.swh_id = None deposit.swh_id_context = None deposit.status = DEPOSIT_STATUS_VERIFIED deposit.save() # Trigger back the deposit - scheduler = SWHDefaultConfig().scheduler + scheduler = APIConfig().scheduler scheduler.set_status_tasks( [task_id], status="next_run_not_scheduled", next_run=datetime.now() ) diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py index cf618307..1e68d8c4 100644 --- a/swh/deposit/cli/client.py +++ b/swh/deposit/cli/client.py @@ -1,497 +1,509 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os import logging + +# WARNING: do not import unnecessary things here to keep cli startup time under +# control +import os import sys -import tempfile -import uuid -import json -import yaml import click -import xmltodict -from swh.deposit.client import PublicApiDepositClient, MaintenanceError from swh.deposit.cli import deposit - logger = logging.getLogger(__name__) class InputError(ValueError): """Input script error """ pass def generate_slug(): """Generate a slug (sample purposes). """ + import uuid + return str(uuid.uuid4()) def _url(url): """Force the /1 api version at the end of the url (avoiding confusing issues without it). 
Args: url (str): api url used by cli users Returns: Top level api url to actually request """ if not url.endswith("/1"): url = "%s/1" % url return url def generate_metadata_file(name, external_id, authors, temp_dir): """Generate a temporary metadata file with the minimum required metadata This generates a xml file in a temporary location and returns the path to that file. This is up to the client of that function to clean up the temporary file. Args: name (str): Software's name external_id (str): External identifier (slug) or generated one authors (List[str]): List of author names Returns: Filepath to the metadata generated file """ + import xmltodict + path = os.path.join(temp_dir, "metadata.xml") # generate a metadata file with the minimum required metadata codemetadata = { "entry": { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:name": name, "codemeta:identifier": external_id, "codemeta:author": [ {"codemeta:name": author_name} for author_name in authors ], }, } logging.debug("Temporary file: %s", path) logging.debug("Metadata dict to generate as xml: %s", codemetadata) s = xmltodict.unparse(codemetadata, pretty=True) logging.debug("Metadata dict as xml generated: %s", s) with open(path, "w") as fp: fp.write(s) return path def _client(url, username, password): """Instantiate a client to access the deposit api server Args: url (str): Deposit api server username (str): User password (str): User's password """ + from swh.deposit.client import PublicApiDepositClient + client = PublicApiDepositClient( {"url": url, "auth": {"username": username, "password": password},} ) return client def _collection(client): """Retrieve the client's collection """ # retrieve user's collection sd_content = client.service_document() if "error" in sd_content: raise InputError("Service document retrieval: %s" % (sd_content["error"],)) collection = sd_content["service"]["workspace"]["collection"]["sword:name"] return 
collection def client_command_parse_input( username, password, archive, metadata, archive_deposit, metadata_deposit, collection, slug, partial, deposit_id, replace, url, name, authors, temp_dir, ): """Parse the client subcommand options and make sure the combination is acceptable*. If not, an InputError exception is raised explaining the issue. By acceptable, we mean: - A multipart deposit (create or update) requires: - an existing software archive - an existing metadata file or author(s) and name provided in params - A binary deposit (create/update) requires an existing software archive - A metadata deposit (create/update) requires an existing metadata file or author(s) and name provided in params - A deposit update requires a deposit_id This will not prevent all failure cases though. The remaining errors are already dealt with by the underlying api client. Raises: InputError explaining the user input related issue MaintenanceError explaining the api status Returns: dict with the following keys: 'archive': the software archive to deposit 'username': username 'password': associated password 'metadata': the metadata file to deposit 'collection': the username's associated client 'slug': the slug or external id identifying the deposit to make 'partial': if the deposit is partial or not 'client': instantiated class 'url': deposit's server main entry point 'deposit_type': deposit's type (binary, multipart, metadata) 'deposit_id': optional deposit identifier """ if archive_deposit and metadata_deposit: # too many flags use, remove redundant ones (-> multipart deposit) archive_deposit = False metadata_deposit = False if not slug: # generate one as this is mandatory slug = generate_slug() if not metadata: if name and authors: metadata = generate_metadata_file(name, slug, authors, temp_dir) elif not archive_deposit and not partial and not deposit_id: # If we meet all the following conditions: # * there is not an archive-only deposit # * it is not part of a multipart deposit 
(either create/update # or finish) # * it misses either name or authors raise InputError( "Either a metadata file (--metadata) or both --author and " "--name must be provided, unless this is an archive-only " "deposit." ) elif name or authors: # If we are generating metadata, then all mandatory metadata # must be present raise InputError( "Either a metadata file (--metadata) or both --author and " "--name must be provided." ) else: # TODO: this is a multipart deposit, we might want to check that # metadata are deposited at some point pass elif name or authors: raise InputError( "Using a metadata file (--metadata) is incompatible with " "--author and --name, which are used to generate one." ) if metadata_deposit: archive = None if archive_deposit: metadata = None if metadata_deposit and not metadata: raise InputError( "Metadata deposit must be provided for metadata " "deposit (either a filepath or --name and --author)" ) if not archive and not metadata and partial: raise InputError( - "Please provide an actionable command. See --help for more " "information" + "Please provide an actionable command. See --help for more information" ) if replace and not deposit_id: raise InputError("To update an existing deposit, you must provide its id") client = _client(url, username, password) if not collection: collection = _collection(client) return { "archive": archive, "username": username, "password": password, "metadata": metadata, "collection": collection, "slug": slug, "in_progress": partial, "client": client, "url": url, "deposit_id": deposit_id, "replace": replace, } def _subdict(d, keys): "return a dict from d with only given keys" return {k: v for k, v in d.items() if k in keys} def deposit_create(config, logger): """Delegate the actual deposit to the deposit client. 
@deposit.command()
@click.option("--username", required=True, help="(Mandatory) User's name")
@click.option(
    "--password", required=True, help="(Mandatory) User's associated password"
)
@click.option(
    "--archive",
    type=click.Path(exists=True),
    help="(Optional) Software archive to deposit",
)
@click.option(
    "--metadata",
    type=click.Path(exists=True),
    help=(
        "(Optional) Path to xml metadata file. If not provided, "
        "this will use a file named .metadata.xml"
    ),
)  # noqa
@click.option(
    "--archive-deposit/--no-archive-deposit",
    default=False,
    help="(Optional) Software archive only deposit",
)
@click.option(
    "--metadata-deposit/--no-metadata-deposit",
    default=False,
    help="(Optional) Metadata only deposit",
)
@click.option(
    "--collection",
    help="(Optional) User's collection. If not provided, this will be fetched.",
)  # noqa
@click.option(
    "--slug",
    help=(
        "(Optional) External system information identifier. "
        "If not provided, it will be generated"
    ),
)  # noqa
@click.option(
    "--partial/--no-partial",
    default=False,
    help=(
        "(Optional) The deposit will be partial, other deposits "
        "will have to take place to finalize it."
    ),
)  # noqa
@click.option(
    "--deposit-id",
    default=None,
    help="(Optional) Update an existing partial deposit with its identifier",
)  # noqa
@click.option(
    "--replace/--no-replace",
    default=False,
    help="(Optional) Update by replacing existing metadata to a deposit",
)  # noqa
@click.option(
    "--url",
    default="https://deposit.softwareheritage.org",
    help=(
        "(Optional) Deposit server api endpoint. By default, "
        "https://deposit.softwareheritage.org/1"
    ),
)  # noqa
@click.option("--verbose/--no-verbose", default=False, help="Verbose mode")
@click.option("--name", help="Software name")
@click.option(
    "--author",
    multiple=True,
    help="Software author(s), this can be repeated as many times"
    " as there are authors",
)
@click.option(
    "-f",
    "--format",
    "output_format",
    default="logging",
    type=click.Choice(["logging", "yaml", "json"]),
    help="Output format results.",
)
@click.pass_context
def upload(
    ctx,
    username,
    password,
    archive=None,
    metadata=None,
    archive_deposit=False,
    metadata_deposit=False,
    collection=None,
    slug=None,
    partial=False,
    deposit_id=None,
    replace=False,
    url="https://deposit.softwareheritage.org",
    verbose=False,
    name=None,
    author=None,
    output_format=None,
):
    """Software Heritage Public Deposit Client

    Create/Update deposit through the command line.

    More documentation can be found at
    https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html.

    """
    # NOTE: the docstring above is the help text click shows to the user, so
    # developer documentation lives in comments.
    #
    # Flow: normalize the url, validate the options through
    # client_command_parse_input, then either update an existing deposit (when
    # a deposit id is known) or create a new one, and print the result in the
    # requested output format. Exits with status 1 when option parsing raises
    # InputError or MaintenanceError.
    import tempfile

    from swh.deposit.client import MaintenanceError

    url = _url(url)
    config = {}

    # The temporary directory receives the metadata file that may be generated
    # from --name/--author, so the deposit itself must happen before the
    # directory is cleaned up, i.e. within the `with` block.
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            logger.debug("Parsing cli options")
            config = client_command_parse_input(
                username,
                password,
                archive,
                metadata,
                archive_deposit,
                metadata_deposit,
                collection,
                slug,
                partial,
                deposit_id,
                replace,
                url,
                name,
                author,
                temp_dir,
            )
        except InputError as e:
            logger.error("Problem during parsing options: %s", e)
            sys.exit(1)
        except MaintenanceError as e:
            logger.error(e)
            sys.exit(1)

        if verbose:
            # The parsed configuration carries the user's credentials: never
            # write the clear-text password to the logs.
            logger.info(
                "Parsed configuration: %s", {**config, "password": "<redacted>"}
            )

        deposit_id = config["deposit_id"]

        if deposit_id:
            r = deposit_update(config, logger)
        else:
            r = deposit_create(config, logger)

        print_result(r, output_format)
def _parse(stream, encoding="utf-8"):
    """Parse an xml stream into a plain dict.

    A top-level "entry" element and then a "sword:error" element, when
    present, are unwrapped (in that order) so that the returned dict exposes
    their content directly.

    Args:
        stream (bytes/text): The xml to parse; bytes are decoded first
        encoding (str): Encoding used to decode a bytes stream, also handed
            over to the xml parser

    Returns:
        A dict of values corresponding to the parsed xml

    """
    text = stream.decode(encoding) if isinstance(stream, bytes) else stream
    parsed = xmltodict.parse(text, encoding=encoding, process_namespaces=False)
    # unwrap the known envelopes one after the other
    for wrapper in ("entry", "sword:error"):
        if wrapper in parsed:
            parsed = parsed[wrapper]
    return dict(parsed)
class BaseApiDepositClient:
    """Deposit client base class

    Holds the client configuration (server url, optional basic
    authentication credentials) and the http layer used to send requests.

    """

    def __init__(self, config=None, _client=requests):
        """Initialize the client.

        Args:
            config (dict/None): configuration with a 'url' entry and an
                optional 'auth' entry (dict with 'username' and 'password'
                keys). When None, the configuration is read from the file
                referenced by the SWH_CONFIG_FILENAME environment variable.
            _client: object exposing the http methods as attributes (the
                requests module by default)

        Raises:
            KeyError when no config is given and SWH_CONFIG_FILENAME is not
            set, or when the configuration has no 'url' entry

        """
        if config is None:
            config_file = os.environ["SWH_CONFIG_FILENAME"]
            self.config: Dict[str, Any] = read_raw_config(config_basepath(config_file))
        else:
            self.config = config

        self._client = _client
        # normalized so that urljoin in `do` appends below the base url
        self.base_url = self.config["url"].strip("/") + "/"
        # 'auth' is optional: a missing, None or empty entry means no
        # authentication
        auth = self.config.get("auth")
        if auth:
            self.auth = (auth["username"], auth["password"])
        else:
            self.auth = None

    def do(self, method, url, *args, **kwargs):
        """Internal method to deal with requests, possibly with basic http
           authentication.

        Args:
            method (str): name of the http method, which must be an attribute
                of the underlying http client (e.g. "get", "post", "put")
            url (str): url relative to the configured base url
            *args, **kwargs: forwarded as is to the http client method

        Raises:
            ValueError when the http client does not support the method

        Returns:
            The request's execution

        """
        if hasattr(self._client, method):
            method_fn = getattr(self._client, method)
        else:
            raise ValueError("Development error, unsupported method %s" % (method))

        if self.auth:
            kwargs["auth"] = self.auth

        # leading slashes are dropped so the url stays below the base url
        full_url = urljoin(self.base_url, url.lstrip("/"))
        return method_fn(full_url, *args, **kwargs)
class BaseDepositClient(BaseApiDepositClient, metaclass=ABCMeta):
    """Base Deposit client to access the public api.

    Subclasses describe one api call through compute_url, compute_method and
    parse_result_ok; `execute` runs the call and normalizes its outcome.

    """

    def __init__(self, config, error_msg=None, empty_result=None):
        """Initialize the client.

        Args:
            config (dict): client configuration
            error_msg (str/None): template with two %s placeholders (url,
                exception) used when the http query itself fails
            empty_result (dict/None): default entries merged into the
                results returned on failure

        """
        super().__init__(config)
        self.error_msg = error_msg
        # private copy: the caller's dict (or a shared default) must never be
        # altered by the results built in `execute`
        self.empty_result = dict(empty_result) if empty_result else {}

    @abstractmethod
    def compute_url(self, *args, **kwargs):
        """Compute api url endpoint to query."""
        pass

    @abstractmethod
    def compute_method(self, *args, **kwargs):
        """Http method to use on the url"""
        pass

    @abstractmethod
    def parse_result_ok(self, xml_content):
        """Given an xml result from the api endpoint, parse it and returns a
           dict.

        """
        pass

    def compute_information(self, *args, **kwargs):
        """Compute some more information given the inputs (e.g http headers,
           ...)

        """
        return {}

    def parse_result_error(self, xml_content):
        """Given an error response in xml, parse it into a dict.

        Returns:
            dict with following keys (value None when absent from the xml):

                'summary': The error message
                'detail': Some more detail about the error if any
                'sword:verboseDescription': verbose description if any

        """
        return _parse_with_filter(
            xml_content, keys=["summary", "detail", "sword:verboseDescription"]
        )

    def do_execute(self, method, url, info):
        """Execute the http query to url using method and info information.

        By default, execute a simple query to url with the http
        method. Override this in daughter class to improve the default
        behavior if needed.

        """
        return self.do(method, url)

    def execute(self, *args, **kwargs) -> Dict[str, Any]:
        """Main endpoint to prepare and execute the http query to the api.

        Raises:
            MaintenanceError if some api maintenance is happening (503
            response carrying both a summary and a verbose description).

        Returns:
            Dict of computed api data: the parsed response on success
            ({"status": 204} when there is no body), otherwise the default
            empty result enriched with an "error" entry (query failure) or
            the parsed error enriched with the http "status".

        """
        url = self.compute_url(*args, **kwargs)
        method = self.compute_method(*args, **kwargs)
        info = self.compute_information(*args, **kwargs)

        try:
            response = self.do_execute(method, url, info)
        except Exception as e:
            # build a fresh dict so that self.empty_result is left untouched
            template = self.error_msg or "Deposit request failure at %s: %s"
            return {**self.empty_result, "error": template % (url, e)}

        if response.ok:
            if int(response.status_code) == 204:  # 204 returns no body
                return {"status": response.status_code}
            return self.parse_result_ok(response.text)

        error = self.parse_result_error(response.text)
        error.update(self.empty_result)
        if response.status_code == 503:
            summary = error.get("summary")
            detail = error.get("sword:verboseDescription")
            # Maintenance error
            if summary and detail:
                raise MaintenanceError(f"{summary}: {detail}")
        error.update({"status": response.status_code})
        return error
class BaseCreateDepositClient(BaseDepositClient):
    """Deposit client base class to post new deposit.

    Subclasses are expected to provide `compute_headers(info)`, which is
    called by `compute_information`.

    """

    def __init__(self, config):
        super().__init__(
            config,
            error_msg="Post Deposit failure at %s: %s",
            empty_result={"deposit_id": None, "deposit_status": None,},
        )

    def compute_url(self, collection, *args, **kwargs):
        """The collection's endpoint."""
        return "/%s/" % collection

    def compute_method(self, *args, **kwargs):
        """Deposits are created through a post."""
        return "post"

    def parse_result_ok(self, xml_content):
        """Given an xml content as string, returns a deposit dict.

        """
        return _parse_with_filter(
            xml_content,
            keys=[
                "deposit_id",
                "deposit_status",
                "deposit_status_detail",
                "deposit_date",
            ],
        )

    def _compute_information(
        self, collection, filepath, in_progress, slug, is_archive=True
    ):
        """Given a filepath, compute necessary information on that file.

        Args:
            collection (str): unused here
            filepath (str): Path to a file
            in_progress (bool): passed through in the result
            slug (str): passed through in the result
            is_archive (bool): is it an archive or not?

        Returns:
            dict with keys:
                'slug': the slug provided
                'in_progress': the in_progress flag provided
                'content-type': content type associated (None when the file
                    is not an archive)
                'md5sum': md5 sum (None when the file is not an archive)
                'filename': filename
                'filepath': the filepath provided

        """
        filename = os.path.basename(filepath)

        if is_archive:
            # hash the archive chunk by chunk: the file handle is closed
            # deterministically and big archives are not loaded in memory
            digest = hashlib.md5()
            with open(filepath, "rb") as f:
                for chunk in iter(lambda: f.read(1024 * 1024), b""):
                    digest.update(chunk)
            md5sum = digest.hexdigest()
            extension = filename.split(".")[-1]
            if "zip" in extension:
                content_type = "application/zip"
            else:
                content_type = "application/x-tar"
        else:
            content_type = None
            md5sum = None

        return {
            "slug": slug,
            "in_progress": in_progress,
            "content-type": content_type,
            "md5sum": md5sum,
            "filename": filename,
            "filepath": filepath,
        }

    def compute_information(
        self, collection, filepath, in_progress, slug, is_archive=True, **kwargs
    ):
        """File information enriched with the 'headers' to send."""
        info = self._compute_information(
            collection, filepath, in_progress, slug, is_archive=is_archive
        )
        info["headers"] = self.compute_headers(info)
        return info

    def do_execute(self, method, url, info):
        """Send the file at info['filepath'] as the body of the query."""
        with open(info["filepath"], "rb") as f:
            return self.do(method, url, data=f, headers=info["headers"])
class CreateMultipartDepositClient(BaseCreateDepositClient):
    """Create a multipart deposit client."""

    def _multipart_info(self, info, info_meta):
        """Build the multipart payload out of the archive and metadata info.

        Args:
            info (dict): archive information (as computed by
                _compute_information)
            info_meta (dict): metadata file information

        Returns:
            tuple (files, headers); `files` holds two open binary file
            objects which are closed by `do_execute` once the query is done.

        """
        archive_file = open(info["filepath"], "rb")
        try:
            metadata_file = open(info_meta["filepath"], "rb")
        except Exception:
            # do not leak the first handle when the second file cannot be
            # opened
            archive_file.close()
            raise

        files = [
            ("file", (info["filename"], archive_file, info["content-type"])),
            ("atom", (info_meta["filename"], metadata_file, "application/atom+xml")),
        ]

        headers = {
            "SLUG": info["slug"],
            "CONTENT_MD5": info["md5sum"],
            "IN-PROGRESS": str(info["in_progress"]),
        }

        return files, headers

    def compute_information(
        self, collection, archive, metadata, in_progress, slug, **kwargs
    ):
        """Compute the multipart 'files' and 'headers' for both files."""
        info = self._compute_information(collection, archive, in_progress, slug)
        info_meta = self._compute_information(
            collection, metadata, in_progress, slug, is_archive=False
        )
        files, headers = self._multipart_info(info, info_meta)
        return {"files": files, "headers": headers}

    def do_execute(self, method, url, info):
        """Send the multipart query, then release the opened files."""
        try:
            return self.do(method, url, files=info["files"], headers=info["headers"])
        finally:
            # the payload is no longer needed once the query returned or
            # failed
            for _, (_, fileobj, _) in info["files"]:
                fileobj.close()
"""Create a new deposit (archive, metadata, both as multipart).""" if archive and not metadata: return CreateArchiveDepositClient(self.config).execute( collection, archive, in_progress, slug ) elif not archive and metadata: return CreateMetadataDepositClient(self.config).execute( collection, metadata, in_progress, slug, is_archive=False ) else: return CreateMultipartDepositClient(self.config).execute( collection, archive, metadata, in_progress, slug ) def deposit_update( self, collection, deposit_id, slug, archive=None, metadata=None, in_progress=False, replace=False, ): """Update (add/replace) existing deposit (archive, metadata, both).""" r = self.deposit_status(collection, deposit_id) if "error" in r: return r status = r["deposit_status"] if status != "partial": return { "error": "You can only act on deposit with status 'partial'", "detail": "The deposit %s has status '%s'" % (deposit_id, status), "deposit_status": status, "deposit_id": deposit_id, } if archive and not metadata: r = UpdateArchiveDepositClient(self.config).execute( collection, archive, in_progress, slug, deposit_id=deposit_id, replace=replace, ) elif not archive and metadata: r = UpdateMetadataDepositClient(self.config).execute( collection, metadata, in_progress, slug, deposit_id=deposit_id, replace=replace, ) else: r = UpdateMultipartDepositClient(self.config).execute( collection, archive, metadata, in_progress, slug, deposit_id=deposit_id, replace=replace, ) if "error" in r: return r return self.deposit_status(collection, deposit_id) diff --git a/swh/deposit/config.py b/swh/deposit/config.py index 410370d0..16221dfd 100644 --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -1,110 +1,99 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more 
def setup_django_for(platform=None, config_file=None):
    """Initialize django for command line tools (e.g.
    swh.deposit.create_user) which need the db access.

    Note:
        Do not import any django related module prior to this function
        call. Otherwise, this will raise an
        django.core.exceptions.ImproperlyConfigured error message.

    Args:
        platform (str): the platform the scheduling is running; when given
            and DJANGO_SETTINGS_MODULE is not set yet, the matching
            swh.deposit.settings module is selected
        config_file (str): Extra configuration file (typically for the
            production platform); exported as SWH_CONFIG_FILENAME unless
            that variable is already set

    Raises:
        ValueError in case of wrong platform inputs.

    """
    if platform is not None:
        if platform not in AUTHORIZED_PLATFORMS:
            raise ValueError("Platform should be one of %s" % AUTHORIZED_PLATFORMS)
        # an already exported settings module wins over the platform
        os.environ.setdefault(
            "DJANGO_SETTINGS_MODULE", "swh.deposit.settings.%s" % platform
        )

    if config_file:
        os.environ.setdefault("SWH_CONFIG_FILENAME", config_file)

    # imported late on purpose, once the environment is ready
    import django

    django.setup()
class ParserError(ValueError):
    """Error raised when the xml metadata input cannot be parsed."""
def make_error_response_from_dict(req, error):
    """Utility function to return an http response with error detail.

    Args:
        req (Request): original request
        error (dict): Error described as dict, typically generated
        from the make_error_dict function. Its 'key' entry must be one of
        the keys of ERRORS. It is left unmodified.

    Raises:
        KeyError when error['key'] is missing or not referenced in ERRORS

    Returns:
        HttpResponse with detailed error.

    """
    error_information = ERRORS[error["key"]]
    # Merge into a new dict: updating `error` in place would leak the
    # 'status', 'iri' and 'tag' entries into the caller's dict.
    context = {**error, **error_information}
    return render(
        req,
        "deposit/error.xml",
        context=context,
        content_type="application/xml",
        status=error_information["status"],
    )
Returns: Dictionary with key 'error' detailing the 'status' and associated 'message' """ error = make_error_dict(key, summary, verbose_description) return make_error_response_from_dict(req, error["error"]) diff --git a/swh/deposit/exception.py b/swh/deposit/exception.py index cdd1f7d0..e0252e00 100644 --- a/swh/deposit/exception.py +++ b/swh/deposit/exception.py @@ -1,38 +1,37 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, Optional +from django.db.utils import OperationalError +from django.http import HttpResponse from rest_framework.exceptions import APIException from rest_framework.views import exception_handler -from django.http import HttpResponse - -from django.db.utils import OperationalError def custom_exception_handler( exc: APIException, context: Dict ) -> Optional[HttpResponse]: """Custom deposit exception handler to ensure consistent xml output """ # drf's default exception handler first, to get the standard error response response = exception_handler(exc, context) if isinstance(exc, OperationalError): status = "Database backend maintenance" detail = "Service temporarily unavailable, try again later." 
data = f""" {status} {detail} """.encode( "utf-8" ) return HttpResponse(data, status=503, content_type="application/xml") return response diff --git a/swh/deposit/loader/checker.py b/swh/deposit/loader/checker.py index bb054529..5e239083 100644 --- a/swh/deposit/loader/checker.py +++ b/swh/deposit/loader/checker.py @@ -1,51 +1,42 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging +import os +from typing import Any, Dict -from typing import Mapping - -from swh.core.config import SWHConfig - +from swh.core import config from swh.deposit.client import PrivateApiDepositClient - logger = logging.getLogger(__name__) -class DepositChecker(SWHConfig): +class DepositChecker: """Deposit checker implementation. Trigger deposit's checks through the private api. """ - CONFIG_BASE_FILENAME = "deposit/checker" - - DEFAULT_CONFIG = { - "deposit": ("dict", {"url": "http://localhost:5006/1/private/", "auth": {},}) - } - - def __init__(self, config=None): - super().__init__() - if config is None: - self.config = self.parse_config_file() - else: - self.config = config + def __init__(self): + config_file = os.environ["SWH_CONFIG_FILENAME"] + self.config: Dict[str, Any] = config.read_raw_config( + config.config_basepath(config_file) + ) self.client = PrivateApiDepositClient(config=self.config["deposit"]) - def check(self, collection: str, deposit_id: str) -> Mapping[str, str]: + def check(self, collection: str, deposit_id: str) -> Dict[str, str]: status = None deposit_check_url = f"/{collection}/{deposit_id}/check/" logger.debug("deposit-check-url: %s", deposit_check_url) try: r = self.client.check(deposit_check_url) logger.debug("Check result: %s", r) status = "eventful" if r == "verified" else "failed" except Exception: logger.exception("Failure during check on '%s'", 
if __name__ == "__main__":
    # Entry point: pick the django settings module and the command line to
    # hand over to django's management utility.
    settings_file = "development"
    # Running the script without any argument must not crash: django then
    # displays its own usage message.
    subcommand = sys.argv[1] if len(sys.argv) > 1 else None
    if subcommand == "runserver":
        # override the default host:port for the 'runserver' task
        conf = config.load_named_config("deposit/server", default_conf=DEFAULT_CONFIG)
        extra_cmd = ["%s:%s" % (conf["host"], conf["port"])]
        cmd = sys.argv + extra_cmd
    elif subcommand == "test":
        # override the default settings file to read in testing mode
        settings_file = "testing"
        cmd = sys.argv
    else:
        # otherwise, do nothing
        cmd = sys.argv

    # an already exported DJANGO_SETTINGS_MODULE takes precedence
    os.environ.setdefault(
        "DJANGO_SETTINGS_MODULE", "swh.deposit.settings.%s" % settings_file
    )

    try:
        from django.core.management import execute_from_command_line
    except ImportError:
        # The above import may fail for some other reason. Ensure that the
        # issue is really that Django is missing to avoid masking other
        # exceptions on Python 2.
        try:
            import django  # noqa
        except ImportError:
            raise ImportError(
                "Couldn't import Django. Are you sure it's installed and "
                "available on your PYTHONPATH environment variable? Did you "
                "forget to activate a virtual environment?"
            )
        raise

    execute_from_command_line(cmd)
snapshot_id_get_from_revision SWH_PROVIDER_URL = "https://www.softwareheritage.org" logger = logging.getLogger(__name__) swh_storage = None def get_storage() -> Optional[Any]: """Instantiate a storage client """ settings = os.environ.get("DJANGO_SETTINGS_MODULE") if settings != "swh.deposit.settings.production": # Bypass for now return None global swh_storage if not swh_storage: config_file = os.environ.get("SWH_CONFIG_FILENAME") if not config_file: raise ValueError( "Production: SWH_CONFIG_FILENAME must be set to the" " configuration file needed!" ) if not os.path.exists(config_file): raise ValueError( "Production: configuration file %s does not exist!" % (config_file,) ) conf = config.load_named_config(config_file) if not conf: raise ValueError( "Production: configuration %s does not exist." % (config_file,) ) storage_config = conf.get("storage") if not storage_config: raise ValueError( "Production: invalid configuration; missing 'storage' config entry." ) swh_storage = get_storage_client(**storage_config) return swh_storage -def get_snapshot(storage, origin: str, revision_id: str) -> Optional[str]: - """Retrieve the snapshot targeting the revision_id for the given origin. - - """ - all_visits = storage.origin_visit_get(origin) - for visit in all_visits: - if not visit["snapshot"]: - continue - detail_snapshot = storage.snapshot_get(visit["snapshot"]) - if not detail_snapshot: - continue - for branch_name, branch in detail_snapshot["branches"].items(): - if branch["target_type"] == "revision": - revision = branch["target"] - if hash_to_hex(revision) == revision_id: - # Found the snapshot - return hash_to_hex(visit["snapshot"]) - return None - - def migrate_deposit_swhid_context_not_null(apps, schema_editor): """Migrate deposit SWHIDs to the new format. Migrate deposit SWHIDs to the new format. Only deposit with status done and swh_id_context not null are concerned. 
""" storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=False ): - obj_dir = parse_persistent_identifier(deposit.swh_id_context) + obj_dir = parse_swhid(deposit.swh_id_context) assert obj_dir.object_type == DIRECTORY - obj_rev = parse_persistent_identifier(deposit.swh_anchor_id) + obj_rev = parse_swhid(deposit.swh_anchor_id) assert obj_rev.object_type == REVISION if set(obj_dir.metadata.keys()) != {"origin"}: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Starting migration dir_id = obj_dir.object_id origin = obj_dir.metadata["origin"] - check_origin = storage.origin_get({"url": origin}) + check_origin = storage.origin_get([origin])[0] if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue rev_id = obj_rev.object_id # Find the snapshot targeting the revision - snp_id = get_snapshot(storage, origin, rev_id) - if not snp_id: + snp_id = snapshot_id_get_from_revision(storage, origin, hash_to_bytes(rev_id)) + if snp_id is None: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # Update - deposit.swh_id_context = persistent_identifier( + deposit.swh_id_context = swhid( DIRECTORY, dir_id, metadata={ "origin": origin, - "visit": persistent_identifier(SNAPSHOT, snp_id), - "anchor": persistent_identifier(REVISION, rev_id), + "visit": swhid(SNAPSHOT, snp_id.hex()), + "anchor": swhid(REVISION, rev_id), "path": "/", }, ) # Ensure only deposit.swh_id_context changed 
logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id == deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != deposit.swh_id_context logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert old_swh_anchor_id == deposit.swh_anchor_id logging.debug( "deposit.swh_anchor_id_context: %s -> %s", old_swh_anchor_id_context, deposit.swh_anchor_id_context, ) assert old_swh_anchor_id_context == deposit.swh_anchor_id_context # Commit deposit.save() def resolve_origin(deposit_id: int, provider_url: str, external_id: str) -> str: """Resolve the origin from provider-url and external-id For some edge case, only the external_id is used as there is some old inconsistency from testing which exists. """ map_edge_case_origin: Dict[Tuple[int, str], str] = { ( 76, "hal-01588782", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782", ( 87, "hal-01588927", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927", (89, "hal-01588935"): "https://hal-preprod.archives-ouvertes.fr/hal-01588935", ( 88, "hal-01588928", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588928", ( 90, "hal-01588942", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942", (143, "hal-01592430"): "https://hal-preprod.archives-ouvertes.fr/hal-01592430", ( 75, "hal-01588781", ): "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781", } origin = map_edge_case_origin.get((deposit_id, external_id)) if origin: return origin # Some simpler origin edge cases (mostly around the initial deposits) map_origin = { ( SWH_PROVIDER_URL, "je-suis-gpl", ): "https://forge.softwareheritage.org/source/jesuisgpl/", ( SWH_PROVIDER_URL, "external-id", ): "https://hal.archives-ouvertes.fr/external-id", } key = (provider_url, external_id) return map_origin.get(key, 
f"{provider_url.rstrip('/')}/{external_id}") def migrate_deposit_swhid_context_null(apps, schema_editor): """Migrate deposit SWHIDs to the new format. Migrate deposit whose swh_id_context is not set (initial deposits not migrated at the time). Only deposit with status done and swh_id_context null are concerned. Note: Those deposits have their swh_id being the SWHPIDs of the revision! So we can align them as well. """ storage = get_storage() if not storage: logging.warning("Nothing to do") return None Deposit = apps.get_model("deposit", "Deposit") for deposit in Deposit.objects.filter( status=DEPOSIT_STATUS_LOAD_SUCCESS, swh_id_context__isnull=True ): - obj_rev = parse_persistent_identifier(deposit.swh_id) + obj_rev = parse_swhid(deposit.swh_id) if obj_rev.object_type == DIRECTORY: # Assuming the migration is already done for that deposit logger.warning( "Deposit id %s: Migration already done, skipping", deposit.id ) continue # Ensuring Migration not done assert obj_rev.object_type == REVISION assert deposit.swh_id is not None assert deposit.swh_id_context is None assert deposit.swh_anchor_id is None assert deposit.swh_anchor_id_context is None rev_id = obj_rev.object_id - revisions = list(storage.revision_get([hash_to_bytes(rev_id)])) - if not revisions: + rev_id_bytes = hash_to_bytes(rev_id) + revision = storage.revision_get([rev_id_bytes])[0] + if not revision: logger.warning("Deposit id %s: Revision %s not found!", deposit.id, rev_id) continue - revision = revisions[0] provider_url = deposit.client.provider_url external_id = deposit.external_id origin = resolve_origin(deposit.id, provider_url, external_id) - check_origin = storage.origin_get({"url": origin}) + check_origin = storage.origin_get([origin])[0] if not check_origin: logger.warning("Deposit id %s: Origin %s not found!", deposit.id, origin) continue dir_id = hash_to_hex(revision["directory"]) # Reference the old values to do some checks later old_swh_id = deposit.swh_id old_swh_id_context = 
deposit.swh_id_context old_swh_anchor_id = deposit.swh_anchor_id old_swh_anchor_id_context = deposit.swh_anchor_id_context # retrieve the snapshot from the archive - snp_id = get_snapshot(storage, origin, rev_id) - if not snp_id: + snp_id = snapshot_id_get_from_revision(storage, origin, rev_id_bytes) + if snp_id is None: logger.warning( "Deposit id %s: Snapshot targeting revision %s not found!", deposit.id, rev_id, ) continue # New SWHIDs ids - deposit.swh_id = persistent_identifier(DIRECTORY, dir_id) - deposit.swh_id_context = persistent_identifier( + deposit.swh_id = swhid(DIRECTORY, dir_id) + deposit.swh_id_context = swhid( DIRECTORY, dir_id, metadata={ "origin": origin, - "visit": persistent_identifier(SNAPSHOT, snp_id), - "anchor": persistent_identifier(REVISION, rev_id), + "visit": swhid(SNAPSHOT, snp_id.hex()), + "anchor": swhid(REVISION, rev_id), "path": "/", }, ) # Realign the remaining deposit SWHIDs fields - deposit.swh_anchor_id = persistent_identifier(REVISION, rev_id) - deposit.swh_anchor_id_context = persistent_identifier( + deposit.swh_anchor_id = swhid(REVISION, rev_id) + deposit.swh_anchor_id_context = swhid( REVISION, rev_id, metadata={"origin": origin,} ) # Ensure only deposit.swh_id_context changed logging.debug("deposit.id: {deposit.id}") logging.debug("deposit.swh_id: %s -> %s", old_swh_id, deposit.swh_id) assert old_swh_id != deposit.swh_id logging.debug( "deposit.swh_id_context: %s -> %s", old_swh_id_context, deposit.swh_id_context, ) assert old_swh_id_context != deposit.swh_id_context assert deposit.swh_id_context is not None logging.debug( "deposit.swh_anchor_id: %s -> %s", old_swh_anchor_id, deposit.swh_anchor_id ) assert deposit.swh_anchor_id == old_swh_id assert deposit.swh_anchor_id is not None logging.debug( "deposit.swh_anchor_id_context: %s -> %s", old_swh_anchor_id_context, deposit.swh_anchor_id_context, ) assert deposit.swh_anchor_id_context is not None deposit.save() class Migration(migrations.Migration): dependencies = [ 
("deposit", "0017_auto_20190925_0906"), ] operations = [ # Migrate and make the operations possibly reversible # https://docs.djangoproject.com/en/3.0/ref/migration-operations/#django.db.migrations.operations.RunPython.noop # noqa migrations.RunPython( migrate_deposit_swhid_context_not_null, reverse_code=migrations.RunPython.noop, ), migrations.RunPython( migrate_deposit_swhid_context_null, reverse_code=migrations.RunPython.noop ), ] diff --git a/swh/deposit/models.py b/swh/deposit/models.py index 04e86e6c..03b015e6 100644 --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -1,240 +1,240 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Generated from: # cd swh_deposit && \ # python3 -m manage inspectdb import datetime -from django.contrib.postgres.fields import JSONField, ArrayField from django.contrib.auth.models import User, UserManager +from django.contrib.postgres.fields import ArrayField, JSONField from django.db import models from django.utils.timezone import now from .config import ( - DEPOSIT_STATUS_VERIFIED, + ARCHIVE_TYPE, DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_PARTIAL, - DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, + DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_REJECTED, - ARCHIVE_TYPE, + DEPOSIT_STATUS_VERIFIED, METADATA_TYPE, ) class Dbversion(models.Model): """Db version """ version = models.IntegerField(primary_key=True) release = models.DateTimeField(default=now, null=True) description = models.TextField(blank=True, null=True) class Meta: db_table = "dbversion" def __str__(self): return str( { "version": self.version, "release": self.release, "description": self.description, } ) """Possible status""" DEPOSIT_STATUS = [ (DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_PARTIAL), ("expired", "expired"), 
(DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED), (DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED), (DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED), ("loading", "loading"), (DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS), (DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE), ] """Possible status and the detailed meaning.""" DEPOSIT_STATUS_DETAIL = { DEPOSIT_STATUS_PARTIAL: "Deposit is partially received. To finalize it, " "In-Progress header should be false", "expired": "Deposit has been there too long and is now " "deemed ready to be garbage collected", DEPOSIT_STATUS_DEPOSITED: "Deposit is ready for additional checks " "(tarball ok, metadata, etc...)", DEPOSIT_STATUS_VERIFIED: "Deposit is fully received, checked, and " "ready for loading", DEPOSIT_STATUS_REJECTED: "Deposit failed the checks", "loading": "Loading is ongoing on swh's side", DEPOSIT_STATUS_LOAD_SUCCESS: "The deposit has been successfully " "loaded into the Software Heritage archive", DEPOSIT_STATUS_LOAD_FAILURE: "The deposit loading into the " "Software Heritage archive failed", } class DepositClient(User): """Deposit client """ collections = ArrayField(models.IntegerField(), null=True) objects = UserManager() # type: ignore # this typing hint is due to a mypy/django-stubs limitation, # see https://github.com/typeddjango/django-stubs/issues/174 provider_url = models.TextField(null=False) domain = models.TextField(null=False) class Meta: db_table = "deposit_client" def __str__(self): return str( { "id": self.id, "collections": self.collections, "username": super().username, "domain": self.domain, "provider_url": self.provider_url, } ) class Deposit(models.Model): """Deposit reception table """ id = models.BigAutoField(primary_key=True) # First deposit reception date reception_date = models.DateTimeField(auto_now_add=True) # Date when the deposit is deemed complete and ready for loading complete_date = models.DateTimeField(null=True) # collection concerned by the deposit 
collection = models.ForeignKey("DepositCollection", models.DO_NOTHING) # Deposit's external identifier external_id = models.TextField() # Deposit client client = models.ForeignKey("DepositClient", models.DO_NOTHING) # SWH's loading result identifier swh_id = models.TextField(blank=True, null=True) swh_id_context = models.TextField(blank=True, null=True) # Deposit's status regarding loading status = models.TextField(choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL) status_detail = JSONField(null=True) # deposit can have one parent parent = models.ForeignKey("self", on_delete=models.PROTECT, null=True) check_task_id = models.TextField( blank=True, null=True, verbose_name="Scheduler's associated checking task id" ) load_task_id = models.TextField( blank=True, null=True, verbose_name="Scheduler's associated loading task id" ) class Meta: db_table = "deposit" def __str__(self): d = { "id": self.id, "reception_date": self.reception_date, "collection": self.collection.name, "external_id": self.external_id, "client": self.client.username, "status": self.status, } if self.status in (DEPOSIT_STATUS_REJECTED): d["status_detail"] = self.status_detail return str(d) @property def origin_url(self): return "%s/%s" % (self.client.provider_url.rstrip("/"), self.external_id) def client_directory_path(instance: "DepositRequest", filename: str) -> str: """Callable to determine the upload archive path. This defaults to MEDIA_ROOT/client_/%Y%m%d-%H%M%S.%f/. The format "%Y%m%d-%H%M%S.%f" is the reception date of the associated deposit formatted using strftime. Args: instance: DepositRequest concerned by the upload filename: Filename of the uploaded file Returns: The upload archive path. 
""" reception_date = instance.deposit.reception_date assert isinstance(reception_date, datetime.datetime) folder = reception_date.strftime("%Y%m%d-%H%M%S.%f") return f"client_{instance.deposit.client.id}/{folder}/{filename}" REQUEST_TYPES = [(ARCHIVE_TYPE, ARCHIVE_TYPE), (METADATA_TYPE, METADATA_TYPE)] class DepositRequest(models.Model): """Deposit request associated to one deposit. """ id = models.BigAutoField(primary_key=True) # Deposit concerned by the request deposit = models.ForeignKey(Deposit, models.DO_NOTHING) date = models.DateTimeField(auto_now_add=True) # Deposit request information on the data to inject # this can be null when type is 'archive' metadata = JSONField(null=True) raw_metadata = models.TextField(null=True) # this can be null when type is 'metadata' archive = models.FileField(null=True, upload_to=client_directory_path) type = models.CharField(max_length=8, choices=REQUEST_TYPES, null=True) class Meta: db_table = "deposit_request" def __str__(self): meta = None if self.metadata: from json import dumps meta = dumps(self.metadata) archive_name = None if self.archive: archive_name = self.archive.name return str( { "id": self.id, "deposit": self.deposit, "metadata": meta, "archive": archive_name, } ) class DepositCollection(models.Model): id = models.BigAutoField(primary_key=True) # Human readable name for the collection type e.g HAL, arXiv, etc... 
name = models.TextField() class Meta: db_table = "deposit_collection" def __str__(self): return str({"id": self.id, "name": self.name}) diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py index 0cb49065..9f52a3af 100644 --- a/swh/deposit/parsers.py +++ b/swh/deposit/parsers.py @@ -1,96 +1,94 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining parsers with SWORD 2.0 supported mediatypes. """ -import xmltodict +from xml.parsers.expat import ExpatError from django.conf import settings -from rest_framework.parsers import BaseParser -from rest_framework.parsers import FileUploadParser -from rest_framework.parsers import MultiPartParser -from xml.parsers.expat import ExpatError +from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser +import xmltodict from swh.deposit.errors import ParserError class SWHFileUploadZipParser(FileUploadParser): """File upload parser limited to zip archive. """ media_type = "application/zip" class SWHFileUploadTarParser(FileUploadParser): """File upload parser limited to tarball (tar, tar.gz, tar.*) archives. """ media_type = "application/x-tar" class SWHXMLParser(BaseParser): """ XML parser. """ media_type = "application/xml" def parse(self, stream, media_type=None, parser_context=None): """ Parses the incoming bytestream as XML and returns the resulting data. 
""" parser_context = parser_context or {} encoding = parser_context.get("encoding", settings.DEFAULT_CHARSET) data = xmltodict.parse(stream, encoding=encoding, process_namespaces=False) if "entry" in data: data = data["entry"] return data class SWHAtomEntryParser(SWHXMLParser): """Atom entry parser limited to specific mediatype """ media_type = "application/atom+xml;type=entry" def parse(self, stream, media_type=None, parser_context=None): # We do not actually want to parse the stream yet # because we want to keep the raw data as well # this is done later in the atom entry call - # (cf. swh.deposit.api.common.SWHBaseDeposit._atom_entry) + # (cf. swh.deposit.api.common.APIBase._atom_entry) return stream class SWHMultiPartParser(MultiPartParser): """Multipart parser limited to a subset of mediatypes. """ media_type = "multipart/*; *" def parse_xml(raw_content): """Parse xml body. Args: raw_content (bytes): The content to parse Raises: ParserError in case of a malformed xml Returns: content parsed as dict. 
""" try: return SWHXMLParser().parse(raw_content) except ExpatError as e: raise ParserError(str(e)) diff --git a/swh/deposit/settings/production.py b/swh/deposit/settings/production.py index 5cc7c8b1..e1e6ac4d 100644 --- a/swh/deposit/settings/production.py +++ b/swh/deposit/settings/production.py @@ -1,110 +1,111 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os +from swh.core import config + from .common import * # noqa from .common import ALLOWED_HOSTS -from swh.core import config ALLOWED_HOSTS += ["deposit.softwareheritage.org"] # Setup support for proxy headers USE_X_FORWARDED_HOST = True SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https") DEBUG = False # Database # https://docs.djangoproject.com/en/1.10/ref/settings/#databases # https://docs.djangoproject.com/en/1.10/ref/settings/#std:setting-DATABASES # https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/#databases # Retrieve the deposit's configuration file # and check the required setup is ok # If not raise an error explaining the errors config_file = os.environ.get("SWH_CONFIG_FILENAME") if not config_file: raise ValueError( "Production: SWH_CONFIG_FILENAME must be set to the" " configuration file needed!" ) if not os.path.exists(config_file): raise ValueError( "Production: configuration file %s does not exist!" % (config_file,) ) conf = config.load_named_config(config_file) if not conf: raise ValueError("Production: configuration %s does not exist." % (config_file,)) for key in ("scheduler", "private"): if not conf.get(key): raise ValueError( "Production: invalid configuration; missing %s config entry." 
% (key,) ) ALLOWED_HOSTS += conf.get("allowed_hosts", []) private_conf = conf["private"] SECRET_KEY = private_conf["secret_key"] # https://docs.djangoproject.com/en/1.10/ref/settings/#logging LOGGING = { "version": 1, "disable_existing_loggers": False, "formatters": { "standard": { "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s", # noqa "datefmt": "%d/%b/%Y %H:%M:%S", }, }, "handlers": { "console": { "level": "INFO", "class": "logging.StreamHandler", "formatter": "standard", }, }, "loggers": { "django": {"handlers": ["console"], "level": "INFO", "propagate": True,}, }, } # database db_conf = private_conf.get("db", {"name": "unset"}) db = { "ENGINE": "django.db.backends.postgresql", "NAME": db_conf["name"], } db_user = db_conf.get("user") if db_user: db["USER"] = db_user db_pass = db_conf.get("password") if db_pass: db["PASSWORD"] = db_pass db_host = db_conf.get("host") if db_host: db["HOST"] = db_host db_port = db_conf.get("port") if db_port: db["PORT"] = db_port # https://docs.djangoproject.com/en/1.10/ref/settings/#databases DATABASES = { "default": db, } # Upload user directory # https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-MEDIA_ROOT MEDIA_ROOT = private_conf.get("media_root") diff --git a/swh/deposit/tests/api/conftest.py b/swh/deposit/tests/api/conftest.py index 1f5f779a..17e29af7 100644 --- a/swh/deposit/tests/api/conftest.py +++ b/swh/deposit/tests/api/conftest.py @@ -1,87 +1,93 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib -import pytest +import os from django.urls import reverse +import pytest +from swh.deposit.api.private.deposit_check import APIChecks from swh.deposit.config import ( - DEPOSIT_STATUS_DEPOSITED, COL_IRI, + 
DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_VERIFIED, ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml -from swh.deposit.api.private.deposit_check import SWHChecksDeposit + +@pytest.fixture +def datadir(request): + """Override default datadir to target main test datadir""" + return os.path.join(os.path.dirname(str(request.fspath)), "../data") @pytest.fixture def ready_deposit_ok(partial_deposit_with_metadata): """Returns a deposit ready for checks (it will pass the checks). """ deposit = partial_deposit_with_metadata deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return deposit @pytest.fixture def ready_deposit_verified(partial_deposit_with_metadata): """Returns a deposit ready for checks (it will pass the checks). """ deposit = partial_deposit_with_metadata deposit.status = DEPOSIT_STATUS_VERIFIED deposit.save() return deposit @pytest.fixture def ready_deposit_only_metadata(partial_deposit_only_metadata): """Deposit in status ready that will fail the checks (because missing archive). 
""" deposit = partial_deposit_only_metadata deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return deposit @pytest.fixture def ready_deposit_invalid_archive(authenticated_client, deposit_collection): url = reverse(COL_IRI, args=[deposit_collection.name]) data = b"some data which is clearly not a zip file" md5sum = hashlib.md5(data).hexdigest() # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=data, # + headers CONTENT_LENGTH=len(data), # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG="external-id-invalid", HTTP_CONTENT_MD5=md5sum, HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) response_content = parse_xml(response.content) deposit_id = int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() return deposit @pytest.fixture def swh_checks_deposit(): - return SWHChecksDeposit() + return APIChecks() diff --git a/swh/deposit/tests/api/test_deposit.py b/swh/deposit/tests/api/test_deposit.py index 2e6cce7b..00c38d91 100644 --- a/swh/deposit/tests/api/test_deposit.py +++ b/swh/deposit/tests/api/test_deposit.py @@ -1,195 +1,194 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib +from io import BytesIO from django.urls import reverse -from io import BytesIO from rest_framework import status from swh.deposit.config import ( COL_IRI, - EDIT_SE_IRI, - DEPOSIT_STATUS_REJECTED, - DEPOSIT_STATUS_PARTIAL, - DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, + DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_PARTIAL, + DEPOSIT_STATUS_REJECTED, + EDIT_SE_IRI, ) - from swh.deposit.models import Deposit from swh.deposit.parsers import 
parse_xml def test_deposit_post_will_fail_with_401(client): """Without authentication, endpoint refuses access with 401 response """ url = reverse(COL_IRI, args=["hal"]) response = client.post(url) assert response.status_code == status.HTTP_401_UNAUTHORIZED def test_access_to_another_user_collection_is_forbidden( authenticated_client, deposit_another_collection, deposit_user ): """Access to another user collection should return a 403 """ coll2 = deposit_another_collection url = reverse(COL_IRI, args=[coll2.name]) response = authenticated_client.post(url) assert response.status_code == status.HTTP_403_FORBIDDEN msg = "Client %s cannot access collection %s" % (deposit_user.username, coll2.name,) assert msg in response.content.decode("utf-8") def test_delete_on_col_iri_not_supported(authenticated_client, deposit_collection): """Delete on col iri should return a 405 response """ url = reverse(COL_IRI, args=[deposit_collection.name]) response = authenticated_client.delete(url) assert response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED assert "DELETE method is not supported on this endpoint" in response.content.decode( "utf-8" ) def create_deposit_with_rejection_status(authenticated_client, deposit_collection): url = reverse(COL_IRI, args=[deposit_collection.name]) data = b"some data which is clearly not a zip file" md5sum = hashlib.md5(data).hexdigest() external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=data, # + headers CONTENT_LENGTH=len(data), # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=md5sum, HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) actual_state = response_content["deposit_status"] assert actual_state == DEPOSIT_STATUS_REJECTED def 
test_act_on_deposit_rejected_is_not_permitted( authenticated_client, deposit_collection, rejected_deposit, atom_dataset ): deposit = rejected_deposit response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[deposit.collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_400_BAD_REQUEST msg = "You can only act on deposit with status '%s'" % ( DEPOSIT_STATUS_PARTIAL, ) assert msg in response.content.decode("utf-8") def test_add_deposit_when_partial_makes_new_deposit( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Posting deposit on collection when previous is partial makes new deposit """ deposit = partial_deposit assert deposit.status == DEPOSIT_STATUS_PARTIAL # adding a new deposit with the same external id response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"] % deposit.external_id, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] assert deposit_id != deposit.id # new deposit new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit != deposit assert new_deposit.parent is None def test_add_deposit_when_failed_makes_new_deposit_with_no_parent( authenticated_client, deposit_collection, failed_deposit, atom_dataset ): """Posting deposit on collection when deposit done makes new deposit with parent """ deposit = failed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", 
data=atom_dataset["entry-data0"] % deposit.external_id, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert new_deposit != deposit assert new_deposit.parent is None def test_add_deposit_when_done_makes_new_deposit_with_parent_old_one( authenticated_client, deposit_collection, completed_deposit, atom_dataset ): """Posting deposit on collection when deposit done makes new deposit with parent """ # given multiple deposit already loaded deposit = completed_deposit assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS # adding a new deposit with the same external id as a completed deposit # creates the parenting chain response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"] % deposit.external_id, HTTP_SLUG=deposit.external_id, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] assert deposit_id != deposit.id new_deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == new_deposit.collection assert deposit.external_id == new_deposit.external_id assert new_deposit != deposit assert new_deposit.parent == deposit diff --git a/swh/deposit/tests/api/test_deposit_atom.py b/swh/deposit/tests/api/test_deposit_atom.py index 2de803d7..f551a317 100644 --- a/swh/deposit/tests/api/test_deposit_atom.py +++ b/swh/deposit/tests/api/test_deposit_atom.py @@ -1,326 +1,326 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import pytest +from 
io import BytesIO from django.urls import reverse -from io import BytesIO +import pytest from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED -from swh.deposit.models import Deposit, DepositRequest, DepositCollection +from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml def test_post_deposit_atom_201_even_with_decimal( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ atom_error_with_decimal = atom_dataset["error-with-decimal"] response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_error_with_decimal, HTTP_SLUG="external-id", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) dr = DepositRequest.objects.get(deposit=deposit) assert dr.metadata is not None sw_version = dr.metadata.get("codemeta:softwareVersion") assert sw_version == "10.4" def test_post_deposit_atom_400_with_empty_body( authenticated_client, deposit_collection, atom_dataset ): """Posting empty body request should return a 400 response """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-empty-body"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_400_badly_formatted_atom( authenticated_client, deposit_collection, atom_dataset ): """Posting a badly formatted atom should return a 400 response """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-badly-formatted"], ) assert 
response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_parsing_error( authenticated_client, deposit_collection, atom_dataset ): """Posting parsing error prone atom should return 400 """ response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-parsing-error-prone"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_no_slug_header( authenticated_client, deposit_collection, atom_dataset ): """Posting an atom entry without a slug header should return a 400 """ url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"], # + headers HTTP_IN_PROGRESS="false", ) assert b"Missing SLUG header" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_atom_unknown_collection(authenticated_client, atom_dataset): """Posting an atom entry to an unknown collection should return a 404 """ unknown_collection = "unknown-one" with pytest.raises(DepositCollection.DoesNotExist): DepositCollection.objects.get(name=unknown_collection) response = authenticated_client.post( reverse(COL_IRI, args=[unknown_collection]), # <- unknown collection content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"], HTTP_SLUG="something", ) assert response.status_code == status.HTTP_404_NOT_FOUND def test_post_deposit_atom_entry_initial( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ # given external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["entry-data0"] % external_id # when response = authenticated_client.post( reverse(COL_IRI, 
args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_with_codemeta( authenticated_client, deposit_collection, atom_dataset ): """Posting an initial atom entry should return 201 with deposit receipt """ # given external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["codemeta-sample"] % external_id # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_tei( 
authenticated_client, deposit_collection, atom_dataset ): """Posting initial atom entry as TEI should return 201 with receipt """ # given external_id = "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) atom_entry_data = atom_dataset["tei-sample"] # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_SLUG=external_id, HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False def test_post_deposit_atom_entry_multiple_steps( authenticated_client, deposit_collection, atom_dataset ): """After initial deposit, updating a deposit should return a 201 """ # given external_id = "urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a" with pytest.raises(Deposit.DoesNotExist): deposit = Deposit.objects.get(external_id=external_id) # when response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_IN_PROGRESS="True", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == 
external_id assert deposit.status == "partial" # one associated request to a deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 1 atom_entry_data = atom_dataset["entry-data-minimal"] % external_id.encode( "utf-8" ) # noqa update_uri = response._headers["location"][1] # when updating the first deposit post response = authenticated_client.post( update_uri, content_type="application/atom+xml;type=entry", data=atom_entry_data, HTTP_IN_PROGRESS="False", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection assert deposit.external_id == external_id assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert len(Deposit.objects.all()) == 1 # now 2 associated requests to a same deposit deposit_requests = DepositRequest.objects.filter(deposit=deposit).order_by("id") assert len(deposit_requests) == 2 atom_entry_data1 = atom_dataset["entry-data1"] expected_meta = [ {"metadata": parse_xml(atom_entry_data1), "raw_metadata": atom_entry_data1}, {"metadata": parse_xml(atom_entry_data), "raw_metadata": atom_entry_data}, ] for i, deposit_request in enumerate(deposit_requests): actual_metadata = deposit_request.metadata assert actual_metadata == expected_meta[i]["metadata"] assert deposit_request.raw_metadata == expected_meta[i]["raw_metadata"] assert bool(deposit_request.archive) is False diff --git a/swh/deposit/tests/api/test_deposit_binary.py b/swh/deposit/tests/api/test_deposit_binary.py index eced5d17..00b4179b 100644 --- a/swh/deposit/tests/api/test_deposit_binary.py +++ b/swh/deposit/tests/api/test_deposit_binary.py @@ -1,567 +1,562 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any 
later version # See top-level LICENSE file for more information -import pytest +from io import BytesIO from django.core.files.uploadedfile import InMemoryUploadedFile from django.urls import reverse -from io import BytesIO - +import pytest from rest_framework import status -from swh.deposit.config import ( - COL_IRI, - EM_IRI, - DEPOSIT_STATUS_DEPOSITED, -) +from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED, EM_IRI from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml -from swh.deposit.tests.common import create_arborescence_archive, check_archive +from swh.deposit.tests.common import check_archive, create_arborescence_archive def test_post_deposit_binary_no_slug( authenticated_client, deposit_collection, sample_archive ): """Posting a binary deposit without slug header should return 400 """ url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert b"Missing SLUG header" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_binary_support( authenticated_client, deposit_collection, sample_archive ): """Binary upload with content-type not in [zip,x-tar] should return 415 """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/octet-stream", data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", 
HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_ok( authenticated_client, deposit_collection, sample_archive ): """Binary upload with correct headers should return 201 with receipt """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"],), ) # then response_content = parse_xml(BytesIO(response.content)) assert response.status_code == status.HTTP_201_CREATED deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_request = DepositRequest.objects.get(deposit=deposit) check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.metadata is None assert deposit_request.raw_metadata is None response_content = parse_xml(BytesIO(response.content)) assert response_content["deposit_archive"] == sample_archive["name"] assert int(response_content["deposit_id"]) == deposit.id assert response_content["deposit_status"] == deposit.status edit_se_iri = reverse("edit_se_iri", args=[deposit_collection.name, deposit.id]) assert response._headers["location"] == ( "Location", "http://testserver" + edit_se_iri, ) def 
test_post_deposit_binary_failure_unsupported_packaging_header( authenticated_client, deposit_collection, sample_archive ): """Bin deposit without supported content_disposition header returns 400 """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id" # when response = authenticated_client.post( url, content_type="application/zip", data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="something-unsupported", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_no_content_disposition_header( authenticated_client, deposit_collection, sample_archive ): """Binary upload without content_disposition header should return 400 """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id" # when response = authenticated_client.post( url, content_type="application/zip", data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_mediation_not_supported( authenticated_client, deposit_collection, sample_archive ): """Binary upload with mediation should return a 412 response """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], 
HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_ON_BEHALF_OF="someone", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_412_PRECONDITION_FAILED with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_binary_upload_fail_if_upload_size_limit_exceeded( authenticated_client, deposit_collection, sample_archive, tmp_path ): """Binary upload must not exceed the limit set up... """ tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) archive = create_arborescence_archive( tmp_path, "archive2", "file2", b"some content in file", up_to_size=500 ) external_id = "some-external-id" # when response = authenticated_client.post( url, content_type="application/zip", data=archive["data"], # + headers CONTENT_LENGTH=archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE assert b"Upload size limit exceeded" in response.content with pytest.raises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) def test_post_deposit_2_post_2_different_deposits( authenticated_client, deposit_collection, sample_archive ): """2 posting deposits should return 2 different 201 with receipt """ url = reverse(COL_IRI, args=[deposit_collection.name]) # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG="some-external-id-1", HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", 
HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) deposits = Deposit.objects.all() assert len(deposits) == 1 assert deposits[0] == deposit # second post response = authenticated_client.post( url, content_type="application/x-tar", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG="another-external-id", HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename1", ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id2 = response_content["deposit_id"] deposit2 = Deposit.objects.get(pk=deposit_id2) assert deposit != deposit2 deposits = Deposit.objects.all().order_by("id") assert len(deposits) == 2 assert list(deposits), [deposit == deposit2] def test_post_deposit_binary_and_post_to_add_another_archive( authenticated_client, deposit_collection, sample_archive, tmp_path ): """Updating a deposit should return a 201 with receipt """ tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="true", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"],), ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = 
Deposit.objects.get(pk=deposit_id) assert deposit.status == "partial" assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.deposit == deposit assert deposit_request.type == "archive" check_archive(sample_archive["name"], deposit_request.archive.name) # 2nd archive to upload archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) # uri to update the content update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id]) # adding another archive for the deposit and finalizing it response = authenticated_client.post( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"]), ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = list( DepositRequest.objects.filter(deposit=deposit).order_by("id") ) # 2 deposit requests for the same deposit assert len(deposit_requests) == 2 assert deposit_requests[0].deposit == deposit assert deposit_requests[0].type == "archive" check_archive(sample_archive["name"], deposit_requests[0].archive.name) assert deposit_requests[1].deposit == deposit assert deposit_requests[1].type == "archive" check_archive(archive2["name"], deposit_requests[1].archive.name) # only 1 deposit in db deposits = Deposit.objects.all() assert len(deposits) == 1 def test_post_deposit_then_update_refused( 
authenticated_client, deposit_collection, sample_archive, atom_dataset, tmp_path ): """Updating a deposit with status 'ready' should return a 400 """ tmp_path = str(tmp_path) url = reverse(COL_IRI, args=[deposit_collection.name]) external_id = "some-external-id-1" # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_request = DepositRequest.objects.get(deposit=deposit) assert deposit_request.deposit == deposit check_archive("filename0", deposit_request.archive.name) # updating/adding is forbidden # uri to update the content edit_se_iri = reverse("edit_se_iri", args=[deposit_collection.name, deposit_id]) em_iri = reverse("em_iri", args=[deposit_collection.name, deposit_id]) # Testing all update/add endpoint should fail # since the status is ready archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some content in file 2" ) # replacing file is no longer possible since the deposit's # status is ready r = authenticated_client.put( em_iri, content_type="application/zip", data=archive2["data"], CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert 
r.status_code == status.HTTP_400_BAD_REQUEST # adding file is no longer possible since the deposit's status # is ready r = authenticated_client.post( em_iri, content_type="application/zip", data=archive2["data"], CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=filename0", ) assert r.status_code == status.HTTP_400_BAD_REQUEST # replacing metadata is no longer possible since the deposit's # status is ready r = authenticated_client.put( edit_se_iri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-deposit-binary"], CONTENT_LENGTH=len(atom_dataset["entry-data-deposit-binary"]), HTTP_SLUG=external_id, ) assert r.status_code == status.HTTP_400_BAD_REQUEST # adding new metadata is no longer possible since the # deposit's status is ready r = authenticated_client.post( edit_se_iri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-deposit-binary"], CONTENT_LENGTH=len(atom_dataset["entry-data-deposit-binary"]), HTTP_SLUG=external_id, ) assert r.status_code == status.HTTP_400_BAD_REQUEST archive_content = b"some content representing archive" archive = InMemoryUploadedFile( BytesIO(archive_content), field_name="archive0", name="archive0", content_type="application/zip", size=len(archive_content), charset=None, ) atom_entry = InMemoryUploadedFile( BytesIO(atom_dataset["entry-data-deposit-binary"].encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(atom_dataset["entry-data-deposit-binary"]), charset="utf-8", ) # replacing multipart metadata is no longer possible since the # deposit's status is ready r = authenticated_client.put( edit_se_iri, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, ) assert r.status_code == status.HTTP_400_BAD_REQUEST # adding new 
metadata is no longer possible since the # deposit's status is ready r = authenticated_client.post( edit_se_iri, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, ) assert r.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/api/test_deposit_delete.py b/swh/deposit/tests/api/test_deposit_delete.py index 496af061..76959c24 100644 --- a/swh/deposit/tests/api/test_deposit_delete.py +++ b/swh/deposit/tests/api/test_deposit_delete.py @@ -1,123 +1,123 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict +from typing import Dict, Mapping + from django.urls import reverse from rest_framework import status -from typing import Dict, Mapping from swh.deposit.config import ( + ARCHIVE_KEY, + DEPOSIT_STATUS_DEPOSITED, EDIT_SE_IRI, EM_IRI, - ARCHIVE_KEY, METADATA_KEY, - DEPOSIT_STATUS_DEPOSITED, ) - from swh.deposit.models import Deposit, DepositRequest def count_deposit_request_types(deposit_requests) -> Mapping[str, int]: deposit_request_types = defaultdict(int) # type: Dict[str, int] for dr in deposit_requests: deposit_request_types[dr.type] += 1 return deposit_request_types def test_delete_archive_on_partial_deposit_works( authenticated_client, partial_deposit_with_metadata, deposit_collection ): """Removing partial deposit's archive should return a 204 response """ deposit_id = partial_deposit_with_metadata.id deposit = Deposit.objects.get(pk=deposit_id) deposit_requests = DepositRequest.objects.filter(deposit=deposit) # deposit request type: 'archive', 1 'metadata' deposit_request_types = count_deposit_request_types(deposit_requests) assert deposit_request_types == {ARCHIVE_KEY: 1, METADATA_KEY: 1} # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id]) response = 
authenticated_client.delete(update_uri) # then assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit_id) deposit_requests2 = DepositRequest.objects.filter(deposit=deposit) deposit_request_types = count_deposit_request_types(deposit_requests2) assert deposit_request_types == {METADATA_KEY: 1} def test_delete_archive_on_undefined_deposit_fails( authenticated_client, deposit_collection, sample_archive ): """Delete undefined deposit returns a 404 response """ # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, 999]) response = authenticated_client.delete(update_uri) # then assert response.status_code == status.HTTP_404_NOT_FOUND def test_delete_non_partial_deposit( authenticated_client, deposit_collection, deposited_deposit ): """Delete !partial status deposit should return a 400 response """ deposit = deposited_deposit assert deposit.status == DEPOSIT_STATUS_DEPOSITED # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(update_uri) # then assert response.status_code == status.HTTP_400_BAD_REQUEST deposit = Deposit.objects.get(pk=deposit.id) assert deposit is not None def test_delete_partial_deposit( authenticated_client, deposit_collection, partial_deposit ): """Delete deposit should return a 204 response """ # given deposit = partial_deposit # when url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(url) # then assert response.status_code == status.HTTP_204_NO_CONTENT deposit_requests = list(DepositRequest.objects.filter(deposit=deposit)) assert deposit_requests == [] deposits = list(Deposit.objects.filter(pk=deposit.id)) assert deposits == [] def test_delete_on_edit_se_iri_cannot_delete_non_partial_deposit( authenticated_client, deposit_collection, complete_deposit ): """Delete !partial deposit should return a 400 response """ # given deposit = complete_deposit # when url = 
reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.delete(url) # then assert response.status_code == status.HTTP_400_BAD_REQUEST deposit = Deposit.objects.get(pk=deposit.id) assert deposit is not None diff --git a/swh/deposit/tests/api/test_deposit_list.py b/swh/deposit/tests/api/test_deposit_list.py index e36d04ef..3de52950 100644 --- a/swh/deposit/tests/api/test_deposit_list.py +++ b/swh/deposit/tests/api/test_deposit_list.py @@ -1,100 +1,100 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse from rest_framework import status from swh.deposit.api.converters import convert_status_detail from swh.deposit.config import ( + DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, PRIVATE_LIST_DEPOSITS, - DEPOSIT_STATUS_DEPOSITED, ) STATUS_DETAIL = { "url": { "summary": "At least one compatible url field. 
Failed", "fields": ["testurl"], }, "metadata": [{"summary": "Mandatory fields missing", "fields": ["9", 10, 1.212],},], "archive": [ {"summary": "Invalid archive", "fields": ["3"],}, {"summary": "Unsupported archive", "fields": [2],}, ], } def test_deposit_list(partial_deposit, deposited_deposit, authenticated_client): """Deposit list api should return all deposits in a paginated way """ partial_deposit.status_detail = STATUS_DETAIL partial_deposit.save() deposit_id = partial_deposit.id deposit_id2 = deposited_deposit.id # NOTE: does not work as documented # https://docs.djangoproject.com/en/1.11/ref/urlresolvers/#django.core.urlresolvers.reverse # noqa # url = reverse(PRIVATE_LIST_DEPOSITS, kwargs={'page_size': 1}) main_url = reverse(PRIVATE_LIST_DEPOSITS) url = "%s?page_size=1" % main_url response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["count"] == 2 # 2 deposits expected_next = f"{main_url}?page=2&page_size=1" assert data["next"].endswith(expected_next) is True assert data["previous"] is None assert len(data["results"]) == 1 # page of size 1 deposit = data["results"][0] assert deposit["id"] == deposit_id assert deposit["status"] == DEPOSIT_STATUS_PARTIAL expected_status_detail = convert_status_detail(STATUS_DETAIL) assert deposit["status_detail"] == expected_status_detail # then 2nd page response2 = authenticated_client.get(expected_next) assert response2.status_code == status.HTTP_200_OK data2 = response2.json() assert data2["count"] == 2 # still 2 deposits assert data2["next"] is None expected_previous = f"{main_url}?page_size=1" assert data2["previous"].endswith(expected_previous) is True assert len(data2["results"]) == 1 # page of size 1 deposit2 = data2["results"][0] assert deposit2["id"] == deposit_id2 assert deposit2["status"] == DEPOSIT_STATUS_DEPOSITED def test_deposit_list_exclude(partial_deposit, deposited_deposit, authenticated_client): """Exclusion pattern on external_id 
should be respected """ partial_deposit.status_detail = STATUS_DETAIL partial_deposit.save() main_url = reverse(PRIVATE_LIST_DEPOSITS) # Testing exclusion pattern exclude_pattern = "external-id" assert partial_deposit.external_id.startswith(exclude_pattern) assert deposited_deposit.external_id.startswith(exclude_pattern) url = f"{main_url}?page_size=1&exclude=external-id" response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["count"] == 0 url = "%s?page_size=1&exclude=dummy" % main_url # that won't exclude anything response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["count"] == 2 diff --git a/swh/deposit/tests/api/test_deposit_multipart.py b/swh/deposit/tests/api/test_deposit_multipart.py index bb4f42d7..c9a4a871 100644 --- a/swh/deposit/tests/api/test_deposit_multipart.py +++ b/swh/deposit/tests/api/test_deposit_multipart.py @@ -1,400 +1,401 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from io import BytesIO + from django.core.files.uploadedfile import InMemoryUploadedFile from django.urls import reverse -from io import BytesIO from rest_framework import status from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import check_archive def test_post_deposit_multipart_without_slug_header_is_bad_request( authenticated_client, deposit_collection, atom_dataset ): # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive_content = b"some content representing archive" archive = InMemoryUploadedFile( BytesIO(archive_content), field_name="archive0", name="archive0", 
content_type="application/zip", size=len(archive_content), charset=None, ) data_atom_entry = atom_dataset["entry-data-deposit-binary"] atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers HTTP_IN_PROGRESS="false", ) assert b"Missing SLUG header" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST def test_post_deposit_multipart_zip( authenticated_client, deposit_collection, atom_dataset, sample_archive ): """one multipart deposit (zip+xml) should be accepted """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive = InMemoryUploadedFile( BytesIO(sample_archive["data"]), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/zip", size=sample_archive["length"], charset=None, ) data_atom_entry = atom_dataset["entry-data-deposit-binary"] atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) external_id = "external-id" # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert 
len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.metadata is None assert deposit_request.raw_metadata is None else: assert ( deposit_request.metadata["id"] == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry def test_post_deposit_multipart_tar( authenticated_client, deposit_collection, atom_dataset, sample_archive ): """one multipart deposit (tar+xml) should be accepted """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) # from django.core.files import uploadedfile data_atom_entry = atom_dataset["entry-data-deposit-binary"] archive = InMemoryUploadedFile( BytesIO(sample_archive["data"]), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/x-tar", size=sample_archive["length"], charset=None, ) atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) external_id = "external-id" # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if 
deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) assert deposit_request.metadata is None assert deposit_request.raw_metadata is None else: assert ( deposit_request.metadata["id"] == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry def test_post_deposit_multipart_put_to_replace_metadata( authenticated_client, deposit_collection, atom_dataset, sample_archive ): """One multipart deposit followed by a metadata update should be accepted """ # given url = reverse(COL_IRI, args=[deposit_collection.name]) data_atom_entry = atom_dataset["entry-data-deposit-binary"] archive = InMemoryUploadedFile( BytesIO(sample_archive["data"]), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/zip", size=sample_archive["length"], charset=None, ) atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry), charset="utf-8", ) external_id = "external-id" # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers HTTP_IN_PROGRESS="true", HTTP_SLUG=external_id, ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = response_content["deposit_id"] deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == "partial" assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) else: assert ( 
deposit_request.metadata["id"] == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry replace_metadata_uri = response._headers["location"][1] response = authenticated_client.put( replace_metadata_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data-deposit-binary"], HTTP_IN_PROGRESS="false", ) assert response.status_code == status.HTTP_204_NO_CONTENT # deposit_id did not change deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED assert deposit.external_id == external_id assert deposit.collection == deposit_collection assert deposit.swh_id is None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert len(deposit_requests) == 2 for deposit_request in deposit_requests: assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) else: assert ( deposit_request.metadata["id"] == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert ( deposit_request.raw_metadata == atom_dataset["entry-data-deposit-binary"] ) # FAILURE scenarios def test_post_deposit_multipart_only_archive_and_atom_entry( authenticated_client, deposit_collection ): """Multipart deposit only accepts one archive and one atom+xml""" # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive_content = b"some content representing archive" archive = InMemoryUploadedFile( BytesIO(archive_content), field_name="archive0", name="archive0", content_type="application/x-tar", size=len(archive_content), charset=None, ) other_archive_content = b"some-other-content" other_archive = InMemoryUploadedFile( BytesIO(other_archive_content), field_name="atom0", name="atom0", content_type="application/x-tar", size=len(other_archive_content), charset="utf-8", ) # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": other_archive,}, 
# + headers HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE assert ( "Only 1 application/zip (or application/x-tar) archive" in response.content.decode("utf-8") ) # when archive.seek(0) response = authenticated_client.post( url, format="multipart", data={"archive": archive,}, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) # then assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE assert ( "You must provide both 1 application/zip (or " "application/x-tar) and 1 atom+xml entry for " "multipart deposit" in response.content.decode("utf-8") ) is True def test_post_deposit_multipart_400_when_badly_formatted_xml( authenticated_client, deposit_collection, sample_archive, atom_dataset ): # given url = reverse(COL_IRI, args=[deposit_collection.name]) archive_content = sample_archive["data"] archive = InMemoryUploadedFile( BytesIO(archive_content), field_name=sample_archive["name"], name=sample_archive["name"], content_type="application/zip", size=len(archive_content), charset=None, ) data_atom_entry_ko = atom_dataset["entry-data-ko"] atom_entry = InMemoryUploadedFile( BytesIO(data_atom_entry_ko.encode("utf-8")), field_name="atom0", name="atom0", content_type='application/atom+xml; charset="utf-8"', size=len(data_atom_entry_ko), charset="utf-8", ) # when response = authenticated_client.post( url, format="multipart", data={"archive": archive, "atom_entry": atom_entry,}, # + headers HTTP_IN_PROGRESS="false", HTTP_SLUG="external-id", ) assert b"Malformed xml metadata" in response.content assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py index 8982f232..c882f817 100644 --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ b/swh/deposit/tests/api/test_deposit_private_check.py @@ -1,283 +1,282 @@ # Copyright (C) 2017-2019 The 
Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse import pytest from rest_framework import status -from swh.deposit.config import ( - DEPOSIT_STATUS_VERIFIED, - PRIVATE_CHECK_DEPOSIT, - DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_REJECTED, - COL_IRI, -) from swh.deposit.api.private.deposit_check import ( - MANDATORY_ARCHIVE_INVALID, - MANDATORY_FIELDS_MISSING, - MANDATORY_ARCHIVE_UNSUPPORTED, ALTERNATE_FIELDS_MISSING, + MANDATORY_ARCHIVE_INVALID, MANDATORY_ARCHIVE_MISSING, + MANDATORY_ARCHIVE_UNSUPPORTED, + MANDATORY_FIELDS_MISSING, +) +from swh.deposit.config import ( + COL_IRI, + DEPOSIT_STATUS_DEPOSITED, + DEPOSIT_STATUS_REJECTED, + DEPOSIT_STATUS_VERIFIED, + PRIVATE_CHECK_DEPOSIT, ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, create_archive_with_archive, ) - PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc" def private_check_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_CHECK_DEPOSIT, args=[collection.name, deposit.id]), reverse(PRIVATE_CHECK_DEPOSIT_NC, args=[deposit.id]), ] @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_ok( authenticated_client, deposit_collection, ready_deposit_ok, extension ): """Proper deposit should succeed the checks (-> status ready) """ deposit = ready_deposit_ok for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_VERIFIED deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_VERIFIED 
deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() @pytest.mark.parametrize("extension", ["zip", "tar", "tar.gz", "tar.bz2", "tar.xz"]) def test_deposit_invalid_tarball( tmp_path, authenticated_client, deposit_collection, extension ): """Deposit with tarball (of 1 tarball) should fail the checks: rejected """ deposit = create_deposit_archive_with_archive( tmp_path, extension, authenticated_client, deposit_collection.name ) for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_INVALID deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED def test_deposit_ko_missing_tarball( authenticated_client, deposit_collection, ready_deposit_only_metadata ): """Deposit without archive should fail the checks: rejected """ deposit = ready_deposit_only_metadata assert deposit.status == DEPOSIT_STATUS_DEPOSITED for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_MISSING deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_deposit_ko_unsupported_tarball( tmp_path, authenticated_client, deposit_collection, ready_deposit_invalid_archive ): """Deposit with an unsupported tarball should fail the checks: rejected """ deposit = ready_deposit_invalid_archive assert DEPOSIT_STATUS_DEPOSITED == 
deposit.status for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_REJECTED details = data["details"] # archive checks failure assert len(details["archive"]) == 1 assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED # metadata check failure assert len(details["metadata"]) == 2 mandatory = details["metadata"][0] assert mandatory["summary"] == MANDATORY_FIELDS_MISSING assert set(mandatory["fields"]) == set(["author"]) alternate = details["metadata"][1] assert alternate["summary"] == ALTERNATE_FIELDS_MISSING assert alternate["fields"] == ["name or title"] deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_REJECTED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_check_deposit_metadata_ok( authenticated_client, deposit_collection, ready_deposit_ok ): """Proper deposit should succeed the checks (-> status ready) with all **MUST** metadata using the codemeta metadata test set """ deposit = ready_deposit_ok assert deposit.status == DEPOSIT_STATUS_DEPOSITED for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK data = response.json() assert data["status"] == DEPOSIT_STATUS_VERIFIED deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_VERIFIED deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() def test_check_metadata_ok(swh_checks_deposit): actual_check, detail = swh_checks_deposit._check_metadata( { "url": "something", "external_identifier": "something-else", "name": "foo", "author": "someone", } ) assert actual_check is True assert detail is None def test_check_metadata_ok2(swh_checks_deposit): actual_check, detail = swh_checks_deposit._check_metadata( { "url": "something", 
"external_identifier": "something-else", "title": "bar", "author": "someone", } ) assert actual_check is True assert detail is None def test_check_metadata_ko(swh_checks_deposit): """Missing optional field should be caught """ actual_check, error_detail = swh_checks_deposit._check_metadata( { "url": "something", "external_identifier": "something-else", "author": "someone", } ) expected_error = { "metadata": [ { "summary": "Mandatory alternate fields are missing", "fields": ["name or title"], } ] } assert actual_check is False assert error_detail == expected_error def test_check_metadata_ko2(swh_checks_deposit): """Missing mandatory fields should be caught """ actual_check, error_detail = swh_checks_deposit._check_metadata( { "url": "something", "external_identifier": "something-else", "title": "foobar", } ) expected_error = { "metadata": [{"summary": "Mandatory fields are missing", "fields": ["author"],}] } assert actual_check is False assert error_detail == expected_error def create_deposit_archive_with_archive( root_path, archive_extension, client, collection_name ): # we create the holding archive to a given extension archive = create_arborescence_archive( root_path, "archive1", "file1", b"some content in file", extension=archive_extension, ) # now we create an archive holding the first created archive invalid_archive = create_archive_with_archive(root_path, "invalid.tgz", archive) # we deposit it response = client.post( reverse(COL_IRI, args=[collection_name]), content_type="application/x-tar", data=invalid_archive["data"], CONTENT_LENGTH=invalid_archive["length"], HTTP_MD5SUM=invalid_archive["md5sum"], HTTP_SLUG="external-id", HTTP_IN_PROGRESS=False, HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (invalid_archive["name"],), ) # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_status = response_content["deposit_status"] assert deposit_status == DEPOSIT_STATUS_DEPOSITED deposit_id = 
int(response_content["deposit_id"]) deposit = Deposit.objects.get(pk=deposit_id) assert DEPOSIT_STATUS_DEPOSITED == deposit.status return deposit diff --git a/swh/deposit/tests/api/test_deposit_private_read_archive.py b/swh/deposit/tests/api/test_deposit_private_read_archive.py index 1724a2a9..6c265130 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_archive.py +++ b/swh/deposit/tests/api/test_deposit_private_read_archive.py @@ -1,87 +1,86 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import zipfile from django.urls import reverse from rest_framework import status -from swh.deposit.config import PRIVATE_GET_RAW_CONTENT, EM_IRI +from swh.deposit.config import EM_IRI, PRIVATE_GET_RAW_CONTENT from swh.deposit.tests.common import create_arborescence_archive - PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + "-nc" def private_get_raw_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_GET_RAW_CONTENT, args=[collection.name, deposit.id]), reverse(PRIVATE_GET_RAW_CONTENT_NC, args=[deposit.id]), ] def test_access_to_existing_deposit_with_one_archive( authenticated_client, deposit_collection, complete_deposit, sample_archive ): """Access to deposit should stream a 200 response with its raw content """ deposit = complete_deposit for url in private_get_raw_url_endpoints(deposit_collection, deposit): r = authenticated_client.get(url) assert r.status_code == status.HTTP_200_OK assert r._headers["content-type"][1] == "application/zip" # read the stream data = b"".join(r.streaming_content) # extract the file from the zip zfile = zipfile.ZipFile(io.BytesIO(data)) assert zfile.namelist() == ["file1"] assert zfile.open("file1").read() == b"some content in file" def 
test_access_to_existing_deposit_with_multiple_archives( tmp_path, authenticated_client, deposit_collection, partial_deposit, sample_archive ): """Access to deposit should stream a 200 response with its raw contents """ deposit = partial_deposit archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) # Add a second archive to deposit update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=deposit.external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_201_CREATED for url in private_get_raw_url_endpoints(deposit_collection, deposit): r = authenticated_client.get(url) assert r.status_code == status.HTTP_200_OK assert r._headers["content-type"][1] == "application/zip" # read the stream data = b"".join(r.streaming_content) # extract the file from the zip zfile = zipfile.ZipFile(io.BytesIO(data)) assert set(zfile.namelist()) == {"file1", "file2"} assert zfile.open("file1").read() == b"some content in file" assert zfile.open("file2").read() == b"some other content in file" diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py index 475ab1b8..ec62dc73 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py @@ -1,551 +1,551 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # 
See top-level LICENSE file for more information from django.urls import reverse from rest_framework import status +from swh.deposit.api import __version__ +from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON from swh.deposit.models import Deposit -from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON, EDIT_SE_IRI - PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc" def private_get_raw_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" deposit_id = deposit if isinstance(deposit, int) else deposit.id return [ reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]), reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]), ] def update_deposit(authenticated_client, collection, deposit, atom_dataset): for atom_data in ["entry-data2", "entry-data3"]: update_deposit_with_metadata( authenticated_client, collection, deposit, atom_dataset[atom_data] ) return deposit def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata): # update deposit's metadata response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=metadata, HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED return deposit def test_read_metadata( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Private metadata read api to existing deposit should return metadata """ deposit = partial_deposit deposit.external_id = "some-external-id" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" 
data = response.json() expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "origin_metadata": { "metadata": { "@xmlns": ["http://www.w3.org/2005/Atom"], "author": ["some awesome author", "another one", "no one"], "codemeta:dateCreated": "2017-10-07T15:17:08Z", "external_identifier": "some-external-id", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa }, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", - "version": "0.0.1", + "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [], }, } assert data == expected_meta def test_read_metadata_revision_with_parent( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Private read metadata to a deposit (with parent) returns metadata """ deposit = partial_deposit deposit.external_id = "some-external-id" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa" swh_id = "swh:1:rev:%s" % rev_id fake_parent = Deposit( swh_id=swh_id, client=deposit.client, collection=deposit.collection ) fake_parent.save() deposit.parent = fake_parent deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_meta = 
{ "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "origin_metadata": { "metadata": { "@xmlns": ["http://www.w3.org/2005/Atom"], "author": ["some awesome author", "another one", "no one"], "codemeta:dateCreated": "2017-10-07T15:17:08Z", "external_identifier": "some-external-id", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa }, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", - "version": "0.0.1", + "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [rev_id], }, } assert data == expected_meta def test_read_metadata_3( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """date(Created|Published) provided, uses author/committer date """ deposit = partial_deposit deposit.external_id = "hal-01243065" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) # add metadata to the deposit with datePublished and dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 """ ) update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() metadata = { "@xmlns": 
["http://www.w3.org/2005/Atom"], "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": [ "some awesome author", "another one", "no one", {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, ], "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:dateCreated": [ "2017-10-07T15:17:08Z", "2015-04-06T17:08:47+02:00", ], "codemeta:datePublished": "2017-05-03T16:08:47+02:00", "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ {"codemeta:name": "GNU General Public License v3.0 only"}, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa "codemeta:version": "1", "external_identifier": ["some-external-id", "hal-01243065"], "id": "hal-01243065", - "title": "Composing a Web of Audio " "Applications", + "title": "Composing a Web of Audio Applications", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", } expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/hal-01243065", }, "origin_metadata": { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", - "version": "0.0.1", + "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": deposit_collection.name, "id": deposit.id, 
"collection": deposit_collection.name, "revision_parents": [], }, } assert data == expected_meta def test_read_metadata_4( authenticated_client, deposit_collection, atom_dataset, partial_deposit ): """dateCreated/datePublished not provided, revision uses complete_date """ deposit = partial_deposit codemeta_entry_data = atom_dataset["metadata"] % "" deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) # will use the deposit completed date as fallback date deposit.complete_date = "2016-04-06" deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() metadata = { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, "client": "hal", "codemeta:applicationCategory": "test", - "codemeta:author": {"codemeta:name": "Morane " "Gruenpeter"}, - "codemeta:description": "this is the " "description", + "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, + "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ { "codemeta:name": "GNU " "General " "Public " "License " "v3.0 " "only" }, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", "codemeta:version": "1", "external_identifier": "hal-01243065", "id": "hal-01243065", - "title": "Composing a Web of Audio " "Applications", + "title": "Composing a Web of Audio Applications", } expected_origin = { "type": "deposit", "url": 
"https://hal-test.archives-ouvertes.fr/%s" % (deposit.external_id), } expected_origin_metadata = { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", - "version": "0.0.1", + "version": __version__, }, } expected_deposit_info = { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], } expected_meta = { "origin": expected_origin, "origin_metadata": expected_origin_metadata, "deposit": expected_deposit_info, } assert data == expected_meta def test_read_metadata_5( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """dateCreated/datePublished provided, revision uses author/committer date If multiple dateCreated provided, the first occurrence (of dateCreated) is selected. If multiple datePublished provided, the first occurrence (of datePublished) is selected. 
""" deposit = partial_deposit # add metadata to the deposit with multiple datePublished/dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 2016-04-06T17:08:47+02:00 2018-05-03T16:08:47+02:00 """ ) deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_origin = { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/external-id-partial", } metadata = { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, "client": "hal", "codemeta:applicationCategory": "test", - "codemeta:author": {"codemeta:name": "Morane " "Gruenpeter"}, + "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:dateCreated": [ "2015-04-06T17:08:47+02:00", "2016-04-06T17:08:47+02:00", ], "codemeta:datePublished": [ "2017-05-03T16:08:47+02:00", "2018-05-03T16:08:47+02:00", ], "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ { "codemeta:name": "GNU " "General " "Public " "License " "v3.0 " "only" }, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa "codemeta:version": "1", "external_identifier": "hal-01243065", "id": "hal-01243065", - "title": "Composing a Web of Audio " "Applications", + "title": "Composing a Web of Audio Applications", } expected_origin_metadata = { "metadata": 
metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", - "version": "0.0.1", + "version": __version__, }, } expected_deposit_info = { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1428332927}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], } expected_meta = { "origin": expected_origin, "origin_metadata": expected_origin_metadata, "deposit": expected_deposit_info, } assert data == expected_meta def test_access_to_nonexisting_deposit_returns_404_response( authenticated_client, deposit_collection, ): """Read unknown collection should return a 404 response """ unknown_id = 999 try: Deposit.objects.get(pk=unknown_id) except Deposit.DoesNotExist: assert True for url in private_get_raw_url_endpoints(deposit_collection, unknown_id): response = authenticated_client.get(url) assert response.status_code == status.HTTP_404_NOT_FOUND msg = "Deposit with id %s does not exist" % unknown_id assert msg in response.content.decode("utf-8") diff --git a/swh/deposit/tests/api/test_deposit_private_update_status.py b/swh/deposit/tests/api/test_deposit_private_update_status.py index ba07f0bc..f93801de 100644 --- a/swh/deposit/tests/api/test_deposit_private_update_status.py +++ b/swh/deposit/tests/api/test_deposit_private_update_status.py @@ -1,194 +1,191 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import json from 
django.urls import reverse from rest_framework import status -from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT - from swh.deposit.api.private.deposit_update_status import MANDATORY_KEYS - -from swh.deposit.models import Deposit from swh.deposit.config import ( - PRIVATE_PUT_DEPOSIT, - DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, + DEPOSIT_STATUS_LOAD_SUCCESS, + PRIVATE_PUT_DEPOSIT, ) - +from swh.deposit.models import Deposit +from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid PRIVATE_PUT_DEPOSIT_NC = PRIVATE_PUT_DEPOSIT + "-nc" def private_check_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" return [ reverse(PRIVATE_PUT_DEPOSIT, args=[collection.name, deposit.id]), reverse(PRIVATE_PUT_DEPOSIT_NC, args=[deposit.id]), ] def test_update_deposit_status_success_with_info( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit with load success should require all information to succeed """ deposit = ready_deposit_verified expected_status = DEPOSIT_STATUS_LOAD_SUCCESS origin_url = "something" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e" snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20" full_body_info = { "status": DEPOSIT_STATUS_LOAD_SUCCESS, "revision_id": revision_id, "directory_id": directory_id, "snapshot_id": snapshot_id, "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): - dir_id = persistent_identifier(DIRECTORY, directory_id) - rev_id = persistent_identifier(REVISION, revision_id) - snp_id = persistent_identifier(SNAPSHOT, snapshot_id) + dir_id = swhid(DIRECTORY, directory_id) + rev_id = swhid(REVISION, revision_id) + snp_id = swhid(SNAPSHOT, snapshot_id) expected_swh_id = "swh:1:dir:%s" % directory_id expected_swh_id_context = ( f"{dir_id};origin={origin_url};" + 
f"visit={snp_id};anchor={rev_id};path=/" ) response = authenticated_client.put( url, content_type="application/json", data=json.dumps(full_body_info), ) assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == expected_status assert deposit.swh_id == expected_swh_id assert deposit.swh_id_context == expected_swh_id_context # Reset deposit deposit = ready_deposit_verified deposit.save() def test_update_deposit_status_rejected_with_info( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit with rejected status needs few information to succeed """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": DEPOSIT_STATUS_LOAD_FAILURE}), ) assert response.status_code == status.HTTP_204_NO_CONTENT deposit = Deposit.objects.get(pk=deposit.id) assert deposit.status == DEPOSIT_STATUS_LOAD_FAILURE assert deposit.swh_id is None assert deposit.swh_id_context is None # Reset status deposit = ready_deposit_verified deposit.save() def test_update_deposit_status_success_with_incomplete_data( authenticated_client, deposit_collection, ready_deposit_verified ): """Update deposit status with status success and incomplete information should fail """ deposit = ready_deposit_verified origin_url = "something" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "47dc6b4636c7f6cba0df83e3d5490bf4334d987e" snapshot_id = "68c0d26104d47e278dd6be07ed61fafb561d0d20" new_status = DEPOSIT_STATUS_LOAD_SUCCESS full_body_info = { "status": new_status, "revision_id": revision_id, "directory_id": directory_id, "snapshot_id": snapshot_id, "origin_url": origin_url, } for url in private_check_url_endpoints(deposit_collection, deposit): for key in MANDATORY_KEYS: # Crafting body with missing information so that it raises body = 
copy.deepcopy(full_body_info) body.pop(key) # make the body incomplete response = authenticated_client.put( url, content_type="application/json", data=json.dumps(body), ) assert response.status_code == status.HTTP_400_BAD_REQUEST assert ( f"deposit status to {new_status} requires information {key}" in response.content.decode("utf-8") ) def test_update_deposit_status_will_fail_with_unknown_status( authenticated_client, deposit_collection, ready_deposit_verified ): """Unknown status for update should return a 400 response """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": "unknown"}) ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_update_deposit_status_will_fail_with_no_status_key( authenticated_client, deposit_collection, ready_deposit_verified ): """No status provided for update should return a 400 response """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"something": "something"}), ) assert response.status_code == status.HTTP_400_BAD_REQUEST def test_update_deposit_status_success_without_swh_id_fail( authenticated_client, deposit_collection, ready_deposit_verified ): """Providing successful status without swh_id should return a 400 """ deposit = ready_deposit_verified for url in private_check_url_endpoints(deposit_collection, deposit): response = authenticated_client.put( url, content_type="application/json", data=json.dumps({"status": DEPOSIT_STATUS_LOAD_SUCCESS}), ) assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/swh/deposit/tests/api/test_deposit_schedule.py b/swh/deposit/tests/api/test_deposit_schedule.py index 8541420a..4218797e 100644 --- a/swh/deposit/tests/api/test_deposit_schedule.py +++ 
b/swh/deposit/tests/api/test_deposit_schedule.py @@ -1,91 +1,81 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy import datetime from io import BytesIO -from typing import Dict from django.urls import reverse import pytest from rest_framework import status -from swh.deposit.config import ( - COL_IRI, - DEPOSIT_STATUS_DEPOSITED, -) +from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED from swh.deposit.parsers import parse_xml -from ..conftest import TEST_CONFIG - - -TEST_CONFIG_WITH_CHECKS: Dict[str, object] = { - **TEST_CONFIG, - "checks": True, -} - - @pytest.fixture() -def deposit_config(): +def deposit_config(deposit_config): """Overrides the `deposit_config` fixture define in swh/deposit/tests/conftest.py to re-enable the checks.""" - return TEST_CONFIG_WITH_CHECKS + config_d = copy.deepcopy(deposit_config) + config_d["checks"] = True + return config_d def now() -> datetime.datetime: return datetime.datetime.now(tz=datetime.timezone.utc) def test_add_deposit_schedules_check( authenticated_client, deposit_collection, sample_archive, swh_scheduler ): """Posting deposit on collection creates a checker task """ external_id = "external-id-schedules-check" url = reverse(COL_IRI, args=[deposit_collection.name]) timestamp_before_call = now() response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]), ) timestamp_after_call = now() assert response.status_code == status.HTTP_201_CREATED response_content = 
parse_xml(BytesIO(response.content)) actual_state = response_content["deposit_status"] assert actual_state == DEPOSIT_STATUS_DEPOSITED deposit_id = response_content["deposit_id"] tasks = swh_scheduler.grab_ready_tasks("check-deposit") assert len(tasks) == 1 task = tasks[0] assert timestamp_before_call <= task.pop("next_run") <= timestamp_after_call assert task == { "arguments": { "args": [], "kwargs": {"collection": "test", "deposit_id": int(deposit_id),}, }, "current_interval": datetime.timedelta(days=1), "id": 1, "policy": "oneshot", "priority": None, "retries_left": 3, "status": "next_run_scheduled", "type": "check-deposit", } diff --git a/swh/deposit/tests/api/test_deposit_status.py b/swh/deposit/tests/api/test_deposit_status.py index 4b03f7c7..c8f5f89e 100644 --- a/swh/deposit/tests/api/test_deposit_status.py +++ b/swh/deposit/tests/api/test_deposit_status.py @@ -1,120 +1,121 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from django.urls import reverse from io import BytesIO + +from django.urls import reverse from rest_framework import status from swh.deposit.config import ( - STATE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED, + STATE_IRI, ) from swh.deposit.models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.parsers import parse_xml def test_post_deposit_with_status_check(authenticated_client, deposited_deposit): """Successful but not loaded deposit should have a status 'deposited' """ deposit = deposited_deposit status_url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # check status status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_200_OK r = parse_xml(BytesIO(status_response.content)) assert int(r["deposit_id"]) == deposit.id assert 
r["deposit_status"] == DEPOSIT_STATUS_DEPOSITED assert r["deposit_status_detail"] == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_DEPOSITED] assert r["deposit_external_id"] == deposit.external_id def test_status_unknown_deposit(authenticated_client, deposit_collection): """Unknown deposit status should return 404 response """ unknown_deposit_id = 999 status_url = reverse(STATE_IRI, args=[deposit_collection.name, unknown_deposit_id]) status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_404_NOT_FOUND def test_status_unknown_collection(authenticated_client, deposited_deposit): """Unknown collection status should return 404 response""" deposit = deposited_deposit unknown_collection = "something-unknown" status_url = reverse(STATE_IRI, args=[unknown_collection, deposit.id]) status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_404_NOT_FOUND def test_status_deposit_rejected(authenticated_client, rejected_deposit): """Rejected deposit status should be 'rejected' with detailed summary """ deposit = rejected_deposit # _status_detail = {'url': {'summary': 'Wrong url'}} url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # when status_response = authenticated_client.get(url) # then assert status_response.status_code == status.HTTP_200_OK r = parse_xml(BytesIO(status_response.content)) assert int(r["deposit_id"]) == deposit.id assert r["deposit_status"] == DEPOSIT_STATUS_REJECTED assert r["deposit_status_detail"] == "Deposit failed the checks" if deposit.swh_id: assert r["deposit_swh_id"] == deposit.swh_id def test_status_with_http_accept_header_should_not_break( authenticated_client, partial_deposit ): """Asking deposit status with Accept header should return 200 """ deposit = partial_deposit status_url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) response = authenticated_client.get(status_url) assert response.status_code == status.HTTP_200_OK 
response = authenticated_client.get( status_url, HTTP_ACCEPT="text/html,application/xml;q=9,*/*,q=8" ) assert response.status_code == status.HTTP_200_OK def test_status_complete_deposit(authenticated_client, complete_deposit): """Successful and loaded deposit should be 'done' and have detailed swh ids """ deposit = complete_deposit url = reverse(STATE_IRI, args=[deposit.collection.name, deposit.id]) # when status_response = authenticated_client.get(url) # then assert status_response.status_code == status.HTTP_200_OK r = parse_xml(BytesIO(status_response.content)) assert int(r["deposit_id"]) == deposit.id assert r["deposit_status"] == DEPOSIT_STATUS_LOAD_SUCCESS assert ( r["deposit_status_detail"] == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_LOAD_SUCCESS] ) assert deposit.swh_id is not None assert r["deposit_swh_id"] == deposit.swh_id assert deposit.swh_id_context is not None assert r["deposit_swh_id_context"] == deposit.swh_id_context diff --git a/swh/deposit/tests/api/test_deposit_update.py b/swh/deposit/tests/api/test_deposit_update.py index 43b268cd..0b173c4f 100644 --- a/swh/deposit/tests/api/test_deposit_update.py +++ b/swh/deposit/tests/api/test_deposit_update.py @@ -1,395 +1,394 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse from rest_framework import status -from swh.deposit.models import Deposit, DepositRequest, DepositCollection from swh.deposit.config import EDIT_SE_IRI, EM_IRI +from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml - -from swh.deposit.tests.common import create_arborescence_archive, check_archive +from swh.deposit.tests.common import check_archive, create_arborescence_archive def test_replace_archive_to_deposit_is_possible( tmp_path, 
partial_deposit, deposit_collection, authenticated_client, sample_archive, atom_dataset, ): """Replace all archive with another one should return a 204 response """ tmp_path = str(tmp_path) # given deposit = partial_deposit requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(list(requests)) == 1 check_archive(sample_archive["name"], requests[0].archive.name) # we have no metadata for that deposit requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 0 response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 1 update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) external_id = "some-external-id-1" archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) response = authenticated_client.put( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_204_NO_CONTENT requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(list(requests)) == 1 check_archive(archive2["name"], requests[0].archive.name) # check we did not touch the other parts requests = list(DepositRequest.objects.filter(deposit=deposit, type="metadata")) assert len(requests) == 1 def test_replace_metadata_to_deposit_is_possible( tmp_path, authenticated_client, partial_deposit_with_metadata, deposit_collection, 
atom_dataset, ): """Replace all metadata with another one should return a 204 response """ # given deposit = partial_deposit_with_metadata raw_metadata0 = atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8") requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta0 = requests_meta[0] assert request_meta0.raw_metadata == raw_metadata0 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.put( update_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_204_NO_CONTENT requests_meta = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta) == 1 request_meta1 = requests_meta[0] raw_metadata1 = request_meta1.raw_metadata assert raw_metadata1 == atom_dataset["entry-data1"] assert raw_metadata0 != raw_metadata1 assert request_meta0 != request_meta1 # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_archive_to_deposit_is_possible( tmp_path, authenticated_client, deposit_collection, partial_deposit_with_metadata, sample_archive, ): """Add another archive to a deposit return a 201 response """ tmp_path = str(tmp_path) deposit = partial_deposit_with_metadata requests = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests) == 1 check_archive(sample_archive["name"], requests[0].archive.name) requests_meta0 = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta0) == 1 update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) external_id = "some-external-id-1" 
archive2 = create_arborescence_archive( tmp_path, "archive2", "file2", b"some other content in file" ) response = authenticated_client.post( update_uri, content_type="application/zip", # as zip data=archive2["data"], # + headers CONTENT_LENGTH=archive2["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=archive2["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (archive2["name"],), ) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="archive").order_by( "id" ) assert len(requests) == 2 # first archive still exists check_archive(sample_archive["name"], requests[0].archive.name) # a new one was added check_archive(archive2["name"], requests[1].archive.name) # check we did not touch the other parts requests_meta1 = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests_meta1) == 1 assert set(requests_meta0) == set(requests_meta1) def test_add_metadata_to_deposit_is_possible( authenticated_client, deposit_collection, partial_deposit_with_metadata, atom_dataset, ): """Add metadata with another one should return a 204 response """ deposit = partial_deposit_with_metadata requests = DepositRequest.objects.filter(deposit=deposit, type="metadata") assert len(requests) == 1 requests_archive0 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive0) == 1 update_uri = reverse(EDIT_SE_IRI, args=[deposit_collection.name, deposit.id]) atom_entry = atom_dataset["entry-data1"] response = authenticated_client.post( update_uri, content_type="application/atom+xml;type=entry", data=atom_entry ) assert response.status_code == status.HTTP_201_CREATED requests = DepositRequest.objects.filter(deposit=deposit, type="metadata").order_by( "id" ) assert len(requests) == 2 expected_raw_meta0 = atom_dataset["entry-data0"] % ( deposit.external_id.encode("utf-8") ) # 
a new one was added assert requests[0].raw_metadata == expected_raw_meta0 assert requests[1].raw_metadata == atom_entry # check we did not touch the other parts requests_archive1 = DepositRequest.objects.filter(deposit=deposit, type="archive") assert len(requests_archive1) == 1 assert set(requests_archive0) == set(requests_archive1) def test_add_metadata_to_unknown_deposit( deposit_collection, authenticated_client, atom_dataset ): """Replacing metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 1000 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[deposit_collection, unknown_deposit_id]) response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Unknown collection name" in response_content["sword:error"]["summary"] def test_add_metadata_to_unknown_collection( partial_deposit, authenticated_client, atom_dataset ): """Replacing metadata to unknown deposit should return a 404 response """ deposit = partial_deposit unknown_collection_name = "unknown-collection" try: DepositCollection.objects.get(name=unknown_collection_name) except DepositCollection.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[unknown_collection_name, deposit.id]) response = authenticated_client.post( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert "Unknown collection name" in response_content["sword:error"]["summary"] def test_replace_metadata_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Adding metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 998 try: Deposit.objects.get(pk=unknown_deposit_id) 
except Deposit.DoesNotExist: assert True url = reverse(EDIT_SE_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.put( url, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_add_archive_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Adding metadata to unknown deposit should return a 404 response """ unknown_deposit_id = 997 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.post( url, content_type="application/zip", data=atom_dataset["entry-data1"] ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_replace_archive_to_unknown_deposit( authenticated_client, deposit_collection, atom_dataset ): """Replacing archive to unknown deposit should return a 404 response """ unknown_deposit_id = 996 try: Deposit.objects.get(pk=unknown_deposit_id) except Deposit.DoesNotExist: assert True url = reverse(EM_IRI, args=[deposit_collection.name, unknown_deposit_id]) response = authenticated_client.put( url, content_type="application/zip", data=atom_dataset["entry-data1"] ) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( "Deposit with id %s does not exist" % unknown_deposit_id == response_content["sword:error"]["summary"] ) def test_post_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Update (POST) archive with wrong 
content type should return 400 """ deposit = partial_deposit update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.post( update_uri, content_type="application/x-gtar-compressed", data=atom_dataset["entry-data1"], ) assert response.status_code == status.HTTP_400_BAD_REQUEST response_content = parse_xml(response.content) msg = ( "Packaging format supported is restricted to " + "application/zip, application/x-tar" ) assert msg == response_content["sword:error"]["summary"] def test_put_metadata_to_em_iri_failure( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Update (PUT) archive with wrong content type should return 400 """ # given deposit = partial_deposit # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id]) response = authenticated_client.put( update_uri, content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], ) # then assert response.status_code == status.HTTP_400_BAD_REQUEST response_content = parse_xml(response.content) msg = ( "Packaging format supported is restricted to " + "application/zip, application/x-tar" ) assert msg == response_content["sword:error"]["summary"] diff --git a/swh/deposit/tests/api/test_exception.py b/swh/deposit/tests/api/test_exception.py index 0d71926b..a606397f 100644 --- a/swh/deposit/tests/api/test_exception.py +++ b/swh/deposit/tests/api/test_exception.py @@ -1,53 +1,52 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.deposit.exception import custom_exception_handler - +from django.db.utils import OperationalError from rest_framework.exceptions import APIException from rest_framework.response import Response -from django.db.utils import OperationalError +from swh.deposit.exception import 
custom_exception_handler def test_custom_exception_handler_operational_error(mocker): """Operation error are translated to service unavailable """ fake_exception = OperationalError("Fake internal error", 503) response = custom_exception_handler(fake_exception, {}) assert response is not None assert response.status_code == 503 status = "Database backend maintenance" detail = "Service temporarily unavailable, try again later." assert ( response.content.decode("utf-8") == f""" {status} {detail} """ ) def test_custom_exception_handler_default_behavior_maintained(mocker): """Other internal errors are transmitted as is """ fake_exception = APIException("Fake internal error", 500) fake_response = Response( exception=fake_exception, status=fake_exception.status_code ) mock_exception_handler = mocker.patch("swh.deposit.exception.exception_handler") mock_exception_handler.return_value = fake_response response = custom_exception_handler(fake_exception, {}) assert response is not None assert response == fake_response diff --git a/swh/deposit/tests/api/test_parser.py b/swh/deposit/tests/api/test_parser.py index b1cc9119..0adea4f5 100644 --- a/swh/deposit/tests/api/test_parser.py +++ b/swh/deposit/tests/api/test_parser.py @@ -1,134 +1,133 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import io - from collections import OrderedDict +import io from swh.deposit.parsers import SWHXMLParser def test_parsing_without_duplicates(): xml_no_duplicate = io.BytesIO( b""" Awesome Compiler GPL3.0 https://opensource.org/licenses/GPL-3.0 Python3 author1 Inria ocaml http://issuetracker.com """ ) actual_result = SWHXMLParser().parse(xml_no_duplicate) expected_dict = OrderedDict( [ ("@xmlns", "http://www.w3.org/2005/Atom"), ("@xmlns:codemeta", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"), 
("title", "Awesome Compiler"), ( "codemeta:license", OrderedDict( [ ("codemeta:name", "GPL3.0"), ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), ] ), ), ("codemeta:runtimePlatform", "Python3"), ( "codemeta:author", OrderedDict( [("codemeta:name", "author1"), ("codemeta:affiliation", "Inria")] ), ), ("codemeta:programmingLanguage", "ocaml"), ("codemeta:issueTracker", "http://issuetracker.com"), ] ) assert expected_dict == actual_result def test_parsing_with_duplicates(): xml_with_duplicates = io.BytesIO( b""" Another Compiler GNU/Linux GPL3.0 https://opensource.org/licenses/GPL-3.0 Un*x author1 Inria author2 Inria ocaml haskell spdx http://spdx.org python3 """ ) actual_result = SWHXMLParser().parse(xml_with_duplicates) expected_dict = OrderedDict( [ ("@xmlns", "http://www.w3.org/2005/Atom"), ("@xmlns:codemeta", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"), ("title", "Another Compiler"), ("codemeta:runtimePlatform", ["GNU/Linux", "Un*x"]), ( "codemeta:license", [ OrderedDict( [ ("codemeta:name", "GPL3.0"), ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), ] ), OrderedDict( [("codemeta:name", "spdx"), ("codemeta:url", "http://spdx.org")] ), ], ), ( "codemeta:author", [ OrderedDict( [ ("codemeta:name", "author1"), ("codemeta:affiliation", "Inria"), ] ), OrderedDict( [ ("codemeta:name", "author2"), ("codemeta:affiliation", "Inria"), ] ), ], ), ("codemeta:programmingLanguage", ["ocaml", "haskell", "python3"]), ] ) assert expected_dict == actual_result diff --git a/swh/deposit/tests/cli/data/atom/codemeta-sample.xml b/swh/deposit/tests/cli/data/atom/codemeta-sample.xml deleted file mode 100644 index d804eff5..00000000 --- a/swh/deposit/tests/cli/data/atom/codemeta-sample.xml +++ /dev/null @@ -1,51 +0,0 @@ - - - %s - hal-01587361 - https://hal.inria.fr/hal-01587361 - https://hal.inria.fr/hal-01587361/document - https://hal.inria.fr/hal-01587361/file/AffectationRO-v1.0.0.zip - doi:10.5281/zenodo.438684 - The assignment problem - AffectationRO - 
Gruenpeter, Morane - [INFO] Computer Science [cs] - [INFO.INFO-RO] Computer Science [cs]/Operations Research [cs.RO] - SOFTWARE - Project in OR: The assignment problemA java implementation for the assignment problem first release - description fr - 2015-06-01 - 2017-10-19 - en - - - url stable - Version sur hal - Version entre par lutilisateur - Mots-cls - Commentaire - Rfrence interne - - Collaboration/Projet - nom du projet - id - - Voir aussi - Financement - Projet ANR - Projet Europen - Platform/OS - Dpendances - Etat du dveloppement - - license - url spdx - - Outils de dveloppement- outil no1 - Outils de dveloppement- outil no2 - http://code.com - language 1 - language 2 - diff --git a/swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml b/swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml deleted file mode 100644 index 25a417fb..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml b/swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml deleted file mode 100644 index 65b7f63b..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - Title - urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a - 2005-10-07T17:17:08Z - Contributor - The abstract - - - The abstract - Access Rights - Alternative Title - Date Available - Bibliographic Citation # noqa - Contributor - Description - Has Part - Has Version - Identifier - Is Part Of - Publisher - References - Rights Holder - Source - Title - Type - - diff --git a/swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml b/swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml deleted file mode 100644 index e4caf44f..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/swh/deposit/tests/cli/data/atom/entry-data-ko.xml 
b/swh/deposit/tests/cli/data/atom/entry-data-ko.xml deleted file mode 100644 index 3f5d8802..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data-ko.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a - diff --git a/swh/deposit/tests/cli/data/atom/entry-data-minimal.xml b/swh/deposit/tests/cli/data/atom/entry-data-minimal.xml deleted file mode 100644 index 9432ac0e..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data-minimal.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - %s - diff --git a/swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml b/swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml deleted file mode 100644 index 34710195..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - Composing a Web of Audio Applications - - diff --git a/swh/deposit/tests/cli/data/atom/entry-data0.xml b/swh/deposit/tests/cli/data/atom/entry-data0.xml deleted file mode 100644 index 2b0ccc00..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data0.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - Awesome Compiler - hal - urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a - %s - 2017-10-07T15:17:08Z - some awesome author - something - awesome-compiler - This is an awesome compiler destined to -awesomely compile stuff -and other stuff - compiler,programming,language - 2005-10-07T17:17:08Z - 2005-10-07T17:17:08Z - release note - related link - - Awesome - https://hoster.org/awesome-compiler - GNU/Linux - 0.0.1 - running - all - diff --git a/swh/deposit/tests/cli/data/atom/entry-data1.xml b/swh/deposit/tests/cli/data/atom/entry-data1.xml deleted file mode 100644 index e4f415c7..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data1.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - hal - urn:uuid:2225c695-cfb8-4ebb-aaaa-80da344efa6a - 2017-10-07T15:17:08Z - some awesome author - something - awesome-compiler - This is an awesome compiler destined to -awesomely compile 
stuff -and other stuff - compiler,programming,language - 2005-10-07T17:17:08Z - 2005-10-07T17:17:08Z - release note - related link - - Awesome - https://hoster.org/awesome-compiler - GNU/Linux - 0.0.1 - running - all - diff --git a/swh/deposit/tests/cli/data/atom/entry-data2.xml b/swh/deposit/tests/cli/data/atom/entry-data2.xml deleted file mode 100644 index 73cfafeb..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data2.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - some-external-id - https://hal-test.archives-ouvertes.fr/some-external-id - some awesome author - diff --git a/swh/deposit/tests/cli/data/atom/entry-data3.xml b/swh/deposit/tests/cli/data/atom/entry-data3.xml deleted file mode 100644 index c75d9739..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-data3.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - another one - no one - 2017-10-07T15:17:08Z - diff --git a/swh/deposit/tests/cli/data/atom/entry-update-in-place.xml b/swh/deposit/tests/cli/data/atom/entry-update-in-place.xml deleted file mode 100644 index 1a7d7bbb..00000000 --- a/swh/deposit/tests/cli/data/atom/entry-update-in-place.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa7b - Title - Type - diff --git a/swh/deposit/tests/cli/data/atom/error-with-decimal.xml b/swh/deposit/tests/cli/data/atom/error-with-decimal.xml deleted file mode 100644 index be002442..00000000 --- a/swh/deposit/tests/cli/data/atom/error-with-decimal.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - Composing a Web of Audio Applications - hal - hal-01243065 - hal-01243065 - https://hal-test.archives-ouvertes.fr/hal-01243065 - test - - - DSP programming,Web,Composability,Faust - 2017-05-03T16:08:47+02:00 - The Web offers a great opportunity to share, deploy and use programs without installation difficulties. In this article we explore the idea of freely combining/composing real-time audio applications deployed on the Web using Faust audio DSP language. 
- 1 - 10.4 - phpstorm - stable - - linux - php - python - C - - GNU General Public License v3.0 only - - - CeCILL Free Software License Agreement v1.1 - - - HAL - hal@ccsd.cnrs.fr - - - Someone Nice - someone@nice.fr - FFJ - - diff --git a/swh/deposit/tests/cli/data/atom/metadata.xml b/swh/deposit/tests/cli/data/atom/metadata.xml deleted file mode 100644 index 65f58543..00000000 --- a/swh/deposit/tests/cli/data/atom/metadata.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - Composing a Web of Audio Applications - hal - hal-01243065 - hal-01243065 - https://hal-test.archives-ouvertes.fr/hal-01243065 - test - DSP programming - this is the description - 1 - phpstorm - stable - php - python - C - - GNU General Public License v3.0 only - - - CeCILL Free Software License Agreement v1.1 - - - HAL - hal@ccsd.cnrs.fr - - - Morane Gruenpeter - -%s - diff --git a/swh/deposit/tests/cli/data/atom/tei-sample.xml b/swh/deposit/tests/cli/data/atom/tei-sample.xml deleted file mode 100644 index cf2266af..00000000 --- a/swh/deposit/tests/cli/data/atom/tei-sample.xml +++ /dev/null @@ -1 +0,0 @@ -HAL TEI export of hal-01587083CCSDDistributed under a Creative Commons Attribution 4.0 International License

HAL API platform

questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733MoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.com2017-09-29 11:21:322017-10-03 17:20:132017-10-03 17:20:132017-09-292017-09-29contributorMoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.comCCSDhal-01587083https://hal.inria.fr/hal-01587083gruenpeter:hal-0158708320172017questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733EnglishComputer Science [cs]SoftwareIRILLInitiative pour la Recherche et l'Innovation sur le Logiciel Libre
https://www.irill.org/
Universite Pierre et Marie Curie - Paris 6UPMC
4 place Jussieu - 75005 Paris
http://www.upmc.fr/
Institut National de Recherche en Informatique et en AutomatiqueInria
Domaine de VoluceauRocquencourt - BP 10578153 Le Chesnay Cedex
http://www.inria.fr/en/
Universite Paris Diderot - Paris 7UPD7
5 rue Thomas-Mann - 75205 Paris cedex 13
http://www.univ-paris-diderot.fr
diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py index 7b5fd7ff..2e793fa4 100644 --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -1,457 +1,463 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import contextlib import logging import os import re from unittest.mock import MagicMock from click.testing import CliRunner import pytest -from swh.deposit.client import PublicApiDepositClient, MaintenanceError -from swh.deposit.cli.client import generate_slug, _url, _client, _collection, InputError from swh.deposit.cli import deposit as cli -from ..conftest import TEST_USER +from swh.deposit.cli.client import InputError, _client, _collection, _url, generate_slug +from swh.deposit.client import MaintenanceError, PublicApiDepositClient +from ..conftest import TEST_USER EXAMPLE_SERVICE_DOCUMENT = { "service": {"workspace": {"collection": {"sword:name": "softcol",}}} } +@pytest.fixture +def datadir(request): + """Override default datadir to target main test datadir""" + return os.path.join(os.path.dirname(str(request.fspath)), "../data") + + @pytest.fixture def slug(): return generate_slug() @pytest.fixture def client_mock(mocker, slug): """A successful deposit client with hard-coded default values """ mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug) mock_client = MagicMock() mocker.patch("swh.deposit.cli.client._client", return_value=mock_client) mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT mock_client.deposit_create.return_value = '{"foo": "bar"}' return mock_client @pytest.fixture def client_mock_api_down(mocker, slug): """A mock client whose connection with api fails due to maintenance issue """ mocker.patch("swh.deposit.cli.client.generate_slug", 
return_value=slug) mock_client = MagicMock() mocker.patch("swh.deposit.cli.client._client", return_value=mock_client) mock_client.service_document.side_effect = MaintenanceError( "Database backend maintenance: Temporarily unavailable, try again later." ) return mock_client def test_url(): assert _url("http://deposit") == "http://deposit/1" assert _url("https://other/1") == "https://other/1" def test_client(): client = _client("http://deposit", "user", "pass") assert isinstance(client, PublicApiDepositClient) def test_collection_error(): mock_client = MagicMock() mock_client.service_document.return_value = {"error": "something went wrong"} with pytest.raises(InputError) as e: _collection(mock_client) assert "Service document retrieval: something went wrong" == str(e.value) def test_collection_ok(): mock_client = MagicMock() mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT collection_name = _collection(mock_client) assert collection_name == "softcol" def test_collection_ko_because_downtime(): mock_client = MagicMock() mock_client.service_document.side_effect = MaintenanceError("downtime") with pytest.raises(MaintenanceError, match="downtime"): _collection(mock_client) def test_deposit_with_server_down_for_maintenance( sample_archive, mocker, caplog, client_mock_api_down, slug, tmp_path ): """ Deposit failure due to maintenance down time should be explicit """ runner = CliRunner() result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) assert result.exit_code == 1, result.output assert result.output == "" assert caplog.record_tuples == [ ( "swh.deposit.cli.client", logging.ERROR, "Database backend maintenance: Temporarily unavailable, try again later.", ) ] client_mock_api_down.service_document.assert_called_once_with() def test_single_minimal_deposit( sample_archive, 
mocker, caplog, client_mock, slug, tmp_path ): """ from: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa metadata_path = os.path.join(tmp_path, "metadata.xml") mocker.patch( - "swh.deposit.cli.client.tempfile.TemporaryDirectory", + "tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) runner = CliRunner() result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) assert result.exit_code == 0, result.output assert result.output == "" assert caplog.record_tuples == [ ("swh.deposit.cli.client", logging.INFO, '{"foo": "bar"}'), ] client_mock.deposit_create.assert_called_once_with( archive=sample_archive["path"], collection="softcol", in_progress=False, metadata=metadata_path, slug=slug, ) with open(metadata_path) as fd: assert ( fd.read() == f"""\ \ttest-project \t{slug} \t \t\tJane Doe \t """ ) def test_metadata_validation(sample_archive, mocker, caplog, tmp_path): """ from: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa slug = generate_slug() mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug) mock_client = MagicMock() mocker.patch("swh.deposit.cli.client._client", return_value=mock_client) mock_client.service_document.return_value = EXAMPLE_SERVICE_DOCUMENT mock_client.deposit_create.return_value = '{"foo": "bar"}' metadata_path = os.path.join(tmp_path, "metadata.xml") mocker.patch( - "swh.deposit.cli.client.tempfile.TemporaryDirectory", + "tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) with open(metadata_path, "a"): pass # creates the file runner = CliRunner() # Test missing author result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], 
"--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], ], ) assert result.exit_code == 1, result.output assert result.output == "" assert len(caplog.record_tuples) == 1 (_logger, level, message) = caplog.record_tuples[0] assert level == logging.ERROR assert " --author " in message # Clear mocking state caplog.clear() mock_client.reset_mock() # Test missing name result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) assert result.exit_code == 1, result.output assert result.output == "" assert len(caplog.record_tuples) == 1 (_logger, level, message) = caplog.record_tuples[0] assert level == logging.ERROR assert " --name " in message # Clear mocking state caplog.clear() mock_client.reset_mock() # Test both --metadata and --author result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--metadata", metadata_path, "--author", "Jane Doe", ], ) assert result.exit_code == 1, result.output assert result.output == "" assert len(caplog.record_tuples) == 1 (_logger, level, message) = caplog.record_tuples[0] assert level == logging.ERROR assert re.search("--metadata.*is incompatible with", message) # Clear mocking state caplog.clear() mock_client.reset_mock() def test_single_deposit_slug_generation( sample_archive, mocker, caplog, tmp_path, client_mock ): """ from: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa slug = "my-slug" collection = "my-collection" metadata_path = os.path.join(tmp_path, "metadata.xml") mocker.patch( - "swh.deposit.cli.client.tempfile.TemporaryDirectory", + "tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) runner = CliRunner() result = 
runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--slug", slug, "--collection", collection, "--author", "Jane Doe", ], ) assert result.exit_code == 0, result.output assert result.output == "" assert caplog.record_tuples == [ ("swh.deposit.cli.client", logging.INFO, '{"foo": "bar"}'), ] client_mock.deposit_create.assert_called_once_with( archive=sample_archive["path"], collection=collection, in_progress=False, metadata=metadata_path, slug=slug, ) with open(metadata_path) as fd: assert ( fd.read() == """\ \ttest-project \tmy-slug \t \t\tJane Doe \t """ ) def test_multisteps_deposit( sample_archive, atom_dataset, mocker, caplog, datadir, client_mock, slug ): """ from: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#multisteps-deposit """ # noqa slug = generate_slug() mocker.patch("swh.deposit.cli.client.generate_slug", return_value=slug) # https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#create-an-incomplete-deposit client_mock.deposit_create.return_value = '{"deposit_id": "42"}' runner = CliRunner() result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--partial", ], ) assert result.exit_code == 0, result.output assert result.output == "" assert caplog.record_tuples == [ ("swh.deposit.cli.client", logging.INFO, '{"deposit_id": "42"}'), ] client_mock.deposit_create.assert_called_once_with( archive=sample_archive["path"], collection="softcol", in_progress=True, metadata=None, slug=slug, ) # Clear mocking state caplog.clear() client_mock.reset_mock() # https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#add-content-or-metadata-to-the-deposit metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml") 
result = runner.invoke( cli, [ "upload", "--url", "mock://deposit.swh/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, ], ) assert result.exit_code == 0, result.output assert result.output == "" assert caplog.record_tuples == [ ("swh.deposit.cli.client", logging.INFO, '{"deposit_id": "42"}'), ] client_mock.deposit_create.assert_called_once_with( archive=None, collection="softcol", in_progress=False, metadata=metadata_path, slug=slug, ) # Clear mocking state caplog.clear() client_mock.reset_mock() diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py index 5bbe064e..c92a6916 100644 --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -1,428 +1,418 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os import base64 -import pytest -import psycopg2 +import os +from typing import Mapping -from django.urls import reverse from django.test.utils import setup_databases # type: ignore - -# mypy is asked to ignore the import statement above because setup_databases -# is not part of the d.t.utils.__all__ variable. 
- +from django.urls import reverse +import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT +import pytest from rest_framework import status from rest_framework.test import APIClient -from typing import Mapping +import yaml -from swh.scheduler import get_scheduler -from swh.scheduler.tests.conftest import * # noqa -from swh.model.identifiers import DIRECTORY, persistent_identifier, REVISION, SNAPSHOT -from swh.deposit.config import setup_django_for -from swh.deposit.parsers import parse_xml -from swh.deposit.config import SWHDefaultConfig from swh.deposit.config import ( COL_IRI, - EDIT_SE_IRI, DEPOSIT_STATUS_DEPOSITED, - DEPOSIT_STATUS_REJECTED, - DEPOSIT_STATUS_PARTIAL, + DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS, + DEPOSIT_STATUS_PARTIAL, + DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_VERIFIED, - DEPOSIT_STATUS_LOAD_FAILURE, + EDIT_SE_IRI, + setup_django_for, ) +from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import create_arborescence_archive +from swh.model.identifiers import DIRECTORY, REVISION, SNAPSHOT, swhid +from swh.scheduler import get_scheduler + +# mypy is asked to ignore the import statement above because setup_databases +# is not part of the d.t.utils.__all__ variable. 
TEST_USER = { "username": "test", "password": "password", "email": "test@example.org", "provider_url": "https://hal-test.archives-ouvertes.fr/", "domain": "archives-ouvertes.fr/", "collection": {"name": "test"}, } -TEST_CONFIG = { - "max_upload_size": 500, - "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", - "checks": False, - "provider": { - "provider_name": "", - "provider_type": "deposit_client", - "provider_url": "", - "metadata": {}, - }, - "tool": { - "name": "swh-deposit", - "version": "0.0.1", - "configuration": {"sword_version": "2"}, - }, -} - - def pytest_configure(): setup_django_for("testing") @pytest.fixture() -def deposit_config(): - return TEST_CONFIG +def deposit_config(swh_scheduler_config): + return { + "max_upload_size": 500, + "extraction_dir": "/tmp/swh-deposit/test/extraction-dir", + "checks": False, + "provider": { + "provider_name": "", + "provider_type": "deposit_client", + "provider_url": "", + "metadata": {}, + }, + "scheduler": {"cls": "local", "args": swh_scheduler_config,}, + } -@pytest.fixture(autouse=True) -def deposit_autoconfig(monkeypatch, deposit_config, swh_scheduler_config): - """Enforce config for deposit classes inherited from SWHDefaultConfig.""" - - def mock_parse_config(*args, **kw): - config = deposit_config.copy() - config["scheduler"] = { - "cls": "local", - "args": swh_scheduler_config, - } - return config +@pytest.fixture() +def deposit_config_path(tmp_path, monkeypatch, deposit_config): + conf_path = os.path.join(tmp_path, "deposit.yml") + with open(conf_path, "w") as f: + f.write(yaml.dump(deposit_config)) + monkeypatch.setenv("SWH_CONFIG_FILENAME", conf_path) + return conf_path - monkeypatch.setattr(SWHDefaultConfig, "parse_config_file", mock_parse_config) + +@pytest.fixture(autouse=True) +def deposit_autoconfig(deposit_config_path, swh_scheduler_config): + """Enforce config for deposit classes inherited from APIConfig.""" scheduler = get_scheduler("local", swh_scheduler_config) task_type = { "type": 
"load-deposit", "backend_name": "swh.loader.packages.deposit.tasks.LoadDeposit", - "description": "why does this have not-null constraint?", + "description": "Load deposit task", } scheduler.create_task_type(task_type) @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", "tests"), ("USER", postgresql_proc.user), # noqa ("HOST", postgresql_proc.host), # noqa ("PORT", postgresql_proc.port), # noqa } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) def execute_sql(sql): """Execute sql to postgres db""" with psycopg2.connect(database="postgres") as conn: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) @pytest.fixture(autouse=True, scope="session") def swh_proxy(): """Automatically inject this fixture in all tests to ensure no outside connection takes place. 
""" os.environ["http_proxy"] = "http://localhost:999" os.environ["https_proxy"] = "http://localhost:999" def create_deposit_collection(collection_name: str): """Create a deposit collection with name collection_name """ from swh.deposit.models import DepositCollection try: collection = DepositCollection._default_manager.get(name=collection_name) except DepositCollection.DoesNotExist: collection = DepositCollection(name=collection_name) collection.save() return collection def deposit_collection_factory(collection_name=TEST_USER["collection"]["name"]): @pytest.fixture def _deposit_collection(db, collection_name=collection_name): return create_deposit_collection(collection_name) return _deposit_collection deposit_collection = deposit_collection_factory() deposit_another_collection = deposit_collection_factory("another-collection") @pytest.fixture def deposit_user(db, deposit_collection): """Create/Return the test_user "test" """ from swh.deposit.models import DepositClient try: user = DepositClient._default_manager.get(username=TEST_USER["username"]) except DepositClient.DoesNotExist: user = DepositClient._default_manager.create_user( username=TEST_USER["username"], email=TEST_USER["email"], password=TEST_USER["password"], provider_url=TEST_USER["provider_url"], domain=TEST_USER["domain"], ) user.collections = [deposit_collection.id] user.save() return user @pytest.fixture def client(): """Override pytest-django one which does not work for djangorestframework. 
""" return APIClient() # <- drf's client @pytest.yield_fixture def authenticated_client(client, deposit_user): """Returned a logged client """ _token = "%s:%s" % (deposit_user.username, TEST_USER["password"]) token = base64.b64encode(_token.encode("utf-8")) authorization = "Basic %s" % token.decode("utf-8") client.credentials(HTTP_AUTHORIZATION=authorization) yield client client.logout() @pytest.fixture def sample_archive(tmp_path): """Returns a sample archive """ tmp_path = str(tmp_path) # pytest version limitation in previous version archive = create_arborescence_archive( tmp_path, "archive1", "file1", b"some content in file" ) return archive @pytest.fixture def atom_dataset(datadir) -> Mapping[str, str]: """Compute the paths to atom files. Returns: Dict of atom name per content (bytes) """ atom_path = os.path.join(datadir, "atom") data = {} for filename in os.listdir(atom_path): filepath = os.path.join(atom_path, filename) with open(filepath, "rb") as f: raw_content = f.read().decode("utf-8") # Keep the filename without extension atom_name = filename.split(".")[0] data[atom_name] = raw_content return data def create_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED, ): """Create a skeleton shell deposit """ url = reverse(COL_IRI, args=[collection_name]) # when response = authenticated_client.post( url, content_type="application/zip", # as zip data=sample_archive["data"], # + headers CONTENT_LENGTH=sample_archive["length"], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=sample_archive["md5sum"], HTTP_PACKAGING="http://purl.org/net/sword/package/SimpleZip", HTTP_IN_PROGRESS="false", HTTP_CONTENT_DISPOSITION="attachment; filename=%s" % (sample_archive["name"]), ) # then assert response.status_code == status.HTTP_201_CREATED from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(external_id=external_id) if deposit.status != deposit_status: deposit.status = deposit_status 
deposit.save() assert deposit.status == deposit_status return deposit def create_binary_deposit( authenticated_client, collection_name: str, sample_archive, external_id: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED, atom_dataset: Mapping[str, bytes] = {}, ): """Create a deposit with both metadata and archive set. Then alters its status to `deposit_status`. """ deposit = create_deposit( authenticated_client, collection_name, sample_archive, external_id=external_id, deposit_status=DEPOSIT_STATUS_PARTIAL, ) response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection_name, deposit.id]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data0"] % deposit.external_id.encode("utf-8"), HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS="true", ) assert response.status_code == status.HTTP_201_CREATED assert deposit.status == DEPOSIT_STATUS_PARTIAL from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit.id) if deposit.status != deposit_status: deposit.status = deposit_status deposit.save() assert deposit.status == deposit_status return deposit def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED): """Build deposit with a specific status """ @pytest.fixture() def _deposit( sample_archive, deposit_collection, authenticated_client, deposit_status=deposit_status, ): external_id = "external-id-%s" % deposit_status return create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id=external_id, deposit_status=deposit_status, ) return _deposit deposited_deposit = deposit_factory() rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED) partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL) verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED) completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS) failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE) @pytest.fixture 
def partial_deposit_with_metadata( sample_archive, deposit_collection, authenticated_client, atom_dataset ): """Returns deposit with archive and metadata provided, status 'partial' """ return create_binary_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-partial", deposit_status=DEPOSIT_STATUS_PARTIAL, atom_dataset=atom_dataset, ) @pytest.fixture def partial_deposit_only_metadata( deposit_collection, authenticated_client, atom_dataset ): response = authenticated_client.post( reverse(COL_IRI, args=[deposit_collection.name]), content_type="application/atom+xml;type=entry", data=atom_dataset["entry-data1"], HTTP_SLUG="external-id-partial", HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) deposit_id = response_content["deposit_id"] from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_PARTIAL return deposit @pytest.fixture def complete_deposit(sample_archive, deposit_collection, authenticated_client): """Returns a completed deposit (load success) """ deposit = create_deposit( authenticated_client, deposit_collection.name, sample_archive, external_id="external-id-complete", deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS, ) origin = "https://hal.archives-ouvertes.fr/hal-01727745" directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b" revision_id = "548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10" snapshot_id = "e5e82d064a9c3df7464223042e0c55d72ccff7f0" - deposit.swh_id = persistent_identifier(DIRECTORY, directory_id) - deposit.swh_id_context = persistent_identifier( + deposit.swh_id = swhid(DIRECTORY, directory_id) + deposit.swh_id_context = swhid( DIRECTORY, directory_id, metadata={ "origin": origin, - "visit": persistent_identifier(SNAPSHOT, snapshot_id), - "anchor": persistent_identifier(REVISION, revision_id), + "visit": swhid(SNAPSHOT, snapshot_id), + "anchor": 
swhid(REVISION, revision_id), "path": "/", }, ) deposit.save() return deposit @pytest.fixture() def tmp_path(tmp_path): return str(tmp_path) # issue with oldstable's pytest version diff --git a/swh/deposit/tests/api/data/atom/codemeta-sample.xml b/swh/deposit/tests/data/atom/codemeta-sample.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/codemeta-sample.xml rename to swh/deposit/tests/data/atom/codemeta-sample.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data-badly-formatted.xml b/swh/deposit/tests/data/atom/entry-data-badly-formatted.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data-badly-formatted.xml rename to swh/deposit/tests/data/atom/entry-data-badly-formatted.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data-deposit-binary.xml b/swh/deposit/tests/data/atom/entry-data-deposit-binary.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data-deposit-binary.xml rename to swh/deposit/tests/data/atom/entry-data-deposit-binary.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data-empty-body.xml b/swh/deposit/tests/data/atom/entry-data-empty-body.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data-empty-body.xml rename to swh/deposit/tests/data/atom/entry-data-empty-body.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data-ko.xml b/swh/deposit/tests/data/atom/entry-data-ko.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data-ko.xml rename to swh/deposit/tests/data/atom/entry-data-ko.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data-minimal.xml b/swh/deposit/tests/data/atom/entry-data-minimal.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data-minimal.xml rename to swh/deposit/tests/data/atom/entry-data-minimal.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data-parsing-error-prone.xml b/swh/deposit/tests/data/atom/entry-data-parsing-error-prone.xml 
similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data-parsing-error-prone.xml rename to swh/deposit/tests/data/atom/entry-data-parsing-error-prone.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data0.xml b/swh/deposit/tests/data/atom/entry-data0.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data0.xml rename to swh/deposit/tests/data/atom/entry-data0.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data1.xml b/swh/deposit/tests/data/atom/entry-data1.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data1.xml rename to swh/deposit/tests/data/atom/entry-data1.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data2.xml b/swh/deposit/tests/data/atom/entry-data2.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data2.xml rename to swh/deposit/tests/data/atom/entry-data2.xml diff --git a/swh/deposit/tests/api/data/atom/entry-data3.xml b/swh/deposit/tests/data/atom/entry-data3.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-data3.xml rename to swh/deposit/tests/data/atom/entry-data3.xml diff --git a/swh/deposit/tests/api/data/atom/entry-update-in-place.xml b/swh/deposit/tests/data/atom/entry-update-in-place.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/entry-update-in-place.xml rename to swh/deposit/tests/data/atom/entry-update-in-place.xml diff --git a/swh/deposit/tests/api/data/atom/error-with-decimal.xml b/swh/deposit/tests/data/atom/error-with-decimal.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/error-with-decimal.xml rename to swh/deposit/tests/data/atom/error-with-decimal.xml diff --git a/swh/deposit/tests/api/data/atom/metadata.xml b/swh/deposit/tests/data/atom/metadata.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/metadata.xml rename to swh/deposit/tests/data/atom/metadata.xml diff --git a/swh/deposit/tests/api/data/atom/tei-sample.xml 
b/swh/deposit/tests/data/atom/tei-sample.xml similarity index 100% rename from swh/deposit/tests/api/data/atom/tei-sample.xml rename to swh/deposit/tests/data/atom/tei-sample.xml diff --git a/swh/deposit/tests/loader/common.py b/swh/deposit/tests/loader/common.py index 510830af..0ebbc603 100644 --- a/swh/deposit/tests/loader/common.py +++ b/swh/deposit/tests/loader/common.py @@ -1,138 +1,139 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json +from typing import Dict, Optional -from typing import Dict from swh.deposit.client import PrivateApiDepositClient - from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.model import SnapshotBranch, TargetType +from swh.storage.algos.snapshot import snapshot_get_all_branches CLIENT_TEST_CONFIG = { "url": "http://nowhere:9000/", "auth": {}, # no authentication in test scenario } class SWHDepositTestClient(PrivateApiDepositClient): """Deposit test client to permit overriding the default request client. 
""" def __init__(self, client, config): super().__init__(config=config) self.client = client def archive_get(self, archive_update_url, archive_path, log=None): r = self.client.get(archive_update_url) with open(archive_path, "wb") as f: for chunk in r.streaming_content: f.write(chunk) return archive_path def metadata_get(self, metadata_url, log=None): r = self.client.get(metadata_url) return json.loads(r.content.decode("utf-8")) def status_update( self, update_status_url, status, revision_id=None, directory_id=None, origin_url=None, ): payload = {"status": status} if revision_id: payload["revision_id"] = revision_id if directory_id: payload["directory_id"] = directory_id if origin_url: payload["origin_url"] = origin_url self.client.put( update_status_url, content_type="application/json", data=json.dumps(payload) ) def check(self, check_url): r = self.client.get(check_url) data = json.loads(r.content.decode("utf-8")) return data["status"] def get_stats(storage) -> Dict: """Adaptation utils to unify the stats counters across storage implementation. """ storage.refresh_stat_counters() stats = storage.stat_counters() keys = [ "content", "directory", "origin", "origin_visit", "person", "release", "revision", "skipped_content", "snapshot", ] return {k: stats.get(k) for k in keys} -def decode_target(target): +def decode_target(branch: Optional[SnapshotBranch]) -> Optional[Dict]: """Test helper to ease readability in test """ - if not target: - return target - target_type = target["target_type"] + if not branch: + return None + target_type = branch.target_type - if target_type == "alias": - decoded_target = target["target"].decode("utf-8") + if target_type == TargetType.ALIAS: + decoded_target = branch.target.decode("utf-8") else: - decoded_target = hash_to_hex(target["target"]) + decoded_target = hash_to_hex(branch.target) return {"target": decoded_target, "target_type": target_type} def check_snapshot(expected_snapshot, storage): """Check for snapshot match. 
Provide the hashes as hexadecimal, the conversion is done within the method. Args: expected_snapshot (dict): full snapshot with hex ids storage (Storage): expected storage """ expected_snapshot_id = expected_snapshot["id"] expected_branches = expected_snapshot["branches"] - snap = storage.snapshot_get(hash_to_bytes(expected_snapshot_id)) + snap = snapshot_get_all_branches(hash_to_bytes(expected_snapshot_id)) if snap is None: # display known snapshots instead if possible if hasattr(storage, "_snapshots"): # in-mem storage from pprint import pprint for snap_id, (_snap, _) in storage._snapshots.items(): snapd = _snap.to_dict() snapd["id"] = hash_to_hex(snapd["id"]) branches = { branch.decode("utf-8"): decode_target(target) for branch, target in snapd["branches"].items() } snapd["branches"] = branches pprint(snapd) raise AssertionError("Snapshot is not found") branches = { - branch.decode("utf-8"): decode_target(target) - for branch, target in snap["branches"].items() + branch.decode("utf-8"): decode_target(branch) + for branch_name, branch in snap["branches"].items() } assert expected_branches == branches diff --git a/swh/deposit/tests/loader/conftest.py b/swh/deposit/tests/loader/conftest.py index e340da91..260bd327 100644 --- a/swh/deposit/tests/loader/conftest.py +++ b/swh/deposit/tests/loader/conftest.py @@ -1,65 +1,37 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from functools import partial import re -import os -import pytest -import yaml -from functools import partial +import pytest from swh.core.pytest_plugin import get_response_cb -from swh.scheduler.tests.conftest import * # noqa -from swh.storage.tests.conftest import * # noqa from swh.deposit.loader.checker import DepositChecker 
-@pytest.fixture(scope="session") # type: ignore # expected redefinition -def celery_includes(): - return [ - "swh.deposit.loader.tasks", - ] - - @pytest.fixture -def swh_config(tmp_path, swh_storage_postgresql, monkeypatch): - storage_config = { - "url": "https://deposit.softwareheritage.org/", - "storage": { - "cls": "local", - "args": { - "db": swh_storage_postgresql.dsn, - "objstorage": {"cls": "memory", "args": {}}, - }, - }, +def deposit_config(tmp_path): + return { + "deposit": { + "url": "https://deposit.softwareheritage.org/1/private/", + "auth": {}, + } } - conffile = os.path.join(tmp_path, "deposit.yml") - with open(conffile, "w") as f: - f.write(yaml.dump(storage_config)) - monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) - return conffile - @pytest.fixture -def deposit_checker(): - return DepositChecker( - config={ - "deposit": { - "url": "https://deposit.softwareheritage.org/1/private/", - "auth": {}, - } - } - ) +def deposit_checker(deposit_config_path): + return DepositChecker() @pytest.fixture def requests_mock_datadir(datadir, requests_mock_datadir): """Override default behavior to deal with put method """ cb = partial(get_response_cb, datadir=datadir) requests_mock_datadir.put(re.compile("https://"), body=cb) return requests_mock_datadir diff --git a/swh/deposit/tests/loader/test_checker.py b/swh/deposit/tests/loader/test_checker.py index c299b3bd..60d451ef 100644 --- a/swh/deposit/tests/loader/test_checker.py +++ b/swh/deposit/tests/loader/test_checker.py @@ -1,32 +1,32 @@ -# Copyright (C) 2017-2019 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch -def test_check_deposit_ready(swh_config, requests_mock_datadir, deposit_checker): +def 
test_checker_deposit_ready(requests_mock_datadir, deposit_checker): """Check on a valid 'deposited' deposit should result in 'verified' """ actual_result = deposit_checker.check(collection="test", deposit_id=1) assert actual_result == {"status": "eventful"} -def test_check_deposit_rejected(swh_config, requests_mock_datadir, deposit_checker): +def test_checker_deposit_rejected(requests_mock_datadir, deposit_checker): """Check on invalid 'deposited' deposit should result in 'rejected' """ actual_result = deposit_checker.check(collection="test", deposit_id=2) assert actual_result == {"status": "failed"} @patch("swh.deposit.client.requests.get") -def test_check_deposit_rejected_exception(mock_requests, swh_config, deposit_checker): +def test_checker_deposit_rejected_exception(mock_requests, deposit_checker): """Check on invalid 'deposited' deposit should result in 'rejected' """ mock_requests.side_effect = ValueError("simulated problem when checking") actual_result = deposit_checker.check(collection="test", deposit_id=3) assert actual_result == {"status": "failed"} diff --git a/swh/deposit/tests/loader/test_client.py b/swh/deposit/tests/loader/test_client.py index 4f099d40..55edd2c7 100644 --- a/swh/deposit/tests/loader/test_client.py +++ b/swh/deposit/tests/loader/test_client.py @@ -1,247 +1,246 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os import json -import pytest -import unittest - +import os from typing import Any, Callable, Optional +import unittest from urllib.parse import urlparse -from swh.deposit.client import PrivateApiDepositClient -from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE +import pytest +from swh.deposit.client import PrivateApiDepositClient +from swh.deposit.config import 
DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_SUCCESS CLIENT_TEST_CONFIG = { "url": "https://nowhere.org/", "auth": {}, # no authentication in test scenario } def build_expected_path(datadir, base_url: str, api_url: str) -> str: """Build expected path from api to served file """ url = urlparse(base_url) dirname = "%s_%s" % (url.scheme, url.hostname) if api_url.endswith("/"): api_url = api_url[:-1] if api_url.startswith("/"): api_url = api_url[1:] suffix_path = api_url.replace("/", "_") return os.path.join(datadir, dirname, suffix_path) def test_build_expected_path(datadir): actual_path = build_expected_path(datadir, "http://example.org", "/hello/you/") assert actual_path == os.path.join(datadir, "http_example.org", "hello_you") def read_served_path( datadir, base_url: str, api_url: str, convert_fn: Optional[Callable[[str], Any]] = None, ) -> bytes: """Read served path """ archive_path = build_expected_path(datadir, base_url, api_url) with open(archive_path, "rb") as f: content = f.read() if convert_fn: content = convert_fn(content.decode("utf-8")) return content def test_read_served_path(datadir): actual_content = read_served_path(datadir, "http://example.org", "/hello/you/") assert actual_content == b"hello people\n" actual_content2 = read_served_path( datadir, "http://example.org", "/hello.json", convert_fn=json.loads ) assert actual_content2 == {"a": [1, 3]} # private api to retrieve archive def test_archive_get(tmp_path, datadir, requests_mock_datadir): """Retrieving archive data through private api should stream data """ api_url = "/1/private/test/1/raw/" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) expected_content = read_served_path(datadir, client.base_url, api_url) archive_path = os.path.join(tmp_path, "test.archive") archive_path = client.archive_get(api_url, archive_path) assert os.path.exists(archive_path) is True with open(archive_path, "rb") as f: actual_content = f.read() assert actual_content == expected_content assert 
client.base_url == CLIENT_TEST_CONFIG["url"] assert client.auth is None def test_archive_get_auth(tmp_path, datadir, requests_mock_datadir): """Retrieving archive data through private api should stream data """ api_url = "/1/private/test/1/raw/" config = CLIENT_TEST_CONFIG.copy() config["auth"] = { # add authentication setup "username": "user", "password": "pass", } client = PrivateApiDepositClient(config) expected_content = read_served_path(datadir, client.base_url, api_url) archive_path = os.path.join(tmp_path, "test.archive") archive_path = client.archive_get(api_url, archive_path) assert os.path.exists(archive_path) is True with open(archive_path, "rb") as f: actual_content = f.read() assert actual_content == expected_content assert client.base_url == CLIENT_TEST_CONFIG["url"] assert client.auth == ("user", "pass") def test_archive_get_ko(tmp_path, datadir, requests_mock_datadir): """Reading archive can fail for some reasons """ unknown_api_url = "/1/private/unknown/deposit-id/raw/" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) with pytest.raises(ValueError, match="Problem when retrieving deposit"): client.archive_get(unknown_api_url, "some/path") # private api read metadata def test_metadata_get(datadir, requests_mock_datadir): """Reading archive should write data in temporary directory """ api_url = "/1/private/test/1/metadata" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) actual_metadata = client.metadata_get(api_url) assert isinstance(actual_metadata, str) is False expected_content = read_served_path( datadir, client.base_url, api_url, convert_fn=json.loads ) assert actual_metadata == expected_content def test_metadata_get_ko(requests_mock_datadir): """Reading metadata can fail for some reasons """ unknown_api_url = "/1/private/unknown/deposit-id/metadata/" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) with pytest.raises(ValueError, match="Problem when retrieving metadata"): client.metadata_get(unknown_api_url) # 
private api check def test_check(requests_mock_datadir): """When check ok, this should return the deposit's status """ api_url = "/1/private/test/1/check" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) r = client.check(api_url) assert r == "something" def test_check_fails(requests_mock_datadir): """Checking deposit can fail for some reason """ unknown_api_url = "/1/private/test/10/check" client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG) with pytest.raises(ValueError, match="Problem when checking deposit"): client.check(unknown_api_url) # private api update status class FakeRequestClientPut: """Fake Request client dedicated to put request method calls. """ args = None kwargs = None def put(self, *args, **kwargs): self.args = args self.kwargs = kwargs class PrivateApiDepositClientStatusUpdateTest(unittest.TestCase): def test_status_update(self): """Update status """ _client = FakeRequestClientPut() deposit_client = PrivateApiDepositClient( config=CLIENT_TEST_CONFIG, _client=_client ) deposit_client.status_update( "/update/status", DEPOSIT_STATUS_LOAD_SUCCESS, revision_id="some-revision-id", ) self.assertEqual(_client.args, ("https://nowhere.org/update/status",)) self.assertEqual( _client.kwargs, { "json": { "status": DEPOSIT_STATUS_LOAD_SUCCESS, "revision_id": "some-revision-id", } }, ) def test_status_update_with_no_revision_id(self): """Reading metadata can fail for some reasons """ _client = FakeRequestClientPut() deposit_client = PrivateApiDepositClient( config=CLIENT_TEST_CONFIG, _client=_client ) deposit_client.status_update("/update/status/fail", DEPOSIT_STATUS_LOAD_FAILURE) self.assertEqual(_client.args, ("https://nowhere.org/update/status/fail",)) self.assertEqual( _client.kwargs, {"json": {"status": DEPOSIT_STATUS_LOAD_FAILURE,}} ) diff --git a/swh/deposit/tests/loader/test_tasks.py b/swh/deposit/tests/loader/test_tasks.py index c62fd45a..5f85ebcd 100644 --- a/swh/deposit/tests/loader/test_tasks.py +++ 
b/swh/deposit/tests/loader/test_tasks.py @@ -1,69 +1,75 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest @pytest.mark.db -def test_deposit_check_eventful(mocker, swh_config, swh_app, celery_session_worker): +def test_task_check_eventful( + mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker +): """Successful check should make the check succeed """ client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") client.return_value = "verified" collection = "collection" deposit_id = 42 - res = swh_app.send_task( + res = swh_scheduler_celery_app.send_task( "swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id] ) assert res res.wait() assert res.successful() assert res.result == {"status": "eventful"} client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") @pytest.mark.db -def test_deposit_check_failure(mocker, swh_config, swh_app, celery_session_worker): +def test_task_check_failure( + mocker, deposit_config_path, swh_scheduler_celery_app, swh_scheduler_celery_worker +): """Unverified check status should make the check fail """ client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") client.return_value = "not-verified" # will make the status "failed" collection = "collec" deposit_id = 666 - res = swh_app.send_task( + res = swh_scheduler_celery_app.send_task( "swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id] ) assert res res.wait() assert res.successful() assert res.result == {"status": "failed"} client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") @pytest.mark.db -def test_deposit_check_3(mocker, swh_config, swh_app, celery_session_worker): +def test_task_check_3( + mocker, deposit_config_path, 
swh_scheduler_celery_app, swh_scheduler_celery_worker +): """Unexpected failures should fail the check """ client = mocker.patch("swh.deposit.loader.checker.PrivateApiDepositClient.check") client.side_effect = ValueError("unexpected failure will make it fail") collection = "another-collection" deposit_id = 999 - res = swh_app.send_task( + res = swh_scheduler_celery_app.send_task( "swh.deposit.loader.tasks.ChecksDepositTsk", args=[collection, deposit_id] ) assert res res.wait() assert res.successful() assert res.result == {"status": "failed"} client.assert_called_once_with(f"/{collection}/{deposit_id}/check/") diff --git a/swh/deposit/tests/test_init.py b/swh/deposit/tests/test_init.py new file mode 100644 index 00000000..88fca573 --- /dev/null +++ b/swh/deposit/tests/test_init.py @@ -0,0 +1,10 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_version(): + from swh.deposit.api import __version__ + + assert __version__ is not None diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py index 644d8f33..8be41c4c 100644 --- a/swh/deposit/tests/test_utils.py +++ b/swh/deposit/tests/test_utils.py @@ -1,141 +1,141 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import pytest - from unittest.mock import patch +import pytest + from swh.deposit import utils def test_merge(): """Calling utils.merge on dicts should merge without losing information """ d0 = {"author": "someone", "license": [["gpl2"]], "a": 1} d1 = { "author": ["author0", {"name": "author1"}], "license": [["gpl3"]], "b": {"1": "2"}, } d2 = {"author": map(lambda x: x, 
["else"]), "license": "mit", "b": {"2": "3",}} d3 = { "author": (v for v in ["no one"]), } actual_merge = utils.merge(d0, d1, d2, d3) expected_merge = { "a": 1, "license": [["gpl2"], ["gpl3"], "mit"], "author": ["someone", "author0", {"name": "author1"}, "else", "no one"], "b": {"1": "2", "2": "3",}, } assert actual_merge == expected_merge def test_merge_2(): d0 = {"license": "gpl2", "runtime": {"os": "unix derivative"}} d1 = {"license": "gpl3", "runtime": "GNU/Linux"} expected = { "license": ["gpl2", "gpl3"], "runtime": [{"os": "unix derivative"}, "GNU/Linux"], } actual = utils.merge(d0, d1) assert actual == expected def test_merge_edge_cases(): input_dict = { "license": ["gpl2", "gpl3"], "runtime": [{"os": "unix derivative"}, "GNU/Linux"], } # against empty dict actual = utils.merge(input_dict, {}) assert actual == input_dict # against oneself actual = utils.merge(input_dict, input_dict, input_dict) assert actual == input_dict def test_merge_one_dict(): """Merge one dict should result in the same dict value """ input_and_expected = {"anything": "really"} actual = utils.merge(input_and_expected) assert actual == input_and_expected def test_merge_raise(): """Calling utils.merge with any no dict argument should raise """ d0 = {"author": "someone", "a": 1} d1 = ["not a dict"] with pytest.raises(ValueError): utils.merge(d0, d1) with pytest.raises(ValueError): utils.merge(d1, d0) with pytest.raises(ValueError): utils.merge(d1) assert utils.merge(d0) == d0 @patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x) def test_normalize_date_0(mock_normalize): """When date is a list, choose the first date and normalize it Note: We do not test swh.model.identifiers which is already tested in swh.model """ actual_date = utils.normalize_date(["2017-10-12", "date1"]) expected_date = "2017-10-12 00:00:00+00:00" assert str(actual_date) == expected_date @patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x) def test_normalize_date_1(mock_normalize): 
"""Providing a date in a reasonable format, everything is fine Note: We do not test swh.model.identifiers which is already tested in swh.model """ actual_date = utils.normalize_date("2018-06-11 17:02:02") expected_date = "2018-06-11 17:02:02+00:00" assert str(actual_date) == expected_date @patch("swh.deposit.utils.normalize_timestamp", side_effect=lambda x: x) def test_normalize_date_doing_irrelevant_stuff(mock_normalize): """Providing a date with only the year results in a reasonable date Note: We do not test swh.model.identifiers which is already tested in swh.model """ actual_date = utils.normalize_date("2017") expected_date = "2017-01-01 00:00:00+00:00" assert str(actual_date) == expected_date diff --git a/swh/deposit/urls.py b/swh/deposit/urls.py index 384844c3..9f6ab0eb 100644 --- a/swh/deposit/urls.py +++ b/swh/deposit/urls.py @@ -1,31 +1,31 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """SWH's main deposit URL Configuration """ -from django.conf.urls import url, include +from django.conf.urls import include, url from django.shortcuts import render from django.views.generic.base import RedirectView from rest_framework.urlpatterns import format_suffix_patterns favicon_view = RedirectView.as_view( url="/static/img/icons/swh-logo-32x32.png", permanent=True ) def default_view(req): return render(req, "homepage.html") urlpatterns = [ url(r"^favicon\.ico$", favicon_view), url(r"^1/", include("swh.deposit.api.urls")), url(r"^1/private/", include("swh.deposit.api.private.urls")), url(r"^$", default_view, name="home"), ] urlpatterns = format_suffix_patterns(urlpatterns) diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py index ee3711db..3b79293e 100644 --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -1,83 +1,83 @@ # Copyright (C) 2018-2019 The 
Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import iso8601 - from types import GeneratorType +import iso8601 + from swh.model.identifiers import normalize_timestamp def merge(*dicts): """Given an iterator of dicts, merge them losing no information. Args: *dicts: arguments are all supposed to be dict to merge into one Returns: dict merged without losing information """ def _extend(existing_val, value): """Given an existing value and a value (as potential lists), merge them together without repetition. """ if isinstance(value, (list, map, GeneratorType)): vals = value else: vals = [value] for v in vals: if v in existing_val: continue existing_val.append(v) return existing_val d = {} for data in dicts: if not isinstance(data, dict): raise ValueError("dicts is supposed to be a variable arguments of dict") for key, value in data.items(): existing_val = d.get(key) if not existing_val: d[key] = value continue if isinstance(existing_val, (list, map, GeneratorType)): new_val = _extend(existing_val, value) elif isinstance(existing_val, dict): if isinstance(value, dict): new_val = merge(existing_val, value) else: new_val = _extend([existing_val], value) else: new_val = _extend([existing_val], value) d[key] = new_val return d def normalize_date(date): """Normalize date fields as expected by swh workers. If date is a list, elect arbitrarily the first element of that list If date is (then) a string, parse it through dateutil.parser.parse to extract a datetime. Then normalize it through swh.model.identifiers.normalize_timestamp. 
Returns The swh date object """ if isinstance(date, list): date = date[0] if isinstance(date, str): date = iso8601.parse_date(date) return normalize_timestamp(date) diff --git a/tox.ini b/tox.ini index 00c7376d..625647a4 100644 --- a/tox.ini +++ b/tox.ini @@ -1,44 +1,45 @@ [tox] envlist=flake8,mypy,py3-django2 [testenv] extras = testing deps = # the dependency below is needed for now as a workaround for # https://github.com/pypa/pip/issues/6239 - swh.core[http] >= 0.0.75 + swh.core[http] >= 0.3 + swh.scheduler[testing] >= 0.5.0 dev: pdbpp pytest-cov django2: Django>=2,<3 commands = pytest \ !dev: --cov {envsitepackagesdir}/swh/deposit --cov-branch \ {envsitepackagesdir}/swh/deposit \ {posargs} [testenv:black] skip_install = true deps = black commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = flake8 commands = {envpython} -m flake8 \ --exclude=.tox,.git,__pycache__,.tox,.eggs,*.egg,swh/deposit/migrations [testenv:mypy] setenv = DJANGO_SETTINGS_MODULE=swh.deposit.settings.testing extras = testing deps = mypy django-stubs djangorestframework-stubs commands = mypy swh diff --git a/version.txt b/version.txt deleted file mode 100644 index c5d8e33c..00000000 --- a/version.txt +++ /dev/null @@ -1 +0,0 @@ -v0.0.90-0-gc586ff17 \ No newline at end of file