diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c279bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +*.pyc +*.sw? +*~ +/.coverage +/.coverage.* +.eggs/ +__pycache__ +*.egg-info/ +build/ +dist/ +version.txt +/sql/createdb-stamp +/sql/filldb-stamp +.tox/ +.hypothesis/ +.mypy_cache/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d1f84e3 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,40 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.4.0 + hooks: + - id: trailing-whitespace + - id: flake8 + - id: check-json + - id: check-yaml + +- repo: https://github.com/codespell-project/codespell + rev: v1.16.0 + hooks: + - id: codespell + exclude: ^(swh/indexer/data/codemeta/crosswalk.csv)$ +- repo: local + hooks: + - id: mypy + name: mypy + entry: mypy + args: [swh] + pass_filenames: false + language: system + types: [python] + +- repo: https://github.com/python/black + rev: 19.10b0 + hooks: + - id: black + +# unfortunately, we are far from being able to enable this... +# - repo: https://github.com/PyCQA/pydocstyle.git +# rev: 4.0.0 +# hooks: +# - id: pydocstyle +# name: pydocstyle +# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. +# entry: pydocstyle --convention=google +# language: python +# types: [python] + diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..27d038e --- /dev/null +++ b/AUTHORS @@ -0,0 +1,3 @@ +Copyright (C) 2015-2017 The Software Heritage developers + +See http://www.softwareheritage.org/ for more information. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..0ad22b5 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,78 @@ +# Software Heritage Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as Software +Heritage contributors and maintainers pledge to making participation in our +project and our community a harassment-free experience for everyone, regardless +of age, body size, disability, ethnicity, sex characteristics, gender identity +and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at `conduct@softwareheritage.org`. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an
+incident. Further details of specific enforcement policies may be posted
+separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..a1a7b45
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,2 @@
+Siddharth Ravikumar
+Thibault Allançon
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+ + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. 
+ + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. 
+ + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. 
+ + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. 
+ + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. 
(Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. 
+ + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. 
If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. 
Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/MANIFEST.in b/MANIFEST.in
index c6e3a9a..d5bc305 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,8 +1,9 @@
 include README.md
 include Makefile
 include requirements*.txt
 include version.txt
+include conftest.py
 recursive-include sql *
 recursive-include swh/indexer/sql *.sql
 recursive-include swh/indexer/data *
 recursive-include swh py.typed
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
index 0000000..c163514
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1 @@
+TESTFLAGS=--hypothesis-profile=fast
diff --git a/PKG-INFO b/PKG-INFO
index a2920a6..06fbd34 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,71 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 0.1.0
+Version: 0.1.1
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Description: swh-indexer
         ============

         Tools to compute multiple indexes on SWH's raw contents:
         - content:
           - mimetype
           - ctags
           - language
           - fossology-license
           - metadata
         - revision:
           - metadata

         An indexer is in charge of:
         - looking up objects
         - extracting information from those objects
         - storing that information in the swh-indexer db

         There are multiple indexers working on different object types:
           - content indexer: works with content sha1 hashes
           - revision indexer: works with revision sha1 hashes
           - origin indexer: works with origin identifiers

         Indexation procedure:
         - receive batch of ids
         - retrieve the associated data depending on object type
         - compute some index for that object
         - store the result in swh's storage

         Current content indexers:

         - mimetype (queue swh_indexer_content_mimetype): detect the encoding
           and mimetype

         - language (queue swh_indexer_content_language): detect the
           programming language

         - ctags (queue swh_indexer_content_ctags): compute tags information

         - fossology-license (queue swh_indexer_fossology_license): compute the
           license

         - metadata: translate files into a translated_metadata dict

         Current revision indexers:

         - metadata: detects files containing metadata and retrieves translated_metadata
           in the content_metadata table in storage, or runs the content indexer to
           translate files.
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/codemeta.json b/codemeta.json new file mode 100644 index 0000000..8eaf5cc --- /dev/null +++ b/codemeta.json @@ -0,0 +1,39 @@ +{ + "@context": "https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld", + "@type": "SoftwareSourceCode", + "identifier": "5682a72dc61f86ae69f2841c2184d6159c0b6d5d", + "description": "Software Heritage Indexer for revisions and contents", + "name": "swh-indexer", + "isPartOf": { + "@type": "SoftwareSourceCode", + "name": "swh-environment", + "identifier": "83e766feafde91242883be1bf369ed3e6865824f" + }, + "codeRepository": "https://forge.softwareheritage.org/diffusion/78/", + "issueTracker": "https://forge.softwareheritage.org/maniphest/", + "license": "https://spdx.org/licenses/GPL-3.0.html", + "version": "0.0.35", + "author": [ + { + "@type": "Organization", + "name": "Software Heritage", + "url": "https://www.softwareheritage.org", + "email": "swh-devel@inria.fr" + } + ], + "developmentStatus": "active", + "keywords": [ + "indexer", + "software", + "mimetype", + "ctags", + "language", + "fossology-license", + "metadata", + "metadata-detector", + "metadata-translator" + ], + "dateCreated":"2017-06-12", + "datePublished":"2017-06-12", + "programmingLanguage": "Python" +} diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..de31662 --- /dev/null +++ b/conftest.py @@ -0,0 +1,19 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from hypothesis import settings + +# define tests profile. 
Full documentation is at:
+# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
+settings.register_profile("fast", max_examples=5, deadline=5000)
+settings.register_profile("slow", max_examples=20, deadline=5000)
+
+# Ignore the following modules because the wsgi module fails as no
+# configuration file is found (--doctest-modules forces the module
+# loading)
+collect_ignore = ["swh/indexer/storage/api/wsgi.py"]
+
+# we use the swh_scheduler fixture
+pytest_plugins = ["swh.scheduler.pytest_plugin"]
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..58a761e
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,3 @@
+_build/
+apidoc/
+*-stamp
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..59d8b80
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,3 @@
+include ../../swh-docs/Makefile.sphinx
+-include Makefile.local
+
diff --git a/docs/Makefile.local b/docs/Makefile.local
new file mode 100644
index 0000000..cd07101
--- /dev/null
+++ b/docs/Makefile.local
@@ -0,0 +1,11 @@
+sphinx/html: images
+sphinx/clean: clean-images
+assets: images
+
+images:
+	make -C images/
+clean-images:
+	make -C images/ clean
+
+.PHONY: images clean-images
+
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..f4f2481
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,49 @@
+swh-indexer
+============
+
+Tools to compute multiple indexes on SWH's raw contents:
+- content:
+  - mimetype
+  - ctags
+  - language
+  - fossology-license
+  - metadata
+- revision:
+  - metadata
+
+An indexer is in charge of:
+- looking up objects
+- extracting information from those objects
+- storing that information in the swh-indexer db
+
+There are multiple indexers working on different object types:
+  - content indexer: works with content sha1 hashes
+  - revision indexer: works with revision sha1 hashes
+  - origin indexer: works with origin identifiers
+
+Indexation procedure:
+- receive batch of ids
+- retrieve the associated data depending on object type
+- compute some index for that object
+- store the result in swh's storage
+
+Current content indexers:
+
+- mimetype (queue swh_indexer_content_mimetype): detect the encoding
+  and mimetype
+
+- language (queue swh_indexer_content_language): detect the
+  programming language
+
+- ctags (queue swh_indexer_content_ctags): compute tags information
+
+- fossology-license (queue swh_indexer_fossology_license): compute the
+  license
+
+- metadata: translate files into a translated_metadata dict
+
+Current revision indexers:
+
+- metadata: detects files containing metadata and retrieves translated_metadata
+  in the content_metadata table in storage, or runs the content indexer to
+  translate files.
diff --git a/docs/_static/.placeholder b/docs/_static/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/docs/_templates/.placeholder b/docs/_templates/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..190deb7
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1 @@
+from swh.docs.sphinx.conf import *  # NoQA
diff --git a/docs/dev-info.rst b/docs/dev-info.rst
new file mode 100644
index 0000000..493b102
--- /dev/null
+++ b/docs/dev-info.rst
@@ -0,0 +1,206 @@
+Hacking on swh-indexer
+======================
+
+This tutorial will guide you through hacking on swh-indexer.
+If you do not have a local copy of the Software Heritage archive, go to the
+`getting started tutorial
+<https://docs.softwareheritage.org/devel/getting-started.html>`_
+
+Configuration files
+-------------------
+You will need the following YAML configuration files to run the swh-indexer
+commands:
+
+- Orchestrator at
+  ``~/.config/swh/indexer/orchestrator.yml``
+
+.. code-block:: yaml
+
+  indexers:
+    mimetype:
+      check_presence: false
+      batch_size: 100
+
+- Orchestrator-text at
+  ``~/.config/swh/indexer/orchestrator-text.yml``
+
+.. code-block:: yaml
+
+  indexers:
+    # language:
+    #   batch_size: 10
+    #   check_presence: false
+    fossology_license:
+      batch_size: 10
+      check_presence: false
+    # ctags:
+    #   batch_size: 2
+    #   check_presence: false
+
+- Mimetype indexer at
+  ``~/.config/swh/indexer/mimetype.yml``
+
+.. code-block:: yaml
+
+  # storage to read sha1's metadata (path)
+  # storage:
+  #   cls: local
+  #   args:
+  #     db: "service=swh-dev"
+  #     objstorage:
+  #       cls: pathslicing
+  #       args:
+  #         root: /home/storage/swh-storage/
+  #         slicing: 0:1/1:5
+
+  storage:
+    cls: remote
+    args:
+      url: http://localhost:5002/
+
+  indexer_storage:
+    cls: remote
+    args:
+      url: http://localhost:5007/
+
+  # storage to read sha1's content
+  # adapt this to your need
+  # locally: this needs to match your storage's setup
+  objstorage:
+    cls: pathslicing
+    args:
+      slicing: 0:1/1:5
+      root: /home/storage/swh-storage/
+
+  destination_task: swh.indexer.tasks.SWHOrchestratorTextContentsTask
+  rescheduling_task: swh.indexer.tasks.SWHContentMimetypeTask
+
+
+- Fossology indexer at
+  ``~/.config/swh/indexer/fossology_license.yml``
+
+.. code-block:: yaml
+
+  # storage to read sha1's metadata (path)
+  # storage:
+  #   cls: local
+  #   args:
+  #     db: "service=swh-dev"
+  #     objstorage:
+  #       cls: pathslicing
+  #       args:
+  #         root: /home/storage/swh-storage/
+  #         slicing: 0:1/1:5
+
+  storage:
+    cls: remote
+    url: http://localhost:5002/
+
+  indexer_storage:
+    cls: remote
+    args:
+      url: http://localhost:5007/
+
+  # storage to read sha1's content
+  # adapt this to your need
+  # locally: this needs to match your storage's setup
+  objstorage:
+    cls: pathslicing
+    args:
+      slicing: 0:1/1:5
+      root: /home/storage/swh-storage/
+
+  workdir: /tmp/swh/worker.indexer/license/
+
+  tools:
+    name: 'nomos'
+    version: '3.1.0rc2-31-ga2cbb8c'
+    configuration:
+      command_line: 'nomossa <filepath>'
+
+
+- Worker at
+  ``~/.config/swh/worker.yml``
+
+.. code-block:: yaml
+
+  task_broker: amqp://guest@localhost//
+  task_modules:
+    - swh.loader.svn.tasks
+    - swh.loader.tar.tasks
+    - swh.loader.git.tasks
+    - swh.storage.archiver.tasks
+    - swh.indexer.tasks
+    - swh.indexer.orchestrator
+  task_queues:
+    - swh_loader_svn
+    - swh_loader_tar
+    - swh_reader_git_to_azure_archive
+    - swh_storage_archive_worker_to_backend
+    - swh_indexer_orchestrator_content_all
+    - swh_indexer_orchestrator_content_text
+    - swh_indexer_content_mimetype
+    - swh_indexer_content_language
+    - swh_indexer_content_ctags
+    - swh_indexer_content_fossology_license
+    - swh_loader_svn_mount_and_load
+    - swh_loader_git_express
+    - swh_loader_git_archive
+    - swh_loader_svn_archive
+  task_soft_time_limit: 0
+
+
+Database
+--------
+
+swh-indexer uses a database to store the indexed content. The default
+db is expected to be called swh-indexer-dev.
+
+Create or add ``swh-dev`` and ``swh-indexer-dev`` to
+the ``~/.pg_service.conf`` and ``~/.pgpass`` files, which are PostgreSQL's
+configuration files.
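+
+For example, a minimal ``~/.pg_service.conf`` entry for the indexer database
+could look like the following sketch; the host, port and user values are
+placeholders to adapt to your local setup::
+
+  [swh-indexer-dev]
+  dbname=swh-indexer-dev
+  host=localhost
+  port=5432
+  user=postgres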
+
+Add data to local DB
+--------------------
+From within the ``swh-environment``, run the following command::
+
+  make rebuild-testdata
+
+and fetch some real data to work with, using::
+
+  python3 -m swh.loader.git.updater --origin-url <url>
+
+Then you can list all content files using this script::
+
+  #!/usr/bin/env bash
+
+  psql service=swh-dev -c "copy (select sha1 from content) to stdout" | sed -e 's/^\\x//g'
+
+Run the indexers
+-----------------
+Use the list of contents to feed the indexers with the
+following command::
+
+  ./list-sha1.sh | python3 -m swh.indexer.producer --batch 100 --task-name orchestrator_all
+
+Activate the workers
+--------------------
+To send messages to different queues using rabbitmq
+(which should already be installed through the dependencies installation),
+run the following command in a dedicated terminal::
+
+  python3 -m celery worker --app=swh.scheduler.celery_backend.config.app \
+                           --pool=prefork \
+                           --concurrency=1 \
+                           -Ofair \
+                           --loglevel=info \
+                           --without-gossip \
+                           --without-mingle \
+                           --without-heartbeat 2>&1
+
+With this command, rabbitmq will consume messages using the worker
+configuration file.
+
+Note: for the fossology_license indexer, you need the fossology-nomossa
+package, which is in our `public debian repository
+`_.
diff --git a/docs/images/.gitignore b/docs/images/.gitignore
new file mode 100644
index 0000000..d890b03
--- /dev/null
+++ b/docs/images/.gitignore
@@ -0,0 +1 @@
+tasks-metadata-indexers.svg
diff --git a/docs/images/Makefile b/docs/images/Makefile
new file mode 100644
index 0000000..3481956
--- /dev/null
+++ b/docs/images/Makefile
@@ -0,0 +1,11 @@
+
+UML_DIAGS_SRC = $(wildcard *.uml)
+UML_DIAGS = $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC))
+
+all: $(UML_DIAGS)
+
+%.svg: %.uml
+	DISPLAY="" plantuml -tsvg $<
+
+clean:
+	-rm -f $(DEP_GRAPHS) $(UML_DIAGS)
diff --git a/docs/images/tasks-metadata-indexers.uml b/docs/images/tasks-metadata-indexers.uml
new file mode 100644
index 0000000..954e079
--- /dev/null
+++ b/docs/images/tasks-metadata-indexers.uml
@@ -0,0 +1,84 @@
+@startuml
+  participant LOADERS as "Loaders"
+  participant JOURNAL as "Journal"
+  participant SCHEDULER as "Scheduler"
+  participant IDX_ORIG_HEAD as "Origin-Head Indexer"
+  participant IDX_REV_META as "Revision Metadata Indexer"
+  participant IDX_CONT_META as "Content Metadata Indexer"
+  participant IDX_ORIG_META as "Origin Metadata Indexer"
+  participant IDX_STORAGE as "Indexer Storage"
+  participant STORAGE as "Graph Storage"
+  participant OBJ_STORAGE as "Object Storage"
+
+  activate OBJ_STORAGE
+  activate IDX_STORAGE
+  activate STORAGE
+  activate JOURNAL
+  activate SCHEDULER
+
+  activate LOADERS
+
+  LOADERS->>JOURNAL: Origin 42 was added/revisited
+  deactivate LOADERS
+
+  JOURNAL->>SCHEDULER: run indexers on origin 42
+
+  SCHEDULER->>IDX_ORIG_HEAD: Find HEAD revision of 42
+  activate IDX_ORIG_HEAD
+
+  IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin=42)
+
+  STORAGE->>IDX_ORIG_HEAD: branches
+
+  IDX_ORIG_HEAD->>SCHEDULER: run Revision Metadata Indexer\non revision 42abcdef\n(head of origin 42)
+  deactivate IDX_ORIG_HEAD
+
+  SCHEDULER->>IDX_REV_META: Index revision 42abcdef\n(head of origin 42)
+  activate IDX_REV_META
+
+  IDX_REV_META->>STORAGE: revision_get(sha1=42abcdef)
+  STORAGE->>IDX_REV_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...}
+
+  IDX_REV_META->>STORAGE: directory_ls(sha1=456789ab)
+  STORAGE->>IDX_REV_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...]
+
+  IDX_REV_META->>IDX_REV_META: package.json is a metadata file
+
+  IDX_REV_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe)
+  IDX_STORAGE->>IDX_REV_META: none / {author: "Jane Doe", ...}
+
+  alt If the storage answered "none"
+    IDX_REV_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file
+    activate IDX_CONT_META
+
+    IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe
+
+    OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...'
+
+    IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author
+
+    IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...})
+    IDX_STORAGE->>IDX_CONT_META: ok
+
+    IDX_CONT_META->>IDX_REV_META: extracted: {author: "Jane Doe", ...}
+    deactivate IDX_CONT_META
+  end
+
+  IDX_REV_META->>IDX_STORAGE: revision_metadata_add(sha1=42abcdef, {author: "Jane Doe", ...})
+  IDX_STORAGE->>IDX_REV_META: ok
+
+  IDX_REV_META->>SCHEDULER: run Origin Metadata Indexer\non origin 42; the head is 42abcdef
+  deactivate IDX_REV_META
+
+  SCHEDULER->>IDX_ORIG_META: Index origin 42; the head is 42abcdef
+  activate IDX_ORIG_META
+
+  IDX_ORIG_META->>IDX_STORAGE: revision_metadata_get(sha1=42abcdef)
+  IDX_STORAGE->>IDX_ORIG_META: {author: "Jane Doe", ...}
+
+  IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id=42, {author: "Jane Doe", ...})
+  IDX_STORAGE->>IDX_ORIG_META: ok
+  deactivate IDX_ORIG_META
+
+
+@enduml
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..b80d6f4
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,25 @@
+.. _swh-indexer:
+
+Software Heritage - Indexer
+===========================
+
+Tools and workers used to mine the content of the archive and extract derived
+information from archive source code artifacts.
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents:
+
+   README.md
+   dev-info.rst
+   metadata-workflow.rst
+
+
+Reference Documentation
+-----------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   /apidoc/swh.indexer
diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst
new file mode 100644
index 0000000..471ce8c
--- /dev/null
+++ b/docs/metadata-workflow.rst
@@ -0,0 +1,208 @@
+Metadata workflow
+=================
+
+Intrinsic metadata
+------------------
+
+Indexing :term:`intrinsic metadata` requires extracting information from the
+lowest levels of the :ref:`Merkle DAG <swh-merkle-dag>` (directories, files,
+and content blobs) and associating it with the highest ones (origins).
+In order to deduplicate the work between origins, we split this work between
+multiple indexers, which coordinate with each other and save their results
+at each step in the indexer storage.
+
+Indexer architecture
+--------------------
+
+.. thumbnail:: images/tasks-metadata-indexers.svg
+
+
+Origin-Head Indexer
+___________________
+
+First, the Origin-Head indexer gets called externally, with an origin as
+argument (or multiple origins, which are handled sequentially).
+For now, its tasks are scheduled manually via recurring Scheduler tasks; but
+in the near future, the :term:`journal` will be used to do that.
+
+It first looks up the last :term:`snapshot` and determines what the main
+branch of the origin is (the "Head branch") and what revision it points to
+(the "Head").
+Intrinsic metadata for that origin will be extracted from that revision.
+
+It schedules a Revision Metadata Indexer task for that revision, with a
+hint that the revision is the Head of that particular origin.
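+
+The branch resolution itself is conceptually simple. Here is a minimal
+sketch of the idea, using plain dictionaries instead of the actual storage
+API and model objects:
+
+.. code-block:: python
+
+   def find_head_revision(branches):
+       """Return the revision id the HEAD branch points to, following
+       at most one level of alias indirection, or None."""
+       head = branches.get(b"HEAD")
+       if head and head["target_type"] == "alias":
+           # HEAD is usually an alias to a named branch
+           head = branches.get(head["target"])
+       if head and head["target_type"] == "revision":
+           return head["target"]
+       return None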
+
+
+Revision and Content Metadata Indexers
+______________________________________
+
+These two indexers do the hard part of the work. The Revision Metadata
+Indexer fetches the root directory associated with a revision, then extracts
+the metadata from that directory.
+
+To do so, it lists files in that directory, and looks for known names, such
+as `codemeta.json`, `package.json`, or `pom.xml`. If there are any, it
+runs the Content Metadata Indexer on them, which in turn fetches their
+contents and runs them through extraction dictionaries/mappings.
+See below for details.
+
+Their results are saved in a database (the indexer storage), associated with
+the content and revision hashes.
+
+If it received a hint that this revision is the head of an origin, the
+Revision Metadata Indexer then schedules the Origin Metadata Indexer
+to run on that origin.
+
+
+Origin Metadata Indexer
+_______________________
+
+The job of this indexer is very simple: it takes an origin identifier and
+a revision hash, and copies the metadata of the latter to a new table, to
+associate it with the former.
+
+The reason for this is to be able to perform searches on metadata, and
+efficiently find out which origins matched the pattern.
+Running that search on the `revision_intrinsic_metadata` table would require
+a reverse lookup from revisions to origins, which is costly.
+
+
+Translation from language-specific metadata to CodeMeta
+-------------------------------------------------------
+
+Intrinsic metadata are extracted from files provided with a project's source
+code, and translated using `CodeMeta`_'s `crosswalk table`_.
+
+All input formats supported so far are straightforward dictionaries (e.g.
+JSON) or can be accessed as such (e.g. XML); and the first part of the
+translation is to map their keys to a term in the CodeMeta vocabulary.
+This is done by parsing the crosswalk table's `CSV file`_ and using it as a
+map between these two vocabularies; this does not require any
+format-specific code in the indexers.
+
+The second part is to normalize values. As language-specific metadata files
+each have their way(s) of formatting these values, we need to turn them into
+the data type required by CodeMeta.
+This normalization makes up most of the code of
+:py:mod:`swh.indexer.metadata_dictionary`.
+
+.. _CodeMeta: https://codemeta.github.io/
+.. _crosswalk table: https://codemeta.github.io/crosswalk/
+.. _CSV file: https://github.com/codemeta/codemeta/blob/master/crosswalk.csv
+
+
+Supported intrinsic metadata
+----------------------------
+
+The following sources of intrinsic metadata are supported:
+
+* CodeMeta's `codemeta.json`_,
+* Maven's `pom.xml`_,
+* NPM's `package.json`_,
+* Python's `PKG-INFO`_,
+* Ruby's `.gemspec`_
+
+.. _codemeta.json: https://codemeta.github.io/terms/
+.. _pom.xml: https://maven.apache.org/pom.html
+.. _package.json: https://docs.npmjs.com/files/package.json
+.. _PKG-INFO: https://www.python.org/dev/peps/pep-0314/
+.. _.gemspec: https://guides.rubygems.org/specification-reference/
+
+
+Supported CodeMeta terms
+------------------------
+
+The following terms may be found in the output of the metadata translation
+(other than the `codemeta` mapping, which is the identity function, and
+therefore supports all terms):
+
+.. program-output:: python3 -m swh.indexer.cli mapping list-terms --exclude-mapping codemeta
+   :nostderr:
+
+
+Adding support for additional ecosystem-specific metadata
+---------------------------------------------------------
+
+This section will guide you through adding code to the metadata indexer to
+detect and translate new metadata formats.
+
+First, you should start by picking one of the `CodeMeta crosswalks`_.
+Then create a new file in `swh-indexer/swh/indexer/metadata_dictionary/` that
+will contain your code, and create a new class that inherits from helper
+classes, with some documentation about your indexer:
+
+.. code-block:: python
+
+   from .base import DictMapping, SingleFileMapping
+   from swh.indexer.codemeta import CROSSWALK_TABLE
+
+   class MyMapping(DictMapping, SingleFileMapping):
+       """Dedicated class for ..."""
+       name = 'my-mapping'
+       filename = b'the-filename'
+       mapping = CROSSWALK_TABLE['Name of the CodeMeta crosswalk']
+
+.. _CodeMeta crosswalks: https://github.com/codemeta/codemeta/tree/master/crosswalks
+
+Then, add a `string_fields` attribute, which is the list of all keys whose
+values are simple text values. For instance, to
+`translate Python PKG-INFO`_, it's:
+
+.. code-block:: python
+
+   string_fields = ['name', 'version', 'description', 'summary',
+                    'author', 'author-email']
+
+
+These values will be automatically added to the above list of
+supported terms.
+
+.. _translate Python PKG-INFO: https://forge.softwareheritage.org/source/swh-indexer/browse/master/swh/indexer/metadata_dictionary/python.py
+
+The last step to get your code working: add a `translate` method that takes
+a single byte string as argument, turns it into a Python dictionary whose
+keys are the ones of the input document, and passes that dictionary to
+`_translate_dict`.
+
+For instance, if the input document is in JSON, it can be as simple as:
+
+.. code-block:: python
+
+   def translate(self, raw_content):
+       raw_content = raw_content.decode()  # bytes to str
+       content_dict = json.loads(raw_content)  # str to dict
+       return self._translate_dict(content_dict)  # convert to CodeMeta
+
+`_translate_dict` will do the heavy work of reading the crosswalk table for
+each of `string_fields`, reading the corresponding value in the
+`content_dict`, and building a CodeMeta dictionary with the corresponding
+names from the crosswalk table.
+
+One last step before you can run your code: add it to the list in
+`swh-indexer/swh/indexer/metadata_dictionary/__init__.py`, so the rest of
+the code is aware of it.
+
+Now, you can run it:
+
+.. code-block:: shell
+
+   python3 -m swh.indexer.metadata_dictionary MyMapping path/to/input/file
+
+and it will (hopefully) return a CodeMeta object.
+
+If it works, well done!
+
+You can now improve your translation code further, by adding methods that
+do more advanced conversion. For example, if there is a field named
+`license` containing an SPDX identifier, you must convert it to a URI,
+like this:
+
+.. code-block:: python
+
+   def normalize_license(self, s):
+       if isinstance(s, str):
+           return {"@id": "https://spdx.org/licenses/" + s}
+
+This method will automatically get called by `_translate_dict` when it
+finds a `license` field in `content_dict`.
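+
+To check such a normalization quickly, you can exercise the mapping directly
+from a Python shell. A hypothetical session with the `MyMapping` class
+sketched above (the class name and input are illustrative) could look like:
+
+.. code-block:: python
+
+   import json
+
+   from swh.indexer.metadata_dictionary import MAPPINGS
+
+   mapping = MAPPINGS["MyMapping"]()  # the hypothetical class from above
+   raw = json.dumps({"name": "foo", "license": "MIT"}).encode()
+   print(mapping.translate(raw))
+   # Expected shape (the exact fields depend on your crosswalk):
+   # {'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+   #  'type': 'SoftwareSourceCode',
+   #  'name': 'foo',
+   #  'license': 'https://spdx.org/licenses/MIT'}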
diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..0df07a7 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,30 @@ +[mypy] +namespace_packages = True +warn_unused_ignores = True + + +# 3rd party libraries without stubs (yet) + +[mypy-celery.*] +ignore_missing_imports = True + +[mypy-confluent_kafka.*] +ignore_missing_imports = True + +[mypy-magic.*] +ignore_missing_imports = True + +[mypy-pkg_resources.*] +ignore_missing_imports = True + +[mypy-psycopg2.*] +ignore_missing_imports = True + +[mypy-pyld.*] +ignore_missing_imports = True + +[mypy-pytest.*] +ignore_missing_imports = True + +[mypy-xmltodict.*] +ignore_missing_imports = True diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..4b8d2d3 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +addopts = -p no:pytest_swh_scheduler +norecursedirs = docs diff --git a/requirements-swh.txt b/requirements-swh.txt index 32c8593..0363717 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,6 +1,6 @@ swh.core[db,http] >= 0.0.87 swh.model >= 0.0.15 swh.objstorage >= 0.0.43 swh.scheduler >= 0.0.47 -swh.storage >= 0.6.0 +swh.storage >= 0.8.0 swh.journal >= 0.1.0 diff --git a/requirements-test.txt b/requirements-test.txt index 68bb694..ac0c1f0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ confluent-kafka pytest -pytest-postgresql hypothesis>=3.11.0 -swh.storage>= 0.0.178 +swh.scheduler[testing] >= 0.5.0 +swh.storage[testing] >= 0.10.0 diff --git a/setup.py b/setup.py index 1f6fd99..b0c777c 100755 --- a/setup.py +++ b/setup.py @@ -1,73 +1,73 @@ #!/usr/bin/env python3 -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.indexer", description="Software Heritage Content Indexer", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/78/", packages=find_packages(), scripts=[], install_requires=parse_requirements() + parse_requirements("swh"), - setup_requires=["vcversioner"], + setup_requires=["setuptools-scm"], + use_scm_version=True, extras_require={"testing": parse_requirements("test")}, - vcversioner={}, include_package_data=True, entry_points=""" [console_scripts] swh-indexer=swh.indexer.cli:main [swh.cli.subcommands] indexer=swh.indexer.cli:cli """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": 
"https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-indexer", "Documentation": "https://docs.softwareheritage.org/devel/swh-indexer/", }, ) diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index a2920a6..06fbd34 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.1.0 +Version: 0.1.1 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 3f25a51..1dc3047 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,108 +1,133 @@ +.gitignore +.pre-commit-config.yaml +AUTHORS +CODE_OF_CONDUCT.md +CONTRIBUTORS +LICENSE MANIFEST.in Makefile +Makefile.local README.md +codemeta.json +conftest.py +mypy.ini pyproject.toml +pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py -version.txt +tox.ini +docs/.gitignore +docs/Makefile +docs/Makefile.local +docs/README.md +docs/conf.py +docs/dev-info.rst +docs/index.rst +docs/metadata-workflow.rst +docs/_static/.placeholder +docs/_templates/.placeholder +docs/images/.gitignore +docs/images/Makefile +docs/images/tasks-metadata-indexers.uml sql/bin/db-upgrade sql/bin/dot_add_content +sql/doc/json sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql sql/upgrades/124.sql sql/upgrades/125.sql sql/upgrades/126.sql sql/upgrades/127.sql sql/upgrades/128.sql sql/upgrades/129.sql sql/upgrades/130.sql sql/upgrades/131.sql sql/upgrades/132.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/py.typed swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/exc.py swh/indexer/storage/in_memory.py swh/indexer/storage/interface.py swh/indexer/storage/metrics.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py 
swh/indexer/tests/test_cli.py swh/indexer/tests/test_codemeta.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_journal_client.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/conftest.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_metrics.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index 0d7adeb..69ab181 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,18 +1,18 @@ vcversioner click python-magic>=0.4.13 pyld xmltodict swh.core[db,http]>=0.0.87 swh.model>=0.0.15 swh.objstorage>=0.0.43 swh.scheduler>=0.0.47 -swh.storage>=0.6.0 +swh.storage>=0.8.0 swh.journal>=0.1.0 [testing] confluent-kafka pytest -pytest-postgresql hypothesis>=3.11.0 -swh.storage>=0.0.178 +swh.scheduler[testing]>=0.5.0 +swh.storage[testing]>=0.10.0 diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 0fdb0db..0f28355 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,383 +1,381 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from typing import Any, Callable, Dict, Iterator, List, Tuple from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil REVISION_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 def call_with_batches( f: Callable[[List[Dict[str, Any]]], Dict["str", Any]], args: List[Dict[str, str]], batch_size: int, ) -> Iterator[str]: """Calls a function with batches of args, and concatenates the results. """ groups = grouper(args, batch_size) for group in groups: yield from f(list(group)) class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing( ({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) ) def index(self, id, data, log_suffix="unknown revision"): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. 
If the translation wasn't successful, the metadata keys will be returned as None """ result = { "id": id, "indexer_configuration_id": self.tool["id"], "metadata": None, } try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) result["metadata"] = MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) if result["metadata"] is None: return None return result def persist_index_computations( self, results: List[Dict], policy_update: str ) -> Dict[str, int]: """Persist the results in storage. Args: results: list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - metadata (jsonb): detected metadata policy_update: either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ return self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == "update-dups") ) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_intrinsic_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ ADDITIONAL_CONFIG = { "tools": ( "dict", {"name": "swh-metadata-detector", "version": "0.0.2", "configuration": {},}, ), } def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_intrinsic_metadata_missing( ( {"id": sha1_git, "indexer_configuration_id": self.tool["id"],} for sha1_git in sha1_gits ) ) def index(self, rev): """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple files detected -> translation needed at revision level Args: rev (dict): revision artifact from storage Returns: dict: dictionary representing a revision_intrinsic_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ result = { "id": rev["id"], "indexer_configuration_id": self.tool["id"], "mappings": None, "metadata": None, } try: root_dir = rev["directory"] dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) if [entry["type"] for entry in dir_ls] == ["dir"]: # If the root is just a single directory, recurse into it # eg. PyPI packages, GNU tarballs subdir = dir_ls[0]["target"] dir_ls = self.storage.directory_ls(subdir, recursive=False) files = [entry for entry in dir_ls if entry["type"] == "file"] detected_files = detect_metadata(files) (mappings, metadata) = self.translate_revision_intrinsic_metadata( detected_files, log_suffix="revision=%s" % hashutil.hash_to_hex(rev["id"]), ) result["mappings"] = mappings result["metadata"] = metadata except Exception as e: self.log.exception("Problem when indexing rev: %r", e) return result def persist_index_computations( self, results: List[Dict], policy_update: str ) -> Dict[str, int]: """Persist the results in storage.
Args: results: list of revision_intrinsic_metadata, dict with the following keys: - id (bytes): revision's identifier (sha1_git) - mappings ([str]): list of mappings used - metadata (jsonb): detected metadata policy_update: either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in # revision_intrinsic_metadata return self.idx_storage.revision_intrinsic_metadata_add( results, conflict_update=(policy_update == "update-dups") ) def translate_revision_intrinsic_metadata( self, detected_files: Dict[str, List[Any]], log_suffix: str ) -> Tuple[List[Any], List[Any]]: """ Determine the plan of action for translating metadata, given one or multiple detected files: Args: detected_files: dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ used_mappings = [MAPPINGS[context].name for context in detected_files] metadata = [] tool = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {}, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]} config["tools"] = [tool] for context in detected_files.keys(): cfg = deepcopy(config) cfg["tools"][0]["configuration"]["context"] = context c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context] ) for c in metadata_generator: # extracting metadata sha1 = c["id"] sha1s_in_storage.append(sha1) local_metadata = c["metadata"] # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [ item for item in detected_files[context] if item not in sha1s_in_storage ] if sha1s_filtered: # content indexing try: c_metadata_indexer.run( sha1s_filtered, policy_update="ignore-dups", log_suffix=log_suffix, ) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result["metadata"] metadata.append(local_metadata) except Exception: self.log.exception("Exception while indexing metadata on contents") metadata = merge_documents(metadata) return (used_mappings, metadata) class OriginMetadataIndexer(OriginIndexer): ADDITIONAL_CONFIG = RevisionMetadataIndexer.ADDITIONAL_CONFIG USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) def index_list(self, origin_urls, **kwargs): head_rev_ids = [] origins_with_head = [] origins = list( call_with_batches( - self.storage.origin_get, - [{"url": url} for url in origin_urls], - ORIGIN_GET_BATCH_SIZE, + self.storage.origin_get, origin_urls, ORIGIN_GET_BATCH_SIZE, ) ) for origin in origins: if origin is None: continue - head_result = self.origin_head_indexer.index(origin["url"]) + head_result = self.origin_head_indexer.index(origin.url) if head_result: origins_with_head.append(origin) head_rev_ids.append(head_result["revision_id"]) head_revs = list( call_with_batches( self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE ) ) assert len(head_revs) == len(head_rev_ids) results = [] for (origin, rev) in zip(origins_with_head, head_revs): if not rev: - self.log.warning("Missing head revision of
origin %r", origin["url"]) + self.log.warning("Missing head revision of origin %r", origin.url) continue rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { "from_revision": rev_metadata["id"], - "id": origin["url"], + "id": origin.url, "metadata": rev_metadata["metadata"], "mappings": rev_metadata["mappings"], "indexer_configuration_id": rev_metadata["indexer_configuration_id"], } results.append((orig_metadata, rev_metadata)) return results def persist_index_computations( self, results: List[Dict], policy_update: str ) -> Dict[str, int]: conflict_update = policy_update == "update-dups" # Deduplicate revisions rev_metadata: List[Any] = [] orig_metadata: List[Any] = [] revs_to_delete: List[Any] = [] origs_to_delete: List[Any] = [] summary: Dict = {} for (orig_item, rev_item) in results: assert rev_item["metadata"] == orig_item["metadata"] if not rev_item["metadata"] or rev_item["metadata"].keys() <= {"@context"}: # If we didn't find any metadata, don't store a DB record # (and delete existing ones, if any) if rev_item not in revs_to_delete: revs_to_delete.append(rev_item) if orig_item not in origs_to_delete: origs_to_delete.append(orig_item) else: if rev_item not in rev_metadata: rev_metadata.append(rev_item) if orig_item not in orig_metadata: orig_metadata.append(orig_item) if rev_metadata: summary_rev = self.idx_storage.revision_intrinsic_metadata_add( rev_metadata, conflict_update=conflict_update ) summary.update(summary_rev) if orig_metadata: summary_ori = self.idx_storage.origin_intrinsic_metadata_add( orig_metadata, conflict_update=conflict_update ) summary.update(summary_ori) # revs_to_delete should always be empty unless we changed a mapping # to detect less files or less content. # However, origs_to_delete may be empty whenever an upstream deletes # a metadata file. 
if origs_to_delete: summary_ori = self.idx_storage.origin_intrinsic_metadata_delete( origs_to_delete ) summary.update(summary_ori) if revs_to_delete: summary_rev = self.idx_storage.revision_intrinsic_metadata_delete( revs_to_delete ) summary.update(summary_rev) return summary diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py index fb25abd..1ba1528 100644 --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -1,86 +1,74 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta from unittest.mock import patch import pytest from swh.objstorage import get_objstorage -from swh.scheduler.tests.conftest import * # noqa from swh.storage import get_storage from swh.indexer.storage import get_indexer_storage from .utils import fill_storage, fill_obj_storage TASK_NAMES = ["revision_intrinsic_metadata", "origin_intrinsic_metadata"] -storage_config = {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]} - - @pytest.fixture def indexer_scheduler(swh_scheduler): for taskname in TASK_NAMES: swh_scheduler.create_task_type( { "type": taskname, "description": "The {} indexer testing task".format(taskname), "backend_name": "swh.indexer.tests.tasks.{}".format(taskname), "default_interval": timedelta(days=1), "min_interval": timedelta(hours=6), "max_interval": timedelta(days=12), "num_retries": 3, } ) return swh_scheduler @pytest.fixture def idx_storage(): """An instance of in-memory indexer storage that gets injected into all indexers classes. """ idx_storage = get_indexer_storage("memory", {}) with patch("swh.indexer.storage.in_memory.IndexerStorage") as idx_storage_mock: idx_storage_mock.return_value = idx_storage yield idx_storage @pytest.fixture def storage(): """An instance of in-memory storage that gets injected into all indexers classes. """ - storage = get_storage(**storage_config) + storage = get_storage(cls="memory") fill_storage(storage) with patch("swh.storage.in_memory.InMemoryStorage") as storage_mock: storage_mock.return_value = storage yield storage @pytest.fixture def obj_storage(): """An instance of in-memory objstorage that gets injected into all indexers classes. """ objstorage = get_objstorage("memory", {}) fill_obj_storage(objstorage) with patch.dict( "swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage} ): yield objstorage - - -@pytest.fixture(scope="session") # type: ignore # expected redefinition -def celery_includes(): - return [ - "swh.indexer.tests.tasks", - "swh.indexer.tasks", - ] diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py index e2df26c..a67b2dc 100644 --- a/swh/indexer/tests/storage/conftest.py +++ b/swh/indexer/tests/storage/conftest.py @@ -1,73 +1,73 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os.path import join import pytest from . 
import SQL_DIR -from swh.storage.tests.conftest import postgresql_fact +from swh.storage.pytest_plugin import postgresql_fact from swh.indexer.storage import get_indexer_storage from swh.model.hashutil import hash_to_bytes from .generate_data_test import MIMETYPE_OBJECTS, FOSSOLOGY_LICENSES, TOOLS DUMP_FILES = join(SQL_DIR, "*.sql") class DataObj(dict): def __getattr__(self, key): return self.__getitem__(key) def __setattr__(self, key, value): return self.__setitem__(key, value) @pytest.fixture def swh_indexer_storage_with_data(swh_indexer_storage): data = DataObj() tools = { tool["tool_name"]: { "id": tool["id"], "name": tool["tool_name"], "version": tool["tool_version"], "configuration": tool["tool_configuration"], } for tool in swh_indexer_storage.indexer_configuration_add(TOOLS) } data.tools = tools data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689") data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7") data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238") data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321") data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320") data.origin_url_1 = "file:///dev/0/zero" # 44434341 data.origin_url_2 = "file:///dev/1/one" # 44434342 data.origin_url_3 = "file:///dev/2/two" # 54974445 data.mimetypes = [ {**mimetype_obj, "indexer_configuration_id": tools["file"]["id"]} for mimetype_obj in MIMETYPE_OBJECTS ] swh_indexer_storage.content_mimetype_add(data.mimetypes) data.fossology_licenses = [ {**fossology_obj, "indexer_configuration_id": tools["nomos"]["id"]} for fossology_obj in FOSSOLOGY_LICENSES ] swh_indexer_storage._test_data = data return (swh_indexer_storage, data) swh_indexer_storage_postgresql = postgresql_fact( "postgresql_proc", dump_files=DUMP_FILES ) @pytest.fixture def swh_indexer_storage(swh_indexer_storage_postgresql): storage_config = { "cls": "local", "args": {"db": swh_indexer_storage_postgresql.dsn,}, } return get_indexer_storage(**storage_config) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index c3ef250..7abb4ed 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,1210 +1,1205 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest -import attr - from hypothesis import given, strategies, settings, HealthCheck from swh.model.hashutil import hash_to_bytes +from swh.model.model import Directory, DirectoryEntry, Revision from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_dictionary.maven import MavenMapping from swh.indexer.metadata_dictionary.npm import NpmMapping from swh.indexer.metadata_dictionary.ruby import GemspecMapping from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer +from swh.indexer.tests.utils import REVISION, DIRECTORY2 + from .utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage, YARN_PARSER_METADATA, json_document_strategy, xml_document_strategy, ) TRANSLATOR_TOOL = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {"type": "local", "context": "NpmMapping"}, 
} class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, "should not be called; the rev indexer configures it." REVISION_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.npm_mapping = MAPPINGS["NpmMapping"]() self.codemeta_mapping = MAPPINGS["CodemetaMapping"]() self.maven_mapping = MAPPINGS["MavenMapping"]() self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]() self.gemspec_mapping = MAPPINGS["GemspecMapping"]() def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "codeRepository": "git+https://github.com/moranegg/metadata_test", "author": [ {"type": "Person", "name": "Morane G", "email": "moranegg@example.com",} ], } # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config["tools"] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update="ignore-dups") results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) expected_results = [ { "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "codeRepository": "git+https://github.com/moranegg/metadata_test", "description": "Simple package.json test for indexer", "name": "test_metadata", "version": "0.0.1", }, "id": hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), }, { "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "issueTracker": "https://github.com/npm/npm/issues", "author": [ { "type": "Person", "name": "Isaac Z. 
Schlueter", "email": "i@izs.me", "url": "http://blog.izs.me", } ], "codeRepository": "git+https://github.com/npm/npm", "description": "a package manager for JavaScript", "license": "https://spdx.org/licenses/Artistic-2.0", "version": "5.0.3", "name": "npm", "keywords": [ "install", "modules", "package manager", "package.json", ], "url": "https://docs.npmjs.com/", }, "id": hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), }, ] for result in results: del result["tool"] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", }, ) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://gitlab.com/user/repo.git", "type": "SoftwareSourceCode", }, ) def test_detect_metadata_package_json(self): # given df = [ { "sha1_git": b"abc", "name": b"index.js", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", "name": b"package.json", "target": b"aab", "length": 712, "status": 
"visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"cde", }, ] # when results = detect_metadata(df) expected_results = {"NpmMapping": [b"cde"]} # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738", }, ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation", }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": ["metadata", "software"], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def 
test_compute_metadata_codemeta_alternate_context(self): raw_content = b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", "codeRepository": ( "http://repo1.maven.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error parsing XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error detecting XML encoding from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_invalid_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error unidecoding XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ 
result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty_nodes(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "version": "1.2.3", }, ) def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 foo """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_multiple(self): """Tests when there are multiple code repos and licenses.""" raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": [ 
"https://www.apache.org/licenses/LICENSE-2.0.txt", "https://opensource.org/licenses/MIT", ], "codeRepository": [ "http://repo1.maven.org/maven2/com/mycompany/app/my-app", "http://example.org/maven2/com/mycompany/app/my-app", ], }, ) def test_compute_metadata_pkginfo(self): raw_content = b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertCountEqual( result["description"], [ "Software Heritage core utilities", # note the comma here "swh-core\n" "========\n" "\n" "core library for swh's modules:\n" "- config parser\n" "- hash computations\n" "- serialization\n" "- logging mechanism\n" "", ], result, ) del result["description"] self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "url": "https://forge.softwareheritage.org/diffusion/DCORE/", "name": "swh.core", "author": [ { "type": "Person", "name": "Software Heritage developers", "email": "swh-devel@inria.fr", } ], "version": "0.0.49", }, ) def test_compute_metadata_pkginfo_utf8(self): raw_content = b"""\ Metadata-Version: 1.1 Name: snowpyt Description-Content-Type: UNKNOWN Description: foo Hydrology N\xc2\xb083 """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "snowpyt", "description": "foo\nHydrology N°83", }, ) def test_compute_metadata_pkginfo_keywords(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo Keywords: foo bar baz """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "foo", "keywords": ["foo", "bar", "baz"], }, ) def test_compute_metadata_pkginfo_license(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo License: MIT """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "foo", "license": "MIT", }, ) def test_gemspec_base(self): raw_content = b""" Gem::Specification.new do |s| s.name = 'example' s.version = '0.1.0' s.licenses = ['MIT'] s.summary = "This is an example!" s.description = "Much longer explanation of the example!" 
s.authors = ["Ruby Coder"] s.email = 'rubycoder@example.com' s.files = ["lib/example.rb"] s.homepage = 'https://rubygems.org/gems/example' s.metadata = { "source_code_uri" => "https://github.com/example/example" } end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual( result.pop("description"), ["This is an example!", "Much longer explanation of the example!"], ) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"type": "Person", "name": "Ruby Coder"}], "name": "example", "license": "https://spdx.org/licenses/MIT", "codeRepository": "https://rubygems.org/gems/example", "email": "rubycoder@example.com", "version": "0.1.0", }, ) def test_gemspec_two_author_fields(self): raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1"] s.author = "Ruby Coder2" end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual( result.pop("author"), [ {"type": "Person", "name": "Ruby Coder1"}, {"type": "Person", "name": "Ruby Coder2"}, ], ) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_gemspec_invalid_author(self): raw_content = b""" Gem::Specification.new do |s| s.author = ["Ruby Coder"] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) raw_content = b""" Gem::Specification.new do |s| s.author = "Ruby Coder1", end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1", ["Ruby Coder2"]] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"type": "Person", "name": "Ruby Coder1"}], }, ) def test_gemspec_alternative_header(self): raw_content = b""" require './lib/version' Gem::Specification.new { |s| s.name = 'rb-system-with-aliases' s.summary = 'execute system commands with aliases' } """ result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "rb-system-with-aliases", "description": "execute system commands with aliases", }, ) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=list(NpmMapping.mapping))) def test_npm_adversarial(self, doc): raw = json.dumps(doc).encode() self.npm_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=CODEMETA_TERMS)) def test_codemeta_adversarial(self, doc): raw = json.dumps(doc).encode() self.codemeta_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given( xml_document_strategy( keys=list(MavenMapping.mapping), root="project", xmlns="http://maven.apache.org/POM/4.0.0", ) ) def test_maven_adversarial(self, doc): self.maven_mapping.translate(doc) @settings(suppress_health_check=[HealthCheck.too_slow]) @given( strategies.dictionaries( # keys strategies.one_of( strategies.text(), *map(strategies.just, GemspecMapping.mapping) ), # values strategies.recursive( strategies.characters(), lambda children: strategies.lists(children, min_size=1), ), ) ) 
def test_gemspec_adversarial(self, doc): parts = [b"Gem::Specification.new do |s|\n"] for (k, v) in doc.items(): parts.append(" s.{} = {}\n".format(k, repr(v)).encode()) parts.append(b"end\n") self.gemspec_mapping.translate(b"".join(parts)) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( - {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()} + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None + rev = REVISION + assert rev.directory == DIRECTORY2.id metadata_indexer.idx_storage.content_metadata_add( [ { "indexer_configuration_id": tool["id"], - "id": b"cde", + "id": DIRECTORY2.entries[0].target, "metadata": YARN_PARSER_METADATA, } ] ) - sha1_gits = [ - hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - ] - metadata_indexer.run(sha1_gits, "update-dups") + metadata_indexer.run([rev.id], "update-dups") results = list( - metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits) + metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id]) ) expected_results = [ { - "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), + "id": rev.id, "tool": TRANSLATOR_TOOL, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } ] for result in results: del result["tool"]["id"] # then - self.assertEqual(expected_results, results) + self.assertEqual(results, expected_results) def test_revision_metadata_indexer_single_root_dir(self): metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the revision - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") - rev = metadata_indexer.storage._revisions[rev_id] - subdir_id = rev.directory - rev = attr.evolve(rev, directory=b"123456") - metadata_indexer.storage.directory_add( - [ - { - "id": b"123456", - "entries": [ - { - "name": b"foobar-1.0.0", - "type": "dir", - "target": subdir_id, - "perms": 16384, - } - ], - } - ] + rev = REVISION + assert rev.directory == DIRECTORY2.id + + directory = Directory( + entries=( + DirectoryEntry( + name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384, + ), + ), ) + assert directory.id is not None + metadata_indexer.storage.directory_add([directory]) + + new_rev_dict = {**rev.to_dict(), "directory": directory.id} + new_rev_dict.pop("id") + new_rev = Revision.from_dict(new_rev_dict) + metadata_indexer.storage.revision_add([new_rev]) tool = metadata_indexer.idx_storage.indexer_configuration_get( - {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()} + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None metadata_indexer.idx_storage.content_metadata_add( [ { "indexer_configuration_id": tool["id"], - "id": b"cde", + "id": DIRECTORY2.entries[0].target, "metadata": YARN_PARSER_METADATA, } ] ) - sha1_gits = [ - hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - ] - metadata_indexer.run(sha1_gits, "update-dups") + metadata_indexer.run([new_rev.id], "update-dups") results = list( - metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits) + metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id]) ) expected_results = [ { - "id": 
hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), + "id": new_rev.id, "tool": TRANSLATOR_TOOL, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } ] for result in results: del result["tool"]["id"] # then - self.assertEqual(expected_results, results) + self.assertEqual(results, expected_results) diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index a5ed93c..c137dd0 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -1,199 +1,170 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from datetime import datetime, timezone from swh.model.model import OriginVisit, OriginVisitStatus from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.tests.utils import BASE_TEST_CONFIG, fill_storage from swh.storage.utils import now +from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType + ORIGIN_HEAD_CONFIG = { **BASE_TEST_CONFIG, "tools": {"name": "origin-metadata", "version": "0.0.1", "configuration": {},}, "tasks": {"revision_intrinsic_metadata": None, "origin_intrinsic_metadata": None,}, } class OriginHeadTestIndexer(OriginHeadIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): return ORIGIN_HEAD_CONFIG def persist_index_computations(self, results, policy_update): self.results = results class OriginHead(unittest.TestCase): def setUp(self): self.indexer = OriginHeadTestIndexer() self.indexer.catch_exceptions = False fill_storage(self.indexer.storage) def test_git(self): - self.indexer.run(["https://github.com/SoftwareHeritage/swh-storage"]) + origin_url = "https://github.com/SoftwareHeritage/swh-storage" + self.indexer.run([origin_url]) + rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{" - b"\xd7}\xac\xefrm", - "origin_url": "https://github.com/SoftwareHeritage/swh-storage", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_git_partial_snapshot(self): """Checks partial snapshots are ignored.""" origin_url = "https://github.com/SoftwareHeritage/swh-core" - self.indexer.storage.origin_add_one( - {"url": origin_url,} - ) + self.indexer.storage.origin_add([Origin(url=origin_url)]) visit = self.indexer.storage.origin_visit_add( [ OriginVisit( origin=origin_url, date=datetime(2019, 2, 27, tzinfo=timezone.utc), type="git", - status="ongoing", - snapshot=None, ) ] )[0] self.indexer.storage.snapshot_add( [ - { - "id": b"foo", - "branches": { + Snapshot( + branches={ b"foo": None, - b"HEAD": {"target_type": "alias", "target": b"foo",}, + b"HEAD": SnapshotBranch( + target_type=TargetType.ALIAS, target=b"foo", + ), }, - } + ), ] ) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=now(), status="partial", snapshot=b"foo", ) self.indexer.storage.origin_visit_status_add([visit_status]) self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_vcs_missing_snapshot(self): - self.indexer.storage.origin_add( - [{"url": "https://github.com/SoftwareHeritage/swh-indexer",}] - ) - self.indexer.run(["https://github.com/SoftwareHeritage/swh-indexer"]) + origin_url = 
"https://github.com/SoftwareHeritage/swh-indexer" + self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_pypi_missing_branch(self): origin_url = "https://pypi.org/project/abcdef/" - self.indexer.storage.origin_add_one( - {"url": origin_url,} - ) + self.indexer.storage.origin_add([Origin(url=origin_url,)]) visit = self.indexer.storage.origin_visit_add( [ OriginVisit( origin=origin_url, date=datetime(2019, 2, 27, tzinfo=timezone.utc), type="pypi", - status="ongoing", - snapshot=None, ) ] )[0] self.indexer.storage.snapshot_add( [ - { - "id": b"foo", - "branches": { + Snapshot( + branches={ b"foo": None, - b"HEAD": {"target_type": "alias", "target": b"foo",}, + b"HEAD": SnapshotBranch( + target_type=TargetType.ALIAS, target=b"foo", + ), }, - } + ) ] ) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=now(), status="full", snapshot=b"foo", ) self.indexer.storage.origin_visit_status_add([visit_status]) self.indexer.run(["https://pypi.org/project/abcdef/"]) self.assertEqual(self.indexer.results, []) def test_ftp(self): - self.indexer.run(["rsync://ftp.gnu.org/gnu/3dldf"]) + origin_url = "rsync://ftp.gnu.org/gnu/3dldf" + self.indexer.run([origin_url]) + rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee" - b"\xcc\x1a\xb4`\x8c\x8by", - "origin_url": "rsync://ftp.gnu.org/gnu/3dldf", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_ftp_missing_snapshot(self): - self.indexer.storage.origin_add([{"url": "rsync://ftp.gnu.org/gnu/foobar",}]) - self.indexer.run(["rsync://ftp.gnu.org/gnu/foobar"]) + origin_url = "rsync://ftp.gnu.org/gnu/foobar" + self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_deposit(self): - self.indexer.run(["https://forge.softwareheritage.org/source/jesuisgpl/"]) + origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/" + self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) + rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{" - b"\xa6\xe9\x99\xb1\x9e]q\xeb", - "origin_url": "https://forge.softwareheritage.org/source/" - "jesuisgpl/", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_deposit_missing_snapshot(self): - self.indexer.storage.origin_add( - [{"url": "https://forge.softwareheritage.org/source/foobar",}] - ) - self.indexer.run(["https://forge.softwareheritage.org/source/foobar"]) + origin_url = "https://forge.softwareheritage.org/source/foobar" + self.indexer.storage.origin_add([Origin(url=origin_url,)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_pypi(self): - self.indexer.run(["https://pypi.org/project/limnoria/"]) + origin_url = "https://pypi.org/project/limnoria/" + self.indexer.run([origin_url]) + + rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k" - b"A\x10\x9d\xc5\xfa2\xf8t", - "origin_url": "https://pypi.org/project/limnoria/", - } - ], + self.indexer.results, [{"revision_id": 
rev_id, "origin_url": origin_url}], ) def test_svn(self): - self.indexer.run(["http://0-512-md.googlecode.com/svn/"]) + origin_url = "http://0-512-md.googlecode.com/svn/" + self.indexer.run([origin_url]) + rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8" - b"\xc9\xad#.\x1bw=\x18", - "origin_url": "http://0-512-md.googlecode.com/svn/", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 79e8de3..2533981 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,224 +1,212 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch -from swh.model.hashutil import hash_to_bytes - from swh.indexer.metadata import OriginMetadataIndexer -from .utils import YARN_PARSER_METADATA +from swh.model.model import Origin + +from .utils import YARN_PARSER_METADATA, REVISION from .test_metadata import REVISION_METADATA_CONFIG def test_origin_metadata_indexer(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["https://github.com/librariesio/yarn-parser"]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin]) + rev_id = REVISION.id rev_metadata = { "id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } origin_metadata = { "id": origin, "from_revision": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result["tool"] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) for result in results: del result["tool"] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_origin(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["https://github.com/librariesio/yarn-parser"]) - indexer.run(["https://github.com/librariesio/yarn-parser"] * 2) origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert len(results) == 1 def test_origin_metadata_indexer_missing_head(idx_storage, storage, obj_storage): - - storage.origin_add([{"url": "https://example.com"}]) + storage.origin_add([Origin(url="https://example.com")]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["https://example.com"]) origin = "https://example.com" results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( idx_storage, storage, obj_storage ): - storage.origin_add([{"url": "https://example.com"}]) - - indexer = 
OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["https://example.com", "https://github.com/librariesio/yarn-parser"]) - origin1 = "https://example.com" origin2 = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + storage.origin_add([Origin(url=origin1)]) + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + indexer.run([origin1, origin2]) - rev_metadata = { - "id": rev_id, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], - } - origin_metadata = { - "id": origin2, - "from_revision": rev_id, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], - } + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result["tool"] - assert results == [rev_metadata] + assert results == [ + {"id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"],} + ] results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) for result in results: del result["tool"] - assert results == [origin_metadata] + assert results == [ + { + "id": origin2, + "from_revision": rev_id, + "metadata": YARN_PARSER_METADATA, + "mappings": ["npm"], + } + ] def test_origin_metadata_indexer_duplicate_revision(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage - indexer.run( - [ - "https://github.com/librariesio/yarn-parser", - "https://github.com/librariesio/yarn-parser.git", - ] - ) - origin1 = "https://github.com/librariesio/yarn-parser" origin2 = "https://github.com/librariesio/yarn-parser.git" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin1, origin2]) + + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) assert len(results) == 2 def test_origin_metadata_indexer_no_metadata_file(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_no_metadata(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.RevisionMetadataIndexer" ".translate_revision_intrinsic_metadata", return_value=(["npm"], {"@context": "foo"}), ): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] 
def test_origin_metadata_indexer_error(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.RevisionMetadataIndexer" ".translate_revision_intrinsic_metadata", return_value=None, ): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_delete_metadata(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["https://github.com/librariesio/yarn-parser"]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin]) + + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results != [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results != [] with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_unknown_origin(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) result = indexer.index_list(["https://unknown.org/foo"]) assert not result diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index 3a39558..b3f0612 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,740 +1,774 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import functools -import random from typing import Dict, Any import unittest from hypothesis import strategies from swh.model import hashutil from swh.model.hashutil import hash_to_bytes, hash_to_hex -from swh.model.model import OriginVisit, OriginVisitStatus +from swh.model.model import ( + Content, + Directory, + DirectoryEntry, + Origin, + OriginVisit, + OriginVisitStatus, + Person, + Revision, + RevisionType, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) from swh.storage.utils import now from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = { - "storage": {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]}, + "storage": {"cls": "memory"}, "objstorage": {"cls": "memory", "args": {},}, INDEXER_CFG_KEY: {"cls": "memory", "args": {},}, } + +ORIGINS = [ + Origin(url="https://github.com/SoftwareHeritage/swh-storage"), + Origin(url="rsync://ftp.gnu.org/gnu/3dldf"), + Origin(url="https://forge.softwareheritage.org/source/jesuisgpl/"), + Origin(url="https://pypi.org/project/limnoria/"), + Origin(url="http://0-512-md.googlecode.com/svn/"), + 
Origin(url="https://github.com/librariesio/yarn-parser"), + Origin(url="https://github.com/librariesio/yarn-parser.git"), +] + + ORIGIN_VISITS = [ - {"type": "git", "url": "https://github.com/SoftwareHeritage/swh-storage"}, - {"type": "ftp", "url": "rsync://ftp.gnu.org/gnu/3dldf"}, - {"type": "deposit", "url": "https://forge.softwareheritage.org/source/jesuisgpl/"}, - {"type": "pypi", "url": "https://pypi.org/project/limnoria/"}, - {"type": "svn", "url": "http://0-512-md.googlecode.com/svn/"}, - {"type": "git", "url": "https://github.com/librariesio/yarn-parser"}, - {"type": "git", "url": "https://github.com/librariesio/yarn-parser.git"}, + {"type": "git", "origin": ORIGINS[0].url}, + {"type": "ftp", "origin": ORIGINS[1].url}, + {"type": "deposit", "origin": ORIGINS[2].url}, + {"type": "pypi", "origin": ORIGINS[3].url}, + {"type": "svn", "origin": ORIGINS[4].url}, + {"type": "git", "origin": ORIGINS[5].url}, + {"type": "git", "origin": ORIGINS[6].url}, ] + +DIRECTORY = Directory( + id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), + entries=( + DirectoryEntry( + name=b"index.js", + type="file", + target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"), + perms=0o100644, + ), + DirectoryEntry( + name=b"package.json", + type="file", + target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), + perms=0o100644, + ), + DirectoryEntry( + name=b".github", + type="dir", + target=Directory(entries=()).id, + perms=0o040000, + ), + ), +) + +DIRECTORY2 = Directory( + id=b"\xf8zz\xa1\x12`<1$\xfav\xf9\x01\xfd5\x85F`\xf2\xb6", + entries=( + DirectoryEntry( + name=b"package.json", + type="file", + target=hash_to_bytes("f5305243b3ce7ef8dc864ebc73794da304025beb"), + perms=0o100644, + ), + ), +) + +REVISION = Revision( + id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"), + message=b"Improve search functionality", + author=Person( + name=b"Andrew Nesbitt", + fullname=b"Andrew Nesbitt ", + email=b"andrewnez@gmail.com", + ), + committer=Person( + name=b"Andrew Nesbitt", + fullname=b"Andrew Nesbitt ", + email=b"andrewnez@gmail.com", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1380883849, microseconds=0,), + offset=120, + negative_utc=False, + ), + type=RevisionType.GIT, + synthetic=False, + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1487596456, microseconds=0,), + offset=0, + negative_utc=False, + ), + directory=DIRECTORY2.id, + parents=(), +) + +REVISIONS = [REVISION] + SNAPSHOTS = [ - { - "origin": "https://github.com/SoftwareHeritage/swh-storage", - "branches": { - b"refs/heads/add-revision-origin-cache": { - "target": b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' - b"s\xe7/\xe9l\x1e", - "target_type": "revision", - }, - b"refs/head/master": { - "target": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}" b"\xac\xefrm", - "target_type": "revision", - }, - b"HEAD": {"target": b"refs/head/master", "target_type": "alias"}, - b"refs/tags/v0.0.103": { - "target": b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b"\x0f\xdd", - "target_type": "release", - }, - }, - }, - { - "origin": "rsync://ftp.gnu.org/gnu/3dldf", - "branches": { - b"3DLDF-1.1.4.tar.gz": { - "target": b"dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc" b'"G\x99\x11', - "target_type": "revision", - }, - b"3DLDF-2.0.2.tar.gz": { - "target": b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=" - b"\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", - "target_type": "revision", - }, - b"3DLDF-2.0.3-examples.tar.gz": { - "target": b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97" - b"\xfe\xadZ\x80\x80\xc1\x83\xff", - 
"target_type": "revision", - }, - b"3DLDF-2.0.3.tar.gz": { - "target": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee" - b"\xcc\x1a\xb4`\x8c\x8by", - "target_type": "revision", - }, - b"3DLDF-2.0.tar.gz": { - "target": b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G" b"\xd3\xd1m", - "target_type": "revision", - }, - }, - }, - { - "origin": "https://forge.softwareheritage.org/source/jesuisgpl/", - "branches": { - b"master": { - "target": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{" - b"\xa6\xe9\x99\xb1\x9e]q\xeb", - "target_type": "revision", - } - }, - "id": b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r ", - }, - { - "origin": "https://pypi.org/project/limnoria/", - "branches": { - b"HEAD": {"target": b"releases/2018.09.09", "target_type": "alias"}, - b"releases/2018.09.01": { - "target": b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d" - b"\xbb\xdfF\xfdw\xcf", - "target_type": "revision", - }, - b"releases/2018.09.09": { - "target": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k" - b"A\x10\x9d\xc5\xfa2\xf8t", - "target_type": "revision", - }, - }, - "id": b"{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay" b"\x12\x9e\xd6\xb3", - }, - { - "origin": "http://0-512-md.googlecode.com/svn/", - "branches": { - b"master": { - "target": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8" - b"\xc9\xad#.\x1bw=\x18", - "target_type": "revision", - } + Snapshot( + id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"), + branches={ + b"refs/heads/add-revision-origin-cache": SnapshotBranch( + target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e', + target_type=TargetType.REVISION, + ), + b"refs/head/master": SnapshotBranch( + target=b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm", + target_type=TargetType.REVISION, + ), + b"HEAD": SnapshotBranch( + target=b"refs/head/master", target_type=TargetType.ALIAS + ), + b"refs/tags/v0.0.103": SnapshotBranch( + target=b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+\x0f\xdd', + target_type=TargetType.RELEASE, + ), }, - "id": b"\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7" - b"\x05\xea\xb8\x1f\xc4H\xf4s", - }, - { - "origin": "https://github.com/librariesio/yarn-parser", - "branches": { - b"HEAD": { - "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "target_type": "revision", - } + ), + Snapshot( + id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"), + branches={ + b"3DLDF-1.1.4.tar.gz": SnapshotBranch( + target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11', + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.2.tar.gz": SnapshotBranch( + target=b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.3-examples.tar.gz": SnapshotBranch( + target=b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97\xfe\xadZ\x80\x80\xc1\x83\xff", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.3.tar.gz": SnapshotBranch( + target=b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.tar.gz": SnapshotBranch( + target=b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G\xd3\xd1m", + target_type=TargetType.REVISION, + ), }, - }, - { - "origin": "https://github.com/librariesio/yarn-parser.git", - "branches": { - b"HEAD": { - "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "target_type": "revision", - } + ), + Snapshot( + id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"), + branches={ + b"master": SnapshotBranch( + 
target=b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb", # noqa + target_type=TargetType.REVISION, + ) }, - }, -] - - -REVISIONS = [ - { - "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "message": b"Improve search functionality", - "author": { - "name": b"Andrew Nesbitt", - "fullname": b"Andrew Nesbitt ", - "email": b"andrewnez@gmail.com", + ), + Snapshot( + id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"), + branches={ + b"HEAD": SnapshotBranch( + target=b"releases/2018.09.09", target_type=TargetType.ALIAS + ), + b"releases/2018.09.01": SnapshotBranch( + target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf", + target_type=TargetType.REVISION, + ), + b"releases/2018.09.09": SnapshotBranch( + target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa + target_type=TargetType.REVISION, + ), }, - "committer": { - "name": b"Andrew Nesbitt", - "fullname": b"Andrew Nesbitt ", - "email": b"andrewnez@gmail.com", + ), + Snapshot( + id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"), + branches={ + b"master": SnapshotBranch( + target=b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18", + target_type=TargetType.REVISION, + ) }, - "committer_date": { - "negative_utc": False, - "offset": 120, - "timestamp": {"microseconds": 0, "seconds": 1380883849,}, + ), + Snapshot( + id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), + branches={ + b"HEAD": SnapshotBranch( + target=REVISION.id, target_type=TargetType.REVISION, + ) }, - "type": "git", - "synthetic": False, - "date": { - "negative_utc": False, - "timestamp": {"seconds": 1487596456, "microseconds": 0,}, - "offset": 0, + ), + Snapshot( + id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), + branches={ + b"HEAD": SnapshotBranch( + target=REVISION.id, target_type=TargetType.REVISION, + ) }, - "directory": b"10", - "parents": (), - } + ), ] -DIRECTORY_ID = b"10" - -DIRECTORY_ENTRIES = [ - {"name": b"index.js", "type": "file", "target": b"abc", "perms": 33188,}, - {"name": b"package.json", "type": "file", "target": b"cde", "perms": 33188,}, - {"name": b".github", "type": "dir", "target": b"11", "perms": 16384,}, -] SHA1_TO_LICENSES = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"], "02fb2c89e14f7fab46701478c83779c7beb7b069": ["Apache2.0"], "103bc087db1d26afc3a0283f38663d081e9b01e6": ["MIT"], "688a5ef812c53907562fe379d4b3851e69c7cb15": ["AGPL"], "da39a3ee5e6b4b0d3255bfef95601890afd80709": [], } SHA1_TO_CTAGS = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": [ {"name": "foo", "kind": "str", "line": 10, "lang": "bar",} ], "d4c647f0fc257591cc9ba1722484229780d1c607": [ {"name": "let", "kind": "int", "line": 100, "lang": "haskell",} ], "688a5ef812c53907562fe379d4b3851e69c7cb15": [ {"name": "symbol", "kind": "float", "line": 99, "lang": "python",} ], } OBJ_STORAGE_DATA = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": b"this is some text", "688a5ef812c53907562fe379d4b3851e69c7cb15": b"another text", "8986af901dd2043044ce8f0d8fc039153641cf17": b"yet another text", "02fb2c89e14f7fab46701478c83779c7beb7b069": b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, "103bc087db1d26afc3a0283f38663d081e9b01e6": b""" #ifndef __AVL__ #define __AVL__ typedef struct 
_avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, "93666f74f1cf635c8c8ac118879da6ec5623c410": b""" (should 'pygments (recognize 'lisp 'easily)) """, "26a9f72a7c87cc9205725cfd879f514ff4f3d8d5": b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, "d4c647f0fc257591cc9ba1722484229780d1c607": b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, "a7ab314d8a11d2c93e3dcf528ca294e7b431c449": b""" """, "da39a3ee5e6b4b0d3255bfef95601890afd80709": b"", - # 626364 - hash_to_hex(b"bcd"): b"unimportant content for bcd", - # 636465 - hash_to_hex( - b"cde" - ): b""" + # was 626364 / b'bcd' + "e3e40fee6ff8a52f06c3b428bfe7c0ed2ef56e92": b"unimportant content for bcd", + # was 636465 / b'cde' now yarn-parser package.json + "f5305243b3ce7ef8dc864ebc73794da304025beb": b""" { "name": "yarn-parser", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "main": "index.js", "scripts": { "start": "node index.js", "test": "mocha" }, "engines": { "node": "9.8.0" }, "repository": { "type": "git", "url": "git+https://github.com/librariesio/yarn-parser.git" }, "keywords": [ "yarn", "parse", "lock", "dependencies" ], "author": "Andrew Nesbitt", "license": "AGPL-3.0", "bugs": { "url": "https://github.com/librariesio/yarn-parser/issues" }, "homepage": "https://github.com/librariesio/yarn-parser#readme", "dependencies": { "@yarnpkg/lockfile": "^1.0.0", "body-parser": "^1.15.2", "express": "^4.14.0" }, "devDependencies": { "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", "test": "^0.6.0" } } """, } + YARN_PARSER_METADATA = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "url": "https://github.com/librariesio/yarn-parser#readme", "codeRepository": "git+git+https://github.com/librariesio/yarn-parser.git", "author": [{"type": "Person", "name": "Andrew Nesbitt"}], "license": "https://spdx.org/licenses/AGPL-3.0", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "issueTracker": "https://github.com/librariesio/yarn-parser/issues", "name": "yarn-parser", "keywords": ["yarn", "parse", "lock", "dependencies"], "type": "SoftwareSourceCode", } json_dict_keys = strategies.one_of( strategies.characters(), strategies.just("type"), strategies.just("url"), strategies.just("name"), strategies.just("email"), strategies.just("@id"), strategies.just("@context"), strategies.just("repository"), strategies.just("license"), strategies.just("repositories"), strategies.just("licenses"), ) """Hypothesis strategy that generates strings, with an emphasis on those that are often used as dictionary keys in metadata files.""" generic_json_document = strategies.recursive( strategies.none() | strategies.booleans() | strategies.floats() | strategies.characters(), lambda children: ( 
strategies.lists(children, min_size=1) | strategies.dictionaries(json_dict_keys, children, min_size=1) ), ) """Hypothesis strategy that generates possible values for values of JSON metadata files.""" def json_document_strategy(keys=None): """Generates an hypothesis strategy that generates metadata files for a JSON-based format that uses the given keys.""" if keys is None: keys = strategies.characters() else: keys = strategies.one_of(map(strategies.just, keys)) return strategies.dictionaries(keys, generic_json_document, min_size=1) def _tree_to_xml(root, xmlns, data): def encode(s): "Skips unpaired surrogates generated by json_document_strategy" return s.encode("utf8", "replace") def to_xml(data, indent=b" "): if data is None: return b"" elif isinstance(data, (bool, str, int, float)): return indent + encode(str(data)) elif isinstance(data, list): return b"\n".join(to_xml(v, indent=indent) for v in data) elif isinstance(data, dict): lines = [] for (key, value) in data.items(): lines.append(indent + encode("<{}>".format(key))) lines.append(to_xml(value, indent=indent + b" ")) lines.append(indent + encode("</{}>".format(key))) return b"\n".join(lines) else: raise TypeError(data) return b"\n".join( [ '<{} xmlns="{}">'.format(root, xmlns).encode(), to_xml(data), "</{}>".format(root).encode(), ] ) class TreeToXmlTest(unittest.TestCase): def test_leaves(self): self.assertEqual( _tree_to_xml("root", "http://example.com", None), b'<root xmlns="http://example.com">\n\n</root>', ) self.assertEqual( _tree_to_xml("root", "http://example.com", True), b'<root xmlns="http://example.com">\n True\n</root>', ) self.assertEqual( _tree_to_xml("root", "http://example.com", "abc"), b'<root xmlns="http://example.com">\n abc\n</root>', ) self.assertEqual( _tree_to_xml("root", "http://example.com", 42), b'<root xmlns="http://example.com">\n 42\n</root>', ) self.assertEqual( _tree_to_xml("root", "http://example.com", 3.14), b'<root xmlns="http://example.com">\n 3.14\n</root>', ) def test_dict(self): self.assertIn( _tree_to_xml("root", "http://example.com", {"foo": "bar", "baz": "qux"}), [ b'<root xmlns="http://example.com">\n' b" <foo>\n  bar\n </foo>\n" b" <baz>\n  qux\n </baz>\n" b"</root>", b'<root xmlns="http://example.com">\n' b" <baz>\n  qux\n </baz>\n" b" <foo>\n  bar\n </foo>\n" b"</root>", ], ) def test_list(self): self.assertEqual( _tree_to_xml( "root", "http://example.com", [{"foo": "bar"}, {"foo": "baz"},] ), b'<root xmlns="http://example.com">\n' b" <foo>\n  bar\n </foo>\n" b" <foo>\n  baz\n </foo>\n" b"</root>", ) def xml_document_strategy(keys, root, xmlns): """Generates an hypothesis strategy that generates metadata files for an XML format that uses the given keys.""" return strategies.builds( functools.partial(_tree_to_xml, root, xmlns), json_document_strategy(keys) ) def filter_dict(d, keys): "return a copy of the dict with keys deleted" if not isinstance(keys, (list, tuple)): keys = (keys,) return dict((k, v) for (k, v) in d.items() if k not in keys) def fill_obj_storage(obj_storage): """Add some content in an object storage.""" for (obj_id, content) in OBJ_STORAGE_DATA.items(): obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) def fill_storage(storage): - visit_types = {} - for visit in ORIGIN_VISITS: - storage.origin_add_one({"url": visit["url"]}) - visit_types[visit["url"]] = visit["type"] - for snap in SNAPSHOTS: - origin_url = snap["origin"] + storage.origin_add(ORIGINS) + storage.directory_add([DIRECTORY, DIRECTORY2]) + storage.revision_add(REVISIONS) + storage.snapshot_add(SNAPSHOTS) + + for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS): + assert snapshot.id is not None + visit = storage.origin_visit_add( - [ - OriginVisit( - origin=origin_url, - date=now(), - type=visit_types[origin_url], - status="ongoing", - snapshot=None, - ) - ] + [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])] )[0] - snap_id = snap.get("id") or bytes([random.randint(0, 255) for _ in 
range(32)]) - storage.snapshot_add([{"id": snap_id, "branches": snap["branches"]}]) visit_status = OriginVisitStatus( - origin=origin_url, + origin=visit.origin, visit=visit.visit, date=now(), status="full", - snapshot=snap_id, + snapshot=snapshot.id, ) storage.origin_visit_status_add([visit_status]) - storage.revision_add(REVISIONS) contents = [] for (obj_id, content) in OBJ_STORAGE_DATA.items(): content_hashes = hashutil.MultiHash.from_data(content).digest() contents.append( - { - "data": content, - "length": len(content), - "status": "visible", - "sha1": hash_to_bytes(obj_id), - "sha1_git": hash_to_bytes(obj_id), - "sha256": content_hashes["sha256"], - "blake2s256": content_hashes["blake2s256"], - } + Content( + data=content, + length=len(content), + status="visible", + sha1=hash_to_bytes(obj_id), + sha1_git=hash_to_bytes(obj_id), + sha256=content_hashes["sha256"], + blake2s256=content_hashes["blake2s256"], + ) ) storage.content_add(contents) - storage.directory_add([{"id": DIRECTORY_ID, "entries": DIRECTORY_ENTRIES,}]) class CommonContentIndexerTest(metaclass=abc.ABCMeta): legacy_get_format = False """True if and only if the tested indexer uses the legacy format. see: https://forge.softwareheritage.org/T1433 """ def get_indexer_results(self, ids): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state def assert_legacy_results_ok(self, sha1s, expected_results=None): # XXX old format, remove this when all endpoints are # updated to the new one # see: https://forge.softwareheritage.org/T1433 sha1s = [ sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s ] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual( len(expected_results), len(actual_results), (expected_results, actual_results), ) for indexed_data in actual_results: _id = indexed_data["id"] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data["id"] = _id self.assertEqual(indexed_data, expected_data) def assert_results_ok(self, sha1s, expected_results=None): if self.legacy_get_format: self.assert_legacy_results_ok(sha1s, expected_results) return sha1s = [ sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s ] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual( len(expected_results), len(actual_results), (expected_results, actual_results), ) for indexed_data in actual_results: (_id, indexed_data) = list(indexed_data.items())[0] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data = [expected_data] self.assertEqual(indexed_data, expected_data) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update="update-dups") self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update="ignore-dups") self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [ self.id1, "799a5ef812c53907562fe379d4b3851e69c7cb15", # unknown "800a5ef812c53907562fe379d4b3851e69c7cb15", ] # unknown # when self.indexer.run(sha1s, policy_update="update-dups") # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" def setUp(self): self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data["id"] assert isinstance(_id, bytes) indexed_data = indexed_data.copy() indexed_data["id"] = hash_to_hex(indexed_data["id"]) self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) self.assertTrue(start <= _id <= end) _tool_id = indexed_data["indexer_configuration_id"] self.assertEqual(_tool_id, self.indexer.tool["id"]) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = list(self.indexer._index_contents(start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(map(hash_to_bytes, data_indexed)) ) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok(start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end) # then self.assertEqual(actual_results, {"status": "uneventful"}) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. 
""" _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertEqual(actual_results, {"status": "uneventful"}) def test_generate_content_get_no_result(self): """No result indexed returns False""" _start, _end = [ "0000000000000000000000000000000000000000", "0000000000000000000000000000000000000001", ] start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end, incremental=False) # then self.assertEqual(actual_results, {"status": "uneventful"}) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..8a1495e --- /dev/null +++ b/tox.ini @@ -0,0 +1,40 @@ +[tox] +envlist=black,flake8,mypy,py3 + +[testenv] +extras = + testing +deps = + pytest-cov + swh-scheduler[testing] >= 0.5.0 + swh-storage[testing] >= 0.10.0 + dev: pdbpp +commands = + pytest --doctest-modules \ + !slow: --hypothesis-profile=fast \ + slow: --hypothesis-profile=slow \ + {envsitepackagesdir}/swh/indexer \ + --cov={envsitepackagesdir}/swh/indexer \ + --cov-branch {posargs} + +[testenv:black] +skip_install = true +deps = + black +commands = + {envpython} -m black --check swh + +[testenv:flake8] +skip_install = true +deps = + flake8 +commands = + {envpython} -m flake8 + +[testenv:mypy] +extras = + testing +deps = + mypy +commands = + mypy swh diff --git a/version.txt b/version.txt deleted file mode 100644 index a538b5a..0000000 --- a/version.txt +++ /dev/null @@ -1 +0,0 @@ -v0.1.0-0-ga8307fc \ No newline at end of file