diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 1d3b249..0000000 --- a/.gitignore +++ /dev/null @@ -1,14 +0,0 @@ -*.pyc -*.sw? -*~ -.coverage -.eggs/ -__pycache__ -*.egg-info/ -build/ -dist/ -version.txt -/sql/createdb-stamp -/sql/filldb-stamp -.tox/ -.hypothesis/ \ No newline at end of file diff --git a/AUTHORS b/AUTHORS deleted file mode 100644 index 27d038e..0000000 --- a/AUTHORS +++ /dev/null @@ -1,3 +0,0 @@ -Copyright (C) 2015-2017 The Software Heritage developers - -See http://www.softwareheritage.org/ for more information. diff --git a/CONTRIBUTORS b/CONTRIBUTORS deleted file mode 100644 index 650fb84..0000000 --- a/CONTRIBUTORS +++ /dev/null @@ -1 +0,0 @@ -Siddharth Ravikumar diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 94a9ed0..0000000 --- a/LICENSE +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. 
Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. 
If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. 
Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. 
- - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. 
Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. 
- - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. 
- - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. 
If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. 
- - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. 
- - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the 
material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. 
- - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. 
If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. 
- - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. 
diff --git a/Makefile.local b/Makefile.local deleted file mode 100644 index c163514..0000000 --- a/Makefile.local +++ /dev/null @@ -1 +0,0 @@ -TESTFLAGS=--hypothesis-profile=fast diff --git a/PKG-INFO b/PKG-INFO index f936522..aea5aee 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.118 +Version: 0.0.124 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects 
files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/codemeta.json b/codemeta.json deleted file mode 100644 index 28306be..0000000 --- a/codemeta.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "@context": "https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld", - "@type": "SoftwareSourceCode", - "identifier": "5682a72dc61f86ae69f2841c2184d6159c0b6d5d", - "description": "Software Heritage Indexer for revisions and contents", - "name": "swh-indexer", - "isPartOf": { - "@type": "SoftwareSourceCode", - "name": "swh-environment", - "identifier": "83e766feafde91242883be1bf369ed3e6865824f" - }, - "codeRepository": "https://forge.softwareheritage.org/diffusion/78/", - "issueTracker": "https://forge.softwareheritage.org/maniphest/", - "license": "https://spdx.org/licenses/GPL-3.0.html", - "version": "0.0.35", - "author": [ - { - "@type": "Organization", - "name": "Software Heritage", - "url": "https://www.softwareheritage.org", - "email": "swh-devel@inria.fr" - }, - ], - "developmentStatus": "active", - "keywords": [ - "indexer", - "software", - "mimetype", - "ctags", - "language", - "fossology-license", - "metadata", - "metadata-detector", - "metadata-translator" - ], - "dateCreated":"2017-06-12", - "datePublished":"2017-06-12", - "programmingLanguage": "Python", -} diff --git a/conftest.py b/conftest.py deleted file mode 100644 index eb6de3d..0000000 --- a/conftest.py +++ /dev/null @@ -1,6 +0,0 @@ -from hypothesis import settings - -# define tests profile. 
Full documentation is at: -# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles -settings.register_profile("fast", max_examples=5, deadline=5000) -settings.register_profile("slow", max_examples=20, deadline=5000) diff --git a/debian/changelog b/debian/changelog deleted file mode 100644 index 3dce8ab..0000000 --- a/debian/changelog +++ /dev/null @@ -1,5 +0,0 @@ -swh-indexer (0.0.1-1) unstable; urgency=low - - * Import initial source package. - - -- Antoine R. Dumont (@ardumont) Tue, 27 Sep 2016 10:53:19 +0200 diff --git a/debian/compat b/debian/compat deleted file mode 100644 index ec63514..0000000 --- a/debian/compat +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/debian/control b/debian/control deleted file mode 100644 index 368686e..0000000 --- a/debian/control +++ /dev/null @@ -1,50 +0,0 @@ -Source: swh-indexer -Maintainer: Software Heritage developers -Section: python -Priority: optional -Build-Depends: debhelper (>= 9), - dh-python (>= 2), - python3-all, - python3-chardet (>= 2.3.0~), - python3-click, - python3-hypothesis (>= 3.11.0~), - python3-pytest, - python3-pygments, - python3-magic, - python3-pyld, - python3-setuptools, - python3-swh.core (>= 0.0.44~), - python3-swh.model (>= 0.0.15~), - python3-swh.objstorage (>= 0.0.28~), - python3-swh.scheduler (>= 0.0.35~), - python3-swh.storage (>= 0.0.113~), - python3-vcversioner, - python3-xmltodict -Standards-Version: 3.9.6 -Homepage: https://forge.softwareheritage.org/diffusion/78/ - -Package: python3-swh.indexer.storage -Architecture: all -Depends: python3-swh.core (>= 0.0.44~), - python3-swh.model (>= 0.0.15~), - python3-swh.objstorage (>= 0.0.28~), - python3-swh.scheduler (>= 0.0.35~), - python3-swh.storage (>= 0.0.113~), - ${misc:Depends}, - ${python3:Depends} -Description: Software Heritage Content Indexer Storage - -Package: python3-swh.indexer -Architecture: all -Depends: python3-swh.scheduler (>= 0.0.14~), - python3-swh.core (>= 0.0.44~), - python3-swh.model (>= 0.0.15~), - 
python3-swh.objstorage (>= 0.0.28~), - python3-swh.scheduler (>= 0.0.35~), - python3-swh.storage (>= 0.0.113~), - python3-swh.indexer.storage (= ${binary:Version}), - universal-ctags (>= 0.8~), - fossology-nomossa (>= 3.1~), - ${misc:Depends}, - ${python3:Depends} -Description: Software Heritage Content Indexer diff --git a/debian/copyright b/debian/copyright deleted file mode 100644 index 81d037d..0000000 --- a/debian/copyright +++ /dev/null @@ -1,22 +0,0 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ - -Files: * -Copyright: 2015 The Software Heritage developers -License: GPL-3+ - -License: GPL-3+ - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - . - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - . - You should have received a copy of the GNU General Public License - along with this program. If not, see . - . - On Debian systems, the complete text of the GNU General Public - License version 3 can be found in `/usr/share/common-licenses/GPL-3'. 
diff --git a/debian/rules b/debian/rules deleted file mode 100755 index 33bf8bb..0000000 --- a/debian/rules +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/make -f - -export PYBUILD_NAME=swh.indexer -export PYBUILD_TEST_ARGS=-m 'not db and not fs' - -%: - dh $@ --with python3 --buildsystem=pybuild - -override_dh_install: - dh_install - rm -v $(CURDIR)/debian/python3-*/usr/lib/python*/dist-packages/swh/__init__.py - for pyvers in $(shell py3versions -vr); do \ - mkdir -p $(CURDIR)/debian/python3-swh.indexer.storage/usr/lib/python$$pyvers/dist-packages/swh/indexer/storage/ ; \ - mv $(CURDIR)/debian/python3-swh.indexer/usr/lib/python$$pyvers/dist-packages/swh/indexer/storage/* \ - $(CURDIR)/debian/python3-swh.indexer.storage/usr/lib/python$$pyvers/dist-packages/swh/indexer/storage/ ; \ - done diff --git a/debian/source/format b/debian/source/format deleted file mode 100644 index 163aaf8..0000000 --- a/debian/source/format +++ /dev/null @@ -1 +0,0 @@ -3.0 (quilt) diff --git a/docs/.gitignore b/docs/.gitignore deleted file mode 100644 index 58a761e..0000000 --- a/docs/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -_build/ -apidoc/ -*-stamp diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index c30c50a..0000000 --- a/docs/Makefile +++ /dev/null @@ -1 +0,0 @@ -include ../../swh-docs/Makefile.sphinx diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index f4f2481..0000000 --- a/docs/README.md +++ /dev/null @@ -1,49 +0,0 @@ -swh-indexer -============ - -Tools to compute multiple indexes on SWH's raw contents: -- content: - - mimetype - - ctags - - language - - fossology-license - - metadata -- revision: - - metadata - -An indexer is in charge of: -- looking up objects -- extracting information from those objects -- store those information in the swh-indexer db - -There are multiple indexers working on different object types: - - content indexer: works with content sha1 hashes - - revision indexer: works with revision sha1 hashes - - origin 
indexer: works with origin identifiers - -Indexation procedure: -- receive batch of ids -- retrieve the associated data depending on object type -- compute for that object some index -- store the result to swh's storage - -Current content indexers: - -- mimetype (queue swh_indexer_content_mimetype): detect the encoding - and mimetype - -- language (queue swh_indexer_content_language): detect the - programming language - -- ctags (queue swh_indexer_content_ctags): compute tags information - -- fossology-license (queue swh_indexer_fossology_license): compute the - license - -- metadata: translate file into translated_metadata dict - -Current revision indexers: - -- metadata: detects files containing metadata and retrieves translated_metadata - in content_metadata table in storage or run content indexer to translate - files. diff --git a/docs/_static/.placeholder b/docs/_static/.placeholder deleted file mode 100644 index e69de29..0000000 diff --git a/docs/_templates/.placeholder b/docs/_templates/.placeholder deleted file mode 100644 index e69de29..0000000 diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 190deb7..0000000 --- a/docs/conf.py +++ /dev/null @@ -1 +0,0 @@ -from swh.docs.sphinx.conf import * # NoQA diff --git a/docs/dev-info.rst b/docs/dev-info.rst deleted file mode 100644 index 493b102..0000000 --- a/docs/dev-info.rst +++ /dev/null @@ -1,206 +0,0 @@ -Hacking on swh-indexer -====================== - -This tutorial will guide you through the hacking on the swh-indexer. -If you do not have a local copy of the Software Heritage archive, go to the -`getting started tutorial -`_ - -Configuration files -------------------- -You will need the following YAML configuration files to run the swh-indexer -commands: - -- Orchestrator at - ``~/.config/swh/indexer/orchestrator.yml`` - -.. 
code-block:: yaml - - indexers: - mimetype: - check_presence: false - batch_size: 100 - -- Orchestrator-text at - ``~/.config/swh/indexer/orchestrator-text.yml`` - -.. code-block:: yaml - - indexers: - # language: - # batch_size: 10 - # check_presence: false - fossology_license: - batch_size: 10 - check_presence: false - # ctags: - # batch_size: 2 - # check_presence: false - -- Mimetype indexer at - ``~/.config/swh/indexer/mimetype.yml`` - -.. code-block:: yaml - - # storage to read sha1's metadata (path) - # storage: - # cls: local - # args: - # db: "service=swh-dev" - # objstorage: - # cls: pathslicing - # args: - # root: /home/storage/swh-storage/ - # slicing: 0:1/1:5 - - storage: - cls: remote - args: - url: http://localhost:5002/ - - indexer_storage: - cls: remote - args: - url: http://localhost:5007/ - - # storage to read sha1's content - # adapt this to your need - # locally: this needs to match your storage's setup - objstorage: - cls: pathslicing - args: - slicing: 0:1/1:5 - root: /home/storage/swh-storage/ - - destination_task: swh.indexer.tasks.SWHOrchestratorTextContentsTask - rescheduling_task: swh.indexer.tasks.SWHContentMimetypeTask - - -- Fossology indexer at - ``~/.config/swh/indexer/fossology_license.yml`` - -.. 
code-block:: yaml - - # storage to read sha1's metadata (path) - # storage: - # cls: local - # args: - # db: "service=swh-dev" - # objstorage: - # cls: pathslicing - # args: - # root: /home/storage/swh-storage/ - # slicing: 0:1/1:5 - - storage: - cls: remote - url: http://localhost:5002/ - - indexer_storage: - cls: remote - args: - url: http://localhost:5007/ - - # storage to read sha1's content - # adapt this to your need - # locally: this needs to match your storage's setup - objstorage: - cls: pathslicing - args: - slicing: 0:1/1:5 - root: /home/storage/swh-storage/ - - workdir: /tmp/swh/worker.indexer/license/ - - tools: - name: 'nomos' - version: '3.1.0rc2-31-ga2cbb8c' - configuration: - command_line: 'nomossa ' - - -- Worker at - ``~/.config/swh/worker.yml`` - -.. code-block:: yaml - - task_broker: amqp://guest@localhost// - task_modules: - - swh.loader.svn.tasks - - swh.loader.tar.tasks - - swh.loader.git.tasks - - swh.storage.archiver.tasks - - swh.indexer.tasks - - swh.indexer.orchestrator - task_queues: - - swh_loader_svn - - swh_loader_tar - - swh_reader_git_to_azure_archive - - swh_storage_archive_worker_to_backend - - swh_indexer_orchestrator_content_all - - swh_indexer_orchestrator_content_text - - swh_indexer_content_mimetype - - swh_indexer_content_language - - swh_indexer_content_ctags - - swh_indexer_content_fossology_license - - swh_loader_svn_mount_and_load - - swh_loader_git_express - - swh_loader_git_archive - - swh_loader_svn_archive - task_soft_time_limit: 0 - - -Database --------- - -swh-indxer uses a database to store the indexed content. The default -db is expected to be called swh-indexer-dev. - -Create or add ``swh-dev`` and ``swh-indexer-dev`` to -the ``~/.pg_service.conf`` and ``~/.pgpass`` files, which are postgresql's -configuration files. 
- -Add data to local DB --------------------- -from within the ``swh-environment``, run the following command:: - - make rebuild-testdata - -and fetch some real data to work with, using:: - - python3 -m swh.loader.git.updater --origin-url - -Then you can list all content files using this script:: - - #!/usr/bin/env bash - - psql service=swh-dev -c "copy (select sha1 from content) to stdin" | sed -e 's/^\\\\x//g' - -Run the indexers ------------------ -Use the list off contents to feed the indexers with with the -following command:: - - ./list-sha1.sh | python3 -m swh.indexer.producer --batch 100 --task-name orchestrator_all - -Activate the workers --------------------- -To send messages to different queues using rabbitmq -(which should already be installed through dependencies installation), -run the following command in a dedicated terminal:: - - python3 -m celery worker --app=swh.scheduler.celery_backend.config.app \ - --pool=prefork \ - --concurrency=1 \ - -Ofair \ - --loglevel=info \ - --without-gossip \ - --without-mingle \ - --without-heartbeat 2>&1 - -With this command rabbitmq will consume message using the worker -configuration file. - -Note: for the fossology_license indexer, you need a package fossology-nomossa -which is in our `public debian repository -`_. diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 375ddab..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. _swh-indexer: - -Software Heritage - Indexer -=========================== - -Tools and workers used to mine the content of the archive and extract derived -information from archive source code artifacts. - - -.. toctree:: - :maxdepth: 1 - :caption: Contents: - - README.md - dev-info.rst - - -Reference Documentation ------------------------ - -.. 
toctree:: - :maxdepth: 2 - - /apidoc/swh.indexer diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index afa4cf3..0000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -norecursedirs = docs diff --git a/requirements-swh.txt b/requirements-swh.txt index 65b046a..99f0ade 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.44 swh.model >= 0.0.15 swh.objstorage >= 0.0.28 -swh.scheduler >= 0.0.35 +swh.scheduler >= 0.0.36 swh.storage >= 0.0.113 diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index d3fc701..0000000 --- a/requirements-test.txt +++ /dev/null @@ -1,2 +0,0 @@ -pytest -hypothesis (>= 3.11.0) diff --git a/sql/createdb-stamp b/sql/createdb-stamp deleted file mode 100644 index e69de29..0000000 diff --git a/sql/filldb-stamp b/sql/filldb-stamp deleted file mode 100644 index e69de29..0000000 diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index f936522..aea5aee 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.118 +Version: 0.0.124 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple 
indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 62ceecb..7a5d90d 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,99 +1,72 @@ -.gitignore -AUTHORS -CONTRIBUTORS -LICENSE MANIFEST.in Makefile -Makefile.local README.md -codemeta.json -conftest.py -pytest.ini requirements-swh.txt -requirements-test.txt requirements.txt setup.py -tox.ini version.txt -debian/changelog -debian/compat -debian/control -debian/copyright -debian/rules -debian/source/format -docs/.gitignore -docs/Makefile -docs/README.md -docs/conf.py -docs/dev-info.rst -docs/index.rst -docs/_static/.placeholder -docs/_templates/.placeholder 
-sql/createdb-stamp -sql/filldb-stamp sql/bin/db-upgrade sql/bin/dot_add_content -sql/doc/json sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/language.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/metadata_dictionary.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/in_memory.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/tests/__init__.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_language.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/test_utils.py swh/indexer/tests/storage/__init__.py 
swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index 1d35591..5bba96f 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,16 +1,16 @@ -chardet +vcversioner +pygments click +chardet file_magic -pygments pyld +xmltodict swh.core>=0.0.44 swh.model>=0.0.15 swh.objstorage>=0.0.28 -swh.scheduler>=0.0.35 +swh.scheduler>=0.0.36 swh.storage>=0.0.113 -vcversioner -xmltodict [testing] -hypothesis>=3.11.0 pytest +hypothesis>=3.11.0 diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py index 492e7c0..1ad6022 100644 --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -1,155 +1,156 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess import json from swh.model import hashutil from .language import compute_language from .indexer import ContentIndexer, DiskIndexer # Options used to compute tags __FLAGS = [ '--fields=+lnz', # +l: language # +n: line number of tag definition # +z: include the symbol's kind (function, variable, ...) '--sort=no', # sort output on tag name '--links=no', # do not follow symlinks '--output-format=json', # outputs in json ] def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. 
Args: path: path to the file lang: language for that path (optional) - Returns: - ctags' output + Yields: + dict: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } class CtagsIndexer(ContentIndexer, DiskIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... }) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def compute_ctags(self, path, lang): """Compute ctags on file at path with language lang. """ return run_ctags(path, lang=lang) def index(self, id, data): """Index sha1s' content and store result. 
Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: - A dict, representing a content_mimetype, with keys: - - id (bytes): content's identifier (sha1) - - ctags ([dict]): ctags list of symbols + dict: a dict representing a content_mimetype with keys: + + - **id** (bytes): content's identifier (sha1) + - **ctags** ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) content_path = self.write_to_temp( filename=filename, data=data) result = run_ctags(content_path, lang=ctags_lang) ctags.update({ 'ctags': list(result), 'indexer_configuration_id': self.tool['id'], }) self.cleanup(content_path) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the - following keys: + following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/data/codemeta/crosswalk.csv b/swh/indexer/data/codemeta/crosswalk.csv index 3fc65de..0387b4c 100644 --- a/swh/indexer/data/codemeta/crosswalk.csv +++ b/swh/indexer/data/codemeta/crosswalk.csv @@ -1,77 +1,77 @@ -Parent Type,Property,Type,Description,codemeta-V1,DataCite,OntoSoft,Zenodo,GitHub,Figshare,Software Ontology,Software Discovery Index,Dublin Core,R Package Description,Debian Package,Python Distutils (PyPI),Trove Software Map,Perl Module Description (CPAN::Meta),NodeJS,Java (Maven),Octave,Ruby Gem,ASCL,DOAP,Wikidata,Citation File Format Core (CFF-Core) 1.0.2 
-schema:SoftwareSourceCode,codeRepository,URL,"Link to the repository where the un-compiled, human readable code and related code is located (SVN, github, CodePlex).",codeRepository,,,relatedLink,html_url,relatedLink,,,,URL,HomePage,url,,resouces.repository,repository,repositories,,homepage,site_list,repository,source code repository,repository-code -schema:SoftwareSourceCode,programmingLanguage,ComputerLanguage or Text,The computer programming language.,programmingLanguage,Format,hasProgrammingLanguage,,languages_url,,programming language,,,,,classifiers['Programming Language'],Programming Language,,,,,,,programming-language,programming language, -schema:SoftwareSourceCode,runtimePlatform,Text,"Runtime platform or script interpreter dependencies (Example - Java v1, Python2.3, .Net Framework 3.0). Supersedes runtime.",,,,,,,,,,,,,,,,,,platform,,platform,, -schema:SoftwareSourceCode,targetProduct,SoftwareApplication,"Target Operating System / Product to which the code applies. If applies to several versions, just the product name can be used.",,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,applicationCategory,Text or URL,"Type of software application, e.g. 'Game, Multimedia'.",,,hasSoftwareCategory,communities,,categories,,,,,,classifiers['Topic'],Topic,Categories,,,Categories,,,,, -schema:SoftwareApplication,applicationSubCategory,Text or URL,"Subcategory of the application, e.g. 'Arcade Game'.",,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,downloadUrl,URL,"If the file can be downloaded, URL to download the binary.",downloadLink,,,,archive_url,,,,,,,,,,,,,,,download-page,,repository-artifact -schema:SoftwareApplication,fileSize,Text,"Size of the application / package (e.g. 18MB). 
In the absence of a unit (MB, KB etc.), KB will be assumed.",,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,installUrl,URL,"URL at which the app may be installed, if different from the URL of the item.",,,,,,,,,,,,,,,,,,,,download-mirror,, -schema:SoftwareApplication,memoryRequirements,Text or URL,Minimum memory requirements.,,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,operatingSystem,Text,"Operating systems supported (Windows 7, OSX 10.6, Android 1.6).",operatingSystems,,SupportsOperatingSystem,,,,,,,,,classifiers['Operating System'],Operating System,OSNAMES,os,,,,,os,operating system, -schema:SoftwareApplication,permissions,Text,"Permission(s) required to run the app (for example, a mobile app may require full internet access or may run only on wifi).",,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,processorRequirements,Text,Processor architecture required to run the application (e.g. IA64).,,,,,,,,,,,,,,,cpu / engines,,,,,,, -schema:SoftwareApplication,releaseNotes,Text or URL,Description of what changed in this version.,,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,softwareHelp,CreativeWork,Software application help.,,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,softwareRequirements,SoftwareSourceCode,Required software dependencies,depends,,hasDependency->Software,,,,,"""Platform, environment, and dependencies""",,"Depends, SystemRequirements",,install_requires,Database Environment,prereqs,dependencies / bundledDependencies / bundleDependencies / peerDependencies,prerequisites,"Depends, SystemRequirements","requirements, add_runtime_dependency",,,depends on software, -schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,release,software version, -schema:SoftwareApplication,storageRequirements,Text or URL,Storage requirements (free space required).,,,,,,,,,,,,,,,,,,,,,, -schema:SoftwareApplication,supportingData,DataFeed,Supporting data for a SoftwareApplication.,,,,,,,,,,,,,,,,,,,,,, 
-schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,,,,author,,,author,,developer,,authors -schema:CreativeWork,citation,CreativeWork or URL,"A citation or reference to another creative work, such as another publication, web page, scholarly article, etc.",relatedLink,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,contributor,Organization or Person,A secondary contributor to the CreativeWork or Event.,,,,,,,,,,[ctb] in Author,,,,,contributor,,,,,developer,, -schema:CreativeWork,copyrightHolder,Organization or Person,The party holding the legal copyright to the CreativeWork.,agents [role=copyrightHolder],,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,copyrightYear,Number,The year during which the claimed copyright for the CreativeWork was first asserted.,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,creator,Organization or Person,The creator/author of this CreativeWork. 
This is the same as the Author property for CreativeWork.,agent,,,,,,,,creator,[cre] in Author,,,,,author,,,,,,, -schema:CreativeWork,dateCreated,Date or DateTime,The date on which the CreativeWork was created or the item was added to a DataFeed.,dateCreated,date,,,created_at,,,,created,,Date,,,,,,,,,,, -schema:CreativeWork,dateModified,Date or DateTime,The date on which the CreativeWork was most recently modified or when the item's entry was modified within a DataFeed.,dateModified,date,,,updated_at,,,,,,,,last-updated,,,,,,,,, -schema:CreativeWork,datePublished,Date,Date of first broadcast/publication.,datePublished,publicationYear,,date_published,,date_retrieved,,,date,Date,,,,,,,Date,,,,publication date,date-released -schema:CreativeWork,editor,Person,Specifies the Person who edited the CreativeWork.,,,,,,,,,,,,,,,,,,,,,editor, -schema:CreativeWork,encoding,MediaObject,A media object that encodes this CreativeWork. This property is a synonym for associatedMedia. Supersedes encodings.,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,fileFormat,Text or URL,"Media type, typically MIME format (see IANA site) of the content e.g. application/zip of a SoftwareApplication binary. In cases where a CreativeWork has several media type representations, 'encoding' can be used to indicate each MediaObject alongside particular fileFormat information. Unregistered or niche file formats can be indicated instead via the most appropriate URL, e.g. defining Web page or a Wikipedia entry.",,Format,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,funder,Organization or Person,A person or organization that supports (sponsors) something through some kind of financial contribution.,fundingReference.funderName,,,contributors.Funder,,,,,,,,,,,,,,,,,, -schema:CreativeWork,keywords,Text,Keywords or tags used to describe this content. 
Multiple entries in a keywords list are typically delimited by commas.,controlledTerms,subject,hasDomainKeywords,keywords,,tags,,,,,,keywords,,keywords,keywords,,,,,category,,keywords -schema:CreativeWork,license,CreativeWork or URL,"A license document that applies to this content, typically indicated by URL.",licenseId,rights,License,license,license,License,software license,Software license,license,License,,license,license,license,license,licesnse,License,license/licenses,,license,license,license/license-url -schema:CreativeWork,producer,Organization or Person,"The person or organization who produced the work (e.g. music album, movie, tv/radio series etc.).",,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,provider,Organization or Person,"The service provider, service operator, or service performer; the goods producer. Another party (a seller) may offer those services or goods on behalf of the provider. A provider may also serve as the seller. Supersedes carrier.",,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,publisher,Organization or Person,The publisher of the creative work.,publisher,publisher,os:hasPublisher,,,,software publisher organization,,publisher,,,,,,,,,,,vendor,, -schema:CreativeWork,sponsor,Organization or Person,"A person or organization that supports a thing through a pledge, promise, or financial contribution. e.g. a sponsor of a Medical Study or a corporate sponsor of an event.",,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,version,Number or Text,The version of the CreativeWork embodied by a specified resource.,version,version,hasSoftwareVersion,,,,Version,Software version,dcterms:hasVersion,,numeric_version,Version,version,,version,version,version,version,,,,version -schema:CreativeWork,isAccessibleForFree,Boolean,A flag to signal that the publication is accessible for free.,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,isPartOf,CreativeWork,Indicates a CreativeWork that this CreativeWork is (in some sense) part of. 
Reverse property hasPart,,,,,,,,,,,,,,,,,,,,,,references -schema:CreativeWork,hasPart,CreativeWork,Indicates a CreativeWork that is (in some sense) a part of this CreativeWork. Reverse property isPartOf,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,position,Integer or Text,"The position of an item in a series or sequence of items. (While schema.org considers this a property of CreativeWork, it is also the way to indicate ordering in any list (e.g. the Authors list). By default arrays are unordered in JSON-LD",,,,,,,,,,,,,,,,,,,,,, -schema:Thing,description,Text,A description of the item.,description,description,hasShortDescription,description/notes,description,Description,software,,description,Description,Description,"description, long_description",description,"abstract, description",description,description,Description,"summary, description",abstract,,,abstract -schema:Thing,identifier,PropertyValue or URL,"The identifier property represents any kind of identifier for any kind of Thing, such as ISBNs, GTIN codes, UUIDs etc. Schema.org provides dedicated properties for representing many of these, either as textual strings or as URL (URI) links. See background notes for more details.",identifier,identifier,hasUniqueId,id,id,,,Persistent Identifier,identifier,Package,Package,,,,name,groupId,,,ascl_id,,,doi -schema:Thing,name,Text,"The name of the item (software, Organization)",name,,hasName,title,full_name,Title,SoftwareTitle,Software title,title,Title,,name,Title,name,name,name,name,name,title,,,title -schema:Thing,sameAs,URL,"URL of a reference Web page that unambiguously indicates the item's identity. E.g. the URL of the item's Wikipedia page, Wikidata entry, or official website.",,,,,,,,,,,,,,,,,,,,,, -schema:Thing,url,URL,URL of the item.,URL,,,,,,,,,URL,,,,,homepage,,URL,,,homepage,official website,url -schema:Thing,relatedLink,URL,"A link related to this object, e.g. 
related web pages",,RelateIdentifier,,,,,,,,,,,,,,,,,,,, -schema:Person,givenName,Text,"Given name. In the U.S., the first name of a Person. This can be used along with familyName instead of the name property",,givenName,,,,,,,,givenName,,,,,,,,,,,,person.given-names -schema:Person,familyName,Text,"Family name. In the U.S., the last name of an Person. This can be used along with givenName instead of the name property.",,familyName,,,,,,,,familyName,,,,,,,,,,,,person.name-particle + person.family-names + person.name-suffix -schema:Person,email,Text,Email address,email,,,,,,,,,email,,author_email,,email-address,author.email,,,email,email,,,person.email/entity.email -schema:Person,affiliation,Text,"An organization that this person is affiliated with. For example, a school/university",affiliation,affiliation,,affiliation,,,,,,,,,,,,,,,,,,person.affiliation -schema:Person,identifier,URL,"URL identifer, ideally an ORCID ID for individuals, a FundRef ID for funders",identifier,nameIdentifier,,ORCID,,ORCID,,,,,,,,,,,,,,,,person.orcid / entity.orcid -schema:Person,name,Text,"The name of an Organization, or if separate given and family names cannot be resolved for a Person",,,,name,,name,,,,,,,,author:contact-name,author.name,,,,,,,entity.name -schema:Person,address,PostalAddress or Text,Physical address of the item.,,,,,,,,,,,,,,,,,,,,,,person.address + person.city + person.region + person.post-code + person.country / entity.address + entity.city + entity.region + entity.post-code + entity.country -schema,type,Object Type (from context or URI),"The object type (e.g. ""Person"", ""Organization"", ""ScientificArticle"", ""SoftwareApplication"", etc).",,,,,,,,,,,,,,,,,,,,,,reference.type -schema,id,URL,Primary identifier for an object. Must be a resolvable URL or a string used to refer to this node elsewhere in the same document,,,,,,,,,,,,,,,,,,,,,, -codemeta:SoftwareSourceCode,softwareSuggestions,SoftwareSourceCode,"Optional dependencies , e.g. 
for optional features, code development, etc",suggests,,,,,,,,,Suggests,,,,,devDependencies / optionalDependencies,,BuildDepends,add_development_dependency,,,, -codemeta:SoftwareSourceCode,maintainer,Person,Individual responsible for maintaining the software (usually includes an email contact address),uploadedBy,,,,,,,,,Maintainer,,,,,,,,,,maintainer,, -codemeta:SoftwareSourceCode,contIntegration,URL,link to continuous integration service,contIntegration,,,,,,,,,,,,,,,ciManagement,,,,,, -codemeta:SoftwareSourceCode,buildInstructions,URL,link to installation instructions/documentation,buildInstructions,,,,,,,,,,,,,,,,,,,,, -codemeta:SoftwareSourceCode,developmentStatus,Text,"Description of development status, e.g. Active, inactive, supsended. See repostatus.org",developmentStatus,,activeDevelopment,,,,,,,,,classifiers['Development Status'],Development Status,release_status,,,,,,,, -codemeta:SoftwareSourceCode,embargoDate,Date,"Software may be embargoed from public access until a specified date (e.g. pending publication, 1 year from publication)",embargoDate,,,,,embargo_date,,,,,,,,,,,,,,,, -codemeta:SoftwareSourceCode,funding,Text,Funding source (e.g. 
specific grant),funding,,fundingReference.awardTitle or fundingReference.awardNumber,,,,,,,,,,,,,,,,,,, -codemeta:SoftwareSourceCode,issueTracker,URL,link to software bug reporting or issue tracking system,issueTracker,,,,issues_url,,,,,BugReports,,,,resources.bugtracker,bugs,issuesManagement,Problems,,,bug-database,bug tracking system,repository -codemeta:SoftwareSourceCode,referencePublication,ScholarlyArticle,An academic publication related to the software.,relatedPublications,,,,,,,,,,,,,,,,,,,blog,,references -codemeta:SoftwareSourceCode,readme,URL,link to software Readme file,readme,,,,,,,,,,,,,,,,,,,,, -,,,,relatedIdentifer,,,,,,,,,,,,,,,,,,,,, -,,,,relatedIdentiferType,,,,,,,,,,,,,,,,,,,,, -,,,,relationshipType,,,,,,,,,,,,,,,,,,,,, -,,,,title,,,,,,,,,,,,,,,,,,,,, -,,,,namespace,,,,,,,,,,,,,,,,,,,,, -,,,,role,,,,,,,,,,,,,,,,,,,,, -,,,,roleCode,,,,,,,,,,,,,,,,,,,,, -,,,,softwarePaperCitationIdenifiers,,,,,,,,,,,,,,,,,,,,, +Parent Type,Property,Type,Description,codemeta-V1,DataCite,OntoSoft,Zenodo,GitHub,Figshare,Software Ontology,Software Discovery Index,Dublin Core,R Package Description,Debian Package,Python Distutils (PyPI),Python PKG-INFO,Trove Software Map,Perl Module Description (CPAN::Meta),NodeJS,Java (Maven),Octave,Ruby Gem,ASCL,DOAP,Wikidata,Citation File Format Core (CFF-Core) 1.0.2 +schema:SoftwareSourceCode,codeRepository,URL,"Link to the repository where the un-compiled, human readable code and related code is located (SVN, github, CodePlex).",codeRepository,,,relatedLink,html_url,relatedLink,,,,URL,HomePage,,,,resources.repository,repository,repositories,,homepage,site_list,repository,source code repository,repository-code +schema:SoftwareSourceCode,programmingLanguage,ComputerLanguage or Text,The computer programming language.,programmingLanguage,Format,hasProgrammingLanguage,,languages_url,,programming language,,,,,classifiers['Programming Language'],,Programming Language,,,,,,,programming-language,programming language, 
+schema:SoftwareSourceCode,runtimePlatform,Text,"Runtime platform or script interpreter dependencies (Example - Java v1, Python2.3, .Net Framework 3.0). Supersedes runtime.",,,,,,,,,,,,,,,,,,,platform,,platform,, +schema:SoftwareSourceCode,targetProduct,SoftwareApplication,"Target Operating System / Product to which the code applies. If applies to several versions, just the product name can be used.",,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,applicationCategory,Text or URL,"Type of software application, e.g. 'Game, Multimedia'.",,,hasSoftwareCategory,communities,,categories,,,,,,classifiers['Topic'],,Topic,Categories,,,Categories,,,,, +schema:SoftwareApplication,applicationSubCategory,Text or URL,"Subcategory of the application, e.g. 'Arcade Game'.",,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,downloadUrl,URL,"If the file can be downloaded, URL to download the binary.",downloadLink,,,,archive_url,,,,,,,download_url,Download-URL,,,,,,,,download-page,,repository-artifact +schema:SoftwareApplication,fileSize,Text,"Size of the application / package (e.g. 18MB). 
In the absence of a unit (MB, KB etc.), KB will be assumed.",,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,installUrl,URL,"URL at which the app may be installed, if different from the URL of the item.",,,,,,,,,,,,,,,,,,,,,download-mirror,, +schema:SoftwareApplication,memoryRequirements,Text or URL,Minimum memory requirements.,,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,operatingSystem,Text,"Operating systems supported (Windows 7, OSX 10.6, Android 1.6).",operatingSystems,,SupportsOperatingSystem,,,,,,,,,classifiers['Operating System'],,Operating System,OSNAMES,os,,,,,os,operating system, +schema:SoftwareApplication,permissions,Text,"Permission(s) required to run the app (for example, a mobile app may require full internet access or may run only on wifi).",,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,processorRequirements,Text,Processor architecture required to run the application (e.g. IA64).,,,,,,,,,,,,,,,,cpu / engines,,,,,,, +schema:SoftwareApplication,releaseNotes,Text or URL,Description of what changed in this version.,,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,softwareHelp,CreativeWork,Software application help.,,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,softwareRequirements,SoftwareSourceCode,Required software dependencies,depends,,hasDependency->Software,,,,,"""Platform, environment, and dependencies""",,"Depends, SystemRequirements",,install_requires,Requires,Database Environment,prereqs,dependencies / bundledDependencies / bundleDependencies / peerDependencies,prerequisites,"Depends, SystemRequirements","requirements, add_runtime_dependency",,,depends on software, +schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,software version, +schema:SoftwareApplication,storageRequirements,Text or URL,Storage requirements (free space required).,,,,,,,,,,,,,,,,,,,,,,, +schema:SoftwareApplication,supportingData,DataFeed,Supporting data for a 
SoftwareApplication.,,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author,,developer,,authors +schema:CreativeWork,citation,CreativeWork or URL,"A citation or reference to another creative work, such as another publication, web page, scholarly article, etc.",relatedLink,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,contributor,Organization or Person,A secondary contributor to the CreativeWork or Event.,,,,,,,,,,[ctb] in Author,,,,,,contributor,,,,,developer,, +schema:CreativeWork,copyrightHolder,Organization or Person,The party holding the legal copyright to the CreativeWork.,agents [role=copyrightHolder],,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,copyrightYear,Number,The year during which the claimed copyright for the CreativeWork was first asserted.,,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,creator,Organization or Person,The creator/author of this CreativeWork. 
This is the same as the Author property for CreativeWork.,agent,,,,,,,,creator,[cre] in Author,,,,,,author,,,,,,, +schema:CreativeWork,dateCreated,Date or DateTime,The date on which the CreativeWork was created or the item was added to a DataFeed.,dateCreated,date,,,created_at,,,,created,,Date,,,,,,,,,,,, +schema:CreativeWork,dateModified,Date or DateTime,The date on which the CreativeWork was most recently modified or when the item's entry was modified within a DataFeed.,dateModified,date,,,updated_at,,,,,,,,,last-updated,,,,,,,,, +schema:CreativeWork,datePublished,Date,Date of first broadcast/publication.,datePublished,publicationYear,,date_published,,date_retrieved,,,date,Date,,,,,,,,Date,,,,publication date,date-released +schema:CreativeWork,editor,Person,Specifies the Person who edited the CreativeWork.,,,,,,,,,,,,,,,,,,,,,,editor, +schema:CreativeWork,encoding,MediaObject,A media object that encodes this CreativeWork. This property is a synonym for associatedMedia. Supersedes encodings.,,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,fileFormat,Text or URL,"Media type, typically MIME format (see IANA site) of the content e.g. application/zip of a SoftwareApplication binary. In cases where a CreativeWork has several media type representations, 'encoding' can be used to indicate each MediaObject alongside particular fileFormat information. Unregistered or niche file formats can be indicated instead via the most appropriate URL, e.g. defining Web page or a Wikipedia entry.",,Format,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,funder,Organization or Person,A person or organization that supports (sponsors) something through some kind of financial contribution.,fundingReference.funderName,,,contributors.Funder,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,keywords,Text,Keywords or tags used to describe this content. 
Multiple entries in a keywords list are typically delimited by commas.,controlledTerms,subject,hasDomainKeywords,keywords,,tags,,,,,,keywords,Keywords,,keywords,keywords,,,,,category,,keywords +schema:CreativeWork,license,CreativeWork or URL,"A license document that applies to this content, typically indicated by URL.",licenseId,rights,License,license,license,License,software license,Software license,license,License,,license,License,license,license,license,license,License,license/licenses,,license,license,license/license-url +schema:CreativeWork,producer,Organization or Person,"The person or organization who produced the work (e.g. music album, movie, tv/radio series etc.).",,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,provider,Organization or Person,"The service provider, service operator, or service performer; the goods producer. Another party (a seller) may offer those services or goods on behalf of the provider. A provider may also serve as the seller. Supersedes carrier.",,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,publisher,Organization or Person,The publisher of the creative work.,publisher,publisher,os:hasPublisher,,,,software publisher organization,,publisher,,,,,,,,,,,,vendor,, +schema:CreativeWork,sponsor,Organization or Person,"A person or organization that supports a thing through a pledge, promise, or financial contribution. e.g. a sponsor of a Medical Study or a corporate sponsor of an event.",,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,version,Number or Text,The version of the CreativeWork embodied by a specified resource.,version,version,hasSoftwareVersion,,,,Version,Software version,dcterms:hasVersion,,numeric_version,Version,Version,version,,version,version,version,version,,,,version +schema:CreativeWork,isAccessibleForFree,Boolean,A flag to signal that the publication is accessible for free.,,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,isPartOf,CreativeWork,Indicates a CreativeWork that this CreativeWork is (in some sense) part of. 
Reverse property hasPart,,,,,,,,,,,,,,,,,,,,,,,references +schema:CreativeWork,hasPart,CreativeWork,Indicates a CreativeWork that is (in some sense) a part of this CreativeWork. Reverse property isPartOf,,,,,,,,,,,,,,,,,,,,,,, +schema:CreativeWork,position,Integer or Text,"The position of an item in a series or sequence of items. (While schema.org considers this a property of CreativeWork, it is also the way to indicate ordering in any list (e.g. the Authors list). By default arrays are unordered in JSON-LD",,,,,,,,,,,,,,,,,,,,,,, +schema:Thing,description,Text,A description of the item.,description,description,hasShortDescription,description/notes,description,Description,software,,description,Description,Description,"description, long_description",Summary / Description,description,"abstract, description",description,description,Description,"summary, description",abstract,,,abstract +schema:Thing,identifier,PropertyValue or URL,"The identifier property represents any kind of identifier for any kind of Thing, such as ISBNs, GTIN codes, UUIDs etc. Schema.org provides dedicated properties for representing many of these, either as textual strings or as URL (URI) links. See background notes for more details.",identifier,identifier,hasUniqueId,id,id,,,Persistent Identifier,identifier,Package,Package,,,,,name,groupId,,,ascl_id,,,doi +schema:Thing,name,Text,"The name of the item (software, Organization)",name,,hasName,title,full_name,Title,SoftwareTitle,Software title,title,Title,,name,Name,Title,name,name,name,name,name,title,,,title +schema:Thing,sameAs,URL,"URL of a reference Web page that unambiguously indicates the item's identity. E.g. the URL of the item's Wikipedia page, Wikidata entry, or official website.",,,,,,,,,,,,,,,,,,,,,,, +schema:Thing,url,URL,URL of the item.,URL,,,,,,,,,URL,,url,Home-Page,,,homepage,,URL,,,homepage,official website,url +schema:Thing,relatedLink,URL,"A link related to this object, e.g. 
related web pages",,RelateIdentifier,,,,,,,,,,,,,,,,,,,,, +schema:Person,givenName,Text,"Given name. In the U.S., the first name of a Person. This can be used along with familyName instead of the name property",,givenName,,,,,,,,givenName,,,,,,,,,,,,,person.given-names +schema:Person,familyName,Text,"Family name. In the U.S., the last name of an Person. This can be used along with givenName instead of the name property.",,familyName,,,,,,,,familyName,,,,,,,,,,,,,person.name-particle + person.family-names + person.name-suffix +schema:Person,email,Text,Email address,email,,,,,,,,,email,,author_email,Author-email,,email-address,author.email,,,email,email,,,person.email/entity.email +schema:Person,affiliation,Text,"An organization that this person is affiliated with. For example, a school/university",affiliation,affiliation,,affiliation,,,,,,,,,,,,,,,,,,,person.affiliation +schema:Person,identifier,URL,"URL identifer, ideally an ORCID ID for individuals, a FundRef ID for funders",identifier,nameIdentifier,,ORCID,,ORCID,,,,,,,,,,,,,,,,,person.orcid / entity.orcid +schema:Person,name,Text,"The name of an Organization, or if separate given and family names cannot be resolved for a Person",,,,name,,name,,,,,,,,,author:contact-name,author.name,,,,,,,entity.name +schema:Person,address,PostalAddress or Text,Physical address of the item.,,,,,,,,,,,,,,,,,,,,,,,person.address + person.city + person.region + person.post-code + person.country / entity.address + entity.city + entity.region + entity.post-code + entity.country +schema,type,Object Type (from context or URI),"The object type (e.g. ""Person"", ""Organization"", ""ScientificArticle"", ""SoftwareApplication"", etc).",,,,,,,,,,,,,,,,,,,,,,,reference.type +schema,id,URL,Primary identifier for an object. Must be a resolvable URL or a string used to refer to this node elsewhere in the same document,,,,,,,,,,,,,,,,,,,,,,, +codemeta:SoftwareSourceCode,softwareSuggestions,SoftwareSourceCode,"Optional dependencies , e.g. 
for optional features, code development, etc",suggests,,,,,,,,,Suggests,,,,,,devDependencies / optionalDependencies,,BuildDepends,add_development_dependency,,,, +codemeta:SoftwareSourceCode,maintainer,Person,Individual responsible for maintaining the software (usually includes an email contact address),uploadedBy,,,,,,,,,Maintainer,,maintainer / maintainer_email,,,,,,,,,maintainer,, +codemeta:SoftwareSourceCode,contIntegration,URL,link to continuous integration service,contIntegration,,,,,,,,,,,,,,,,ciManagement,,,,,, +codemeta:SoftwareSourceCode,buildInstructions,URL,link to installation instructions/documentation,buildInstructions,,,,,,,,,,,,,,,,,,,,,, +codemeta:SoftwareSourceCode,developmentStatus,Text,"Description of development status, e.g. Active, inactive, supsended. See repostatus.org",developmentStatus,,activeDevelopment,,,,,,,,,classifiers['Development Status'],,Development Status,release_status,,,,,,,, +codemeta:SoftwareSourceCode,embargoDate,Date,"Software may be embargoed from public access until a specified date (e.g. pending publication, 1 year from publication)",embargoDate,,,,,embargo_date,,,,,,,,,,,,,,,,, +codemeta:SoftwareSourceCode,funding,Text,Funding source (e.g. 
specific grant),funding,,fundingReference.awardTitle or fundingReference.awardNumber,,,,,,,,,,,,,,,,,,,, +codemeta:SoftwareSourceCode,issueTracker,URL,link to software bug reporting or issue tracking system,issueTracker,,,,issues_url,,,,,BugReports,,,,,resources.bugtracker,bugs,issuesManagement,Problems,,,bug-database,bug tracking system,repository +codemeta:SoftwareSourceCode,referencePublication,ScholarlyArticle,An academic publication related to the software.,relatedPublications,,,,,,,,,,,,,,,,,,,,blog,,references +codemeta:SoftwareSourceCode,readme,URL,link to software Readme file,readme,,,,,,,,,,,,,,,,,,,,,, +,,,,relatedIdentifer,,,,,,,,,,,,,,,,,,,,,, +,,,,relatedIdentiferType,,,,,,,,,,,,,,,,,,,,,, +,,,,relationshipType,,,,,,,,,,,,,,,,,,,,,, +,,,,title,,,,,,,,,,,,,,,,,,,,,, +,,,,namespace,,,,,,,,,,,,,,,,,,,,,, +,,,,role,,,,,,,,,,,,,,,,,,,,,, +,,,,roleCode,,,,,,,,,,,,,,,,,,,,,, +,,,,softwarePaperCitationIdenifiers,,,,,,,,,,,,,,,,,,,,,, diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py index 58e341f..3a5cefb 100644 --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,185 +1,192 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess from swh.model import hashutil from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer def compute_license(path, log=None): """Determine license from file at path. 
Args: path: filepath to determine the license Returns: - A dict with the following keys: + dict: A dict with the following keys: + - licenses ([str]): associated detected licenses to path - path (bytes): content filepath """ try: properties = subprocess.check_output(['nomossa', path], universal_newlines=True) if properties: res = properties.rstrip().split(' contains license(s) ') licenses = res[1].split(',') else: licenses = [] return { 'licenses': licenses, 'path': path, } except subprocess.CalledProcessError: if log: from os import path as __path log.exception('Problem during license detection for sha1 %s' % __path.basename(path)) return { 'licenses': [], 'path': path, } class MixinFossologyLicenseIndexer: """Mixin fossology license indexer. - See :class:`ContentFossologyLicenseIndexer` and + See :class:`FossologyLicenseIndexer` and :class:`FossologyLicenseRangeIndexer` """ ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.fossology.license'), 'tools': ('dict', { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/fossology_license' def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.tool = self.tools[0] def compute_license(self, path, log=None): """Determine license from file at path. Args: path: filepath to determine the license Returns: - A dict with the following keys: + dict: A dict with the following keys: + - licenses ([str]): associated detected licenses to path - path (bytes): content filepath """ return compute_license(path, log=log) def index(self, id, data): """Index sha1s' content and store result. 
Args: id (bytes): content's identifier raw_content (bytes): associated raw content to content id Returns: - A dict, representing a content_license, with keys: - - id (bytes): content's identifier (sha1) - - license (bytes): license in bytes - - path (bytes): path - - indexer_configuration_id (int): tool used to compute the output + dict: A dict, representing a content_license, with keys: + + - id (bytes): content's identifier (sha1) + - license (bytes): license in bytes + - path (bytes): path + - indexer_configuration_id (int): tool used to compute the output """ + assert isinstance(id, bytes) content_path = self.write_to_temp( filename=hashutil.hash_to_hex(id), # use the id as pathname data=data) try: properties = self.compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) finally: self.cleanup(content_path) return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_license, dict with the - following keys: + following keys: + - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path + policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_fossology_license_add( results, conflict_update=(policy_update == 'update-dups')) -class ContentFossologyLicenseIndexer( +class FossologyLicenseIndexer( MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer): """Indexer in charge of: + - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {license, encoding} from that content - store result in storage """ def filter(self, ids): """Filter out known sha1s and return only missing ones. 
""" yield from self.idx_storage.content_fossology_license_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class FossologyLicenseRangeIndexer( MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer): """FossologyLicense Range Indexer working on range of content identifiers. - It: - filters out the non textual content - (optionally) filters out content already indexed (cf - :func:`indexed_contents_in_range`) + :meth:`.indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. - Args - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier + Args: + start (bytes): Starting bound from range identifier + end (bytes): End range identifier Returns: - a dict with keys: + dict: a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. 
- **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any + this sha1 if any """ return self.idx_storage.content_fossology_license_get_range( start, end, self.tool['id']) diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 65946b5..30ce4ad 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,610 +1,621 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc +import ast import os import logging import shutil import tempfile import datetime from copy import deepcopy from swh.scheduler import get_scheduler from swh.storage import get_storage from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY from swh.model import hashutil from swh.core import utils class DiskIndexer: """Mixin intended to be used with other SomethingIndexer classes. Indexers inheriting from this class are a category of indexers which needs the disk for their computations. Note: This expects `self.working_directory` variable defined at runtime. """ def write_to_temp(self, filename, data): """Write the sha1's content in a temporary file. Args: filename (str): one of sha1's many filenames data (bytes): the sha1's content to write in temporary - file + file Returns: The path to the temporary file created. That file is filled in with the raw content's data. """ os.makedirs(self.working_directory, exist_ok=True) temp_dir = tempfile.mkdtemp(dir=self.working_directory) content_path = os.path.join(temp_dir, filename) with open(content_path, 'wb') as f: f.write(data) return content_path def cleanup(self, content_path): """Remove content_path from working directory. 
Args: content_path (str): the file to remove """ temp_dir = os.path.dirname(content_path) shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement indexing: - :func:`run`: + :meth:`~BaseIndexer.run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, :class:`OriginIndexer`. Then you need to implement the following functions: - :func:`filter`: + :meth:`~BaseIndexer.filter`: filter out data already indexed (in storage). - :func:`index_object`: + :meth:`~BaseIndexer.index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. - :func:`persist_index_computations`: + :meth:`~BaseIndexer.persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: - :func:`prepare`: + :meth:`~BaseIndexer.prepare`: Configuration preparation for the indexer. When overriding, this must call the `super().prepare()` instruction. - :func:`check`: + :meth:`~BaseIndexer.check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. - :func:`register_tools`: + :meth:`~BaseIndexer.register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. 
""" CONFIG = 'indexer/base' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5007/' } }), 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/', } }), 'objstorage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5003/', } }) } ADDITIONAL_CONFIG = {} def __init__(self): """Prepare and check that the indexer is ready to run. """ super().__init__() self.prepare() self.check() def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ # HACK to deal with edge case (e.g revision metadata indexer) if not hasattr(self, 'config'): self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) config_storage = self.config.get('storage') if config_storage: self.storage = get_storage(**config_storage) objstorage = self.config['objstorage'] self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) idx_storage = self.config[INDEXER_CFG_KEY] self.idx_storage = get_indexer_storage(**idx_storage) _log = logging.getLogger('requests.packages.urllib3.connectionpool') _log.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') self.tools = list(self.register_tools(self.config['tools'])) def check(self, *, check_tools=True): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ if check_tools and not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. """ return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. 
Args: tools (dict/[dict]): Either a dict or a list of dict. Returns: - List of dict with additional id key. + list: List of dicts with additional id key. Raises: - ValueError if not a list nor a dict. + ValueError: if not a list nor a dict. """ if isinstance(tools, list): tools = list(map(self._prepare_tool, tools)) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') if tools: return self.idx_storage.indexer_configuration_add(tools) else: return [] @abc.abstractmethod def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on - object type + object type Returns: - a dict that makes sense for the persist_index_computations - function. + dict: a dict that makes sense for the + :meth:`.persist_index_computations` method. """ pass @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the - result of the index function. + result of the index function. policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore - them + respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results, task): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned - by index function. + by index function. task (dict): a dict in the form expected by - `scheduler.backend.SchedulerBackend.create_tasks` - without `next_run`, plus an optional `result_name` key. + `scheduler.backend.SchedulerBackend.create_tasks` + without `next_run`, plus an optional `result_name` key. 
Returns: None """ if task: if getattr(self, 'scheduler', None): scheduler = self.scheduler else: scheduler = get_scheduler(**self.config['scheduler']) task = deepcopy(task) result_name = task.pop('result_name', None) task['next_run'] = datetime.datetime.now() if result_name: task['arguments']['kwargs'][result_name] = self.results scheduler.create_tasks([task]) @abc.abstractmethod def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them next_step (dict): a dict in the form expected by - `scheduler.backend.SchedulerBackend.create_tasks` - without `next_run`, plus a `result_name` key. + `scheduler.backend.SchedulerBackend.create_tasks` + without `next_run`, plus a `result_name` key. **kwargs: passed to the `index` method """ pass class ContentIndexer(BaseIndexer): """A content indexer working on a list of ids directly. To work on indexer range, use the :class:`ContentRangeIndexer` instead. Note: :class:`ContentIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def filter(self, ids): """Filter missing ids for that particular indexer. 
Args: ids ([bytes]): list of ids Yields: iterator of missing ids """ pass def run(self, ids, policy_update, next_step=None, **kwargs): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: - ids ([bytes]): sha1's identifier list + ids (Iterable[Union[bytes, str]]): sha1's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them next_step (dict): a dict in the form expected by `scheduler.backend.SchedulerBackend.create_tasks` without `next_run`, plus an optional `result_name` key. **kwargs: passed to the `index` method """ + ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ + for id_ in ids] results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) except Exception: self.log.exception( 'Problem when reading contents metadata.') class ContentRangeIndexer(BaseIndexer): """A content range indexer. This expects as input a range of ids to index. To work on a list of ids, use the :class:`ContentIndexer` instead. Note: :class:`ContentRangeIndexer` is not an instantiable object. To use it, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ @abc.abstractmethod def indexed_contents_in_range(self, start, end): """Retrieve indexed contents within range [start, end]. 
- Args - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier + Args: + start (bytes): Starting bound from range identifier + end (bytes): End range identifier Yields: - Content identifier (bytes) present in the range [start, end] + bytes: Content identifier present in the range ``[start, end]`` """ pass def _list_contents_to_index(self, start, end, indexed): """Compute from storage the new contents to index in the range [start, end]. The already indexed contents are skipped. Args: - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier - **indexed** (Set[bytes]): Set of content already indexed. + start (bytes): Starting bound from range identifier + end (bytes): End range identifier + indexed (Set[bytes]): Set of content already indexed. Yields: - Identifier (bytes) of contents to index. + bytes: Identifier of contents to index. """ + if not isinstance(start, bytes) or not isinstance(end, bytes): + raise TypeError('identifiers must be bytes, not %r and %r.' % + (start, end)) while start: result = self.storage.content_get_range(start, end) contents = result['contents'] for c in contents: - _id = c['sha1'] + _id = hashutil.hash_to_bytes(c['sha1']) if _id in indexed: continue yield _id start = result['next'] def _index_contents(self, start, end, indexed, **kwargs): """Index the contents from within range [start, end] Args: - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier - **indexed** (Set[bytes]): Set of content already indexed. + start (bytes): Starting bound from range identifier + end (bytes): End range identifier + indexed (Set[bytes]): Set of content already indexed. 
 Yields: - Data indexed (dict) to persist using the indexer storage + dict: Data indexed to persist using the indexer storage """ for sha1 in self._list_contents_to_index(start, end, indexed): try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content, **kwargs) if res: + if not isinstance(res['id'], bytes): + raise TypeError( + '%r.index should return ids as bytes, not %r' % + (self.__class__.__name__, res['id'])) yield res def _index_with_skipping_already_done(self, start, end): """Index not already indexed contents in range [start, end]. Args: - **start** (Union[bytes, str]): Starting range identifier - **end** (Union[bytes, str]): Ending range identifier + start (Union[bytes, str]): Starting range identifier + end (Union[bytes, str]): Ending range identifier Yields: - Content identifier (bytes) present in the range [start, - end] which are not already indexed. + bytes: Content identifier present in the range + ``[start, end]`` which are not already indexed. """ while start: indexed_page = self.indexed_contents_in_range(start, end) contents = indexed_page['ids'] _end = contents[-1] if contents else end yield from self._index_contents( start, _end, contents) start = indexed_page['next'] def run(self, start, end, skip_existing=True, **kwargs): """Given a range of content ids, compute the indexing computations on the contents within. Either the indexer is incremental (filter out existing computed data) or not (compute everything from scratch). 
Args: - **start** (Union[bytes, str]): Starting range identifier - **end** (Union[bytes, str]): Ending range identifier - **skip_existing** (bool): Skip existing indexed data - (default) or not + start (Union[bytes, str]): Starting range identifier + end (Union[bytes, str]): Ending range identifier + skip_existing (bool): Skip existing indexed data + (default) or not **kwargs: passed to the `index` method Returns: - a boolean. True if data was indexed, False otherwise. + bool: True if data was indexed, False otherwise. """ with_indexed_data = False try: if isinstance(start, str): start = hashutil.hash_to_bytes(start) if isinstance(end, str): end = hashutil.hash_to_bytes(end) if skip_existing: gen = self._index_with_skipping_already_done(start, end) else: gen = self._index_contents(start, end, indexed=[]) for results in utils.grouper(gen, n=self.config['write_batch_size']): self.persist_index_computations( results, policy_update='update-dups') with_indexed_data = True except Exception: self.log.exception( 'Problem when computing metadata.') finally: return with_indexed_data class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method Note: the :class:`OriginIndexer` is not an instantiable object. To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update='update-dups', parse_ids=True, next_step=None, **kwargs): """Given a list of origin ids: - retrieve origins from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or - (type, url) tuples. + (type, url) tuples. 
policy_update (str): either 'update-dups' or 'ignore-dups' to - respectively update duplicates (default) - or ignore them + respectively update duplicates (default) or ignore them next_step (dict): a dict in the form expected by - `scheduler.backend.SchedulerBackend.create_tasks` - without `next_run`, plus an optional `result_name` key. + `scheduler.backend.SchedulerBackend.create_tasks` without + `next_run`, plus an optional `result_name` key. parse_ids (bool): Do we need to parse id or not (default) **kwargs: passed to the `index` method """ if parse_ids: ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id for o in ids] results = [] for id_ in ids: + if isinstance(id_, str): + # Data coming from JSON, which requires string keys, so + # one extra level of deserialization is needed + id_ = ast.literal_eval(id_) if isinstance(id_, (tuple, list)): if len(id_) != 2: raise TypeError('Expected a (type, url) tuple.') (type_, url) = id_ params = {'type': type_, 'url': url} elif isinstance(id_, int): params = {'id': id_} else: raise TypeError('Invalid value in "ids": %r' % id_) origin = self.storage.origin_get(params) if not origin: - self.log.warning('Origins %s not found in storage' % - list(ids)) + self.log.warning('Origin %s not found in storage' % + list(id_)) continue try: res = self.index(origin, **kwargs) if origin: # If no results, skip it results.append(res) except Exception: self.log.exception( - 'Problem when processing origin %s' % id_) + 'Problem when processing origin %s' % (id_,)) self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Revision indexing using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. 
To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update, next_step=None): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes or str]): sha1_git's identifier list policy_update (str): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore - them + respectively update duplicates or ignore them """ results = [] ids = [hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warning('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) self.results = results return self.next_step(results, task=next_step) diff --git a/swh/indexer/language.py b/swh/indexer/language.py index 5ac61ec..5076de8 100644 --- a/swh/indexer/language.py +++ b/swh/indexer/language.py @@ -1,209 +1,209 @@ -# Copyright (C) 2016-2017 The Software Heritage developers +# Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io from pygments.lexers import guess_lexer from pygments.util import ClassNotFound from chardet.universaldetector import UniversalDetector from .indexer import ContentIndexer def _cleanup_classname(classname): """Determine the language from the pygments' lexer names. """ return classname.lower().replace(' ', '-') def _read_raw(raw_content, size=2048): """Read raw content in chunk. 
""" bs = io.BytesIO(raw_content) while True: chunk = bs.read(size) if not chunk: break yield chunk def _detect_encoding(raw_content): """Given a raw content, try and detect its encoding. """ detector = UniversalDetector() for chunk in _read_raw(raw_content): detector.feed(chunk) if detector.done: break detector.close() return detector.result['encoding'] def compute_language_from_chunk(encoding, length, raw_content, max_size, log=None): """Determine the raw content's language. Args: encoding (str): Encoding to use to decode the content length (int): raw_content's length raw_content (bytes): raw content to work with max_size (int): max size to split the raw content at Returns: - Dict with keys: - - lang: None if nothing found or the possible language + dict: Dict with keys: + - **lang**: None if nothing found or the possible language """ try: if max_size <= length: raw_content = raw_content[0:max_size] content = raw_content.decode(encoding) lang = _cleanup_classname( guess_lexer(content).name) except ClassNotFound: lang = None except UnicodeDecodeError: raise except Exception: if log: log.exception('Problem during language detection, skipping') lang = None return { 'lang': lang } def compute_language(raw_content, encoding=None, log=None): """Determine the raw content's language. 
Args: raw_content (bytes): raw content to work with Returns: - Dict with keys: - - lang: None if nothing found or the possible language + dict: Dict with keys: + - **lang**: None if nothing found or the possible language """ try: encoding = _detect_encoding(raw_content) content = raw_content.decode(encoding) lang = _cleanup_classname( guess_lexer(content).name) except ClassNotFound: lang = None except Exception: if log: log.exception('Problem during language detection, skipping') lang = None return { 'lang': lang } -class ContentLanguageIndexer(ContentIndexer): +class LanguageIndexer(ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {mimetype, encoding} from that content - store result in storage """ CONFIG_BASE_FILENAME = 'indexer/language' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'pygments', 'version': '2.0.1+dfsg-1.1+deb8u1', 'configuration': { 'type': 'library', 'debian-package': 'python3-pygments', 'max_content_size': 10240, }, }), } def prepare(self): super().prepare() c = self.config self.max_content_size = c['tools']['configuration']['max_content_size'] self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_language_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'] } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. 
Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: - A dict, representing a content_mimetype, with keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language + dict: Dict that represents a content_mimetype, with keys: + - id (bytes): content's identifier (sha1) + - lang (bytes): detected language """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'lang': None, } encoding = _detect_encoding(data) if not encoding: return result _len = len(data) for i in range(0, 9): max_size = self.max_content_size + i try: result = compute_language_from_chunk( encoding, _len, data, max_size, log=self.log) except UnicodeDecodeError: self.log.warning( 'Decoding failed on wrong byte chunk at [0-%s]' ', trying again at next ending byte.' % max_size) continue # we found something, so we return it result.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) break return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the - following keys: + following keys: - id (bytes): content's identifier (sha1) - lang (bytes): detected language policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_language_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 84827fc..0b6fcb1 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,337 +1,336 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import itertools import logging from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ # Note: This used when the content metadata indexer is used alone # (not the case for example in the case of the RevisionMetadataIndexer) CONFIG_BASE_FILENAME = 'indexer/content_metadata' def __init__(self, tool, config): # FIXME: Simplify this twisted way to use the exact same # config of RevisionMetadataIndexer object that uses # internally 
ContentMetadataIndexer self.config = config self.config['tools'] = tool self.results = [] super().__init__() self.tool = self.tools[0] # Tool is now registered (cf. prepare call) def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the translated_metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] result['translated_metadata'] = MAPPINGS[mapping_name] \ .translate(data) except Exception: self.log.exception( "Problem during tool retrieval of metadata translation") return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_metadata, dict with the - following keys: + following keys: - id (bytes): content's identifier (sha1) - translated_metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ CONFIG_BASE_FILENAME = 'indexer/revision_metadata' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': ['NpmMapping', 'CodemetaMapping'] }, }), } ContentMetadataIndexer = ContentMetadataIndexer def prepare(self): super().prepare() self.tool = self.tools[0] def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. 
use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (bytes): revision artifact from storage Returns: dict: dictionary representing a revision_metadata, with keys: - - id (str): rev's identifier (sha1_git) - - indexer_configuration_id (bytes): tool used - - translated_metadata: dict of retrieved metadata + - id (str): rev's identifier (sha1_git) + - indexer_configuration_id (bytes): tool used + - translated_metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: root_dir = rev['directory'] dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( detected_files) except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the - following keys: + following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in revision_metadata self.idx_storage.revision_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_metadata(self, detected_files): """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: dict: dict with translated metadata according to the CodeMeta vocabulary """ translated_metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': None }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] } for context in detected_files.keys(): tool['configuration']['context'] = context c_metadata_indexer = self.ContentMetadataIndexer(tool, config) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting translated_metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['translated_metadata'] # local metadata is aggregated if local_metadata: translated_metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # content indexing try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups') # on the fly possibility: for result in 
c_metadata_indexer.results: local_metadata = result['translated_metadata'] translated_metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") # transform translated_metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(translated_metadata) return min_metadata class OriginMetadataIndexer(OriginIndexer): CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata' ADDITIONAL_CONFIG = { 'tools': ('list', []) } def check(self, **kwargs): kwargs['check_tools'] = False super().check(**kwargs) def filter(self, ids): return ids def run(self, origin_head, policy_update): """Expected to be called with the result of RevisionMetadataIndexer as first argument; ie. not a list of ids as other indexers would. Args: - - * `origin_head` (dict): {str(origin_id): rev_id} + origin_head (dict): {str(origin_id): rev_id} keys `origin_id` and `revision_id`, which is the result of OriginHeadIndexer. - * `policy_update`: `'ignore-dups'` or `'update-dups'` + policy_update (str): `'ignore-dups'` or `'update-dups'` """ - origin_head_map = {int(origin_id): hashutil.hash_to_bytes(rev_id) + origin_head_map = {origin_id: hashutil.hash_to_bytes(rev_id) for (origin_id, rev_id) in origin_head.items()} # Fix up the argument order. revisions_metadata has to be the # first argument because of celery.chain; the next line calls # run() with the usual order, ie. origin ids first. return super().run(ids=list(origin_head_map), policy_update=policy_update, parse_ids=False, origin_head_map=origin_head_map) def index(self, origin, *, origin_head_map): # Get the last revision of the origin. 
- revision_id = origin_head_map[origin['id']] + revision_id = origin_head_map[str(origin['id'])] revision_metadata = self.idx_storage \ .revision_metadata_get([revision_id]) results = [] for item in revision_metadata: assert item['id'] == revision_id # Get the metadata of that revision, and return it results.append({ 'origin_id': origin['id'], 'metadata': item['translated_metadata'], 'from_revision': revision_id, 'indexer_configuration_id': item['tool']['id'], }) return results def persist_index_computations(self, results, policy_update): self.idx_storage.origin_intrinsic_metadata_add( list(itertools.chain(*results)), conflict_update=(policy_update == 'update-dups')) @click.command() @click.option('--revs', '-i', help='Default sha1_git to lookup', multiple=True) def main(revs): _git_sha1s = list(map(hashutil.hash_to_bytes, revs)) rev_metadata_indexer = RevisionMetadataIndexer() rev_metadata_indexer.run(_git_sha1s, 'update-dups') if __name__ == '__main__': logging.basicConfig(level=logging.INFO) main() diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py index 629974a..fb7fc3f 100644 --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -1,60 +1,62 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.indexer.codemeta import compact, expand from swh.indexer.codemeta import make_absolute_uri from swh.indexer.metadata_dictionary import MAPPINGS def detect_metadata(files): """ Detects files potentially containing metadata + Args: - - file_entries (list): list of files + file_entries (list): list of files Returns: - - empty list if nothing was found - - dictionary {mapping_filenames[name]:f['sha1']} + dict: {mapping_filenames[name]:f['sha1']} (may be empty) """ results = {} for (mapping_name, mapping) in 
MAPPINGS.items(): matches = mapping.detect_metadata_files(files) if matches: results[mapping_name] = matches return results _MINIMAL_PROPERTY_SET = { "developmentStatus", "version", "operatingSystem", "description", "keywords", "issueTracker", "name", "author", "relatedLink", "url", "license", "maintainer", "email", "identifier", "codeRepository"} MINIMAL_METADATA_SET = {make_absolute_uri(prop) for prop in _MINIMAL_PROPERTY_SET} def extract_minimal_metadata_dict(metadata_list): """ Every item in the metadata_list is a dict of translated_metadata in the - CodeMeta vocabulary - we wish to extract a minimal set of terms and keep all values corresponding - to this term without duplication + CodeMeta vocabulary. + + We wish to extract a minimal set of terms and keep all values corresponding + to this term without duplication. + Args: - - metadata_list (list): list of dicts of translated_metadata + metadata_list (list): list of dicts of translated_metadata Returns: - - minimal_dict (dict): one dict with selected values of metadata + dict: minimal_dict; dict with selected values of metadata """ minimal_dict = {} for document in metadata_list: for metadata_item in expand(document): for (term, value) in metadata_item.items(): if term in MINIMAL_METADATA_SET: if term not in minimal_dict: minimal_dict[term] = [value] elif value not in minimal_dict[term]: minimal_dict[term].append(value) return compact(minimal_dict) diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index b8e01b9..300fa46 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -1,284 +1,405 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import re import abc import json import logging +import email.parser + import xmltodict from 
swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI from swh.indexer.codemeta import compact, expand MAPPINGS = {} def register_mapping(cls): MAPPINGS[cls.__name__] = cls() return cls class BaseMapping(metaclass=abc.ABCMeta): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - override translate function """ def __init__(self): self.log = logging.getLogger('%s.%s' % ( self.__class__.__module__, self.__class__.__name__)) @abc.abstractmethod def detect_metadata_files(self, files): """ Detects files potentially containing metadata + Args: - - file_entries (list): list of files + file_entries (list): list of files Returns: - - empty list if nothing was found - - list of sha1 otherwise + list: list of sha1 (possibly empty) """ pass @abc.abstractmethod def translate(self, file_content): pass def normalize_translation(self, metadata): return compact(metadata) class SingleFileMapping(BaseMapping): """Base class for all mappings that use a single file as input.""" @property @abc.abstractmethod def filename(self): """The .json file to extract metadata from.""" pass def detect_metadata_files(self, file_entries): for entry in file_entries: if entry['name'] == self.filename: return [entry['sha1']] return [] class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly a key-value store (eg. 
a shallow JSON dict).""" @property @abc.abstractmethod def mapping(self): """A translation dict to map dict keys into a canonical name.""" pass def translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object and translating with the appropriate mapping Args: - content_dict (dict) + content_dict (dict): content dict to translate Returns: dict: translated metadata in json-friendly form needed for - the indexer + the indexer """ translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key - translation_method = getattr(self, 'translate_' + k, None) + translation_method = getattr( + self, 'translate_' + k.replace('-', '_'), None) if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: # if there is no method, but the key is known from the # crosswalk table # if there is a normalization method, use it on the value - normalization_method = getattr(self, 'normalize_' + k, None) + normalization_method = getattr( + self, 'normalize_' + k.replace('-', '_'), None) if normalization_method: v = normalization_method(v) # set the translation metadata with the normalized value translated_metadata[self.mapping[k]] = v if normalize: return self.normalize_translation(translated_metadata) else: return translated_metadata class JsonMapping(DictMapping, SingleFileMapping): """Base class for all mappings that use a JSON file as input.""" def translate(self, raw_content): """ Translates content by parsing content from a bytestring containing json data and translating with the appropriate mapping Args: - raw_content: bytes + raw_content (bytes): raw content to translate Returns: dict: translated metadata in json-friendly form needed for - the indexer + the indexer """ try: raw_content = raw_content.decode() except UnicodeDecodeError: self.log.warning('Error unidecoding %r', raw_content) 
return try: content_dict = json.loads(raw_content) except json.JSONDecodeError: self.log.warning('Error unjsoning %r' % raw_content) return return self.translate_dict(content_dict) @register_mapping class NpmMapping(JsonMapping): """ dedicated class for NPM (package.json) mapping and translation """ mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' _schema_shortcuts = { 'github': 'https://github.com/', 'gist': 'https://gist.github.com/', 'bitbucket': 'https://bitbucket.org/', 'gitlab': 'https://gitlab.com/', } def normalize_repository(self, d): """https://docs.npmjs.com/files/package.json#repository""" if isinstance(d, dict): - return '{type}+{url}'.format(**d) + url = '{type}+{url}'.format(**d) elif isinstance(d, str): if '://' in d: - return d + url = d elif ':' in d: (schema, rest) = d.split(':', 1) if schema in self._schema_shortcuts: - return self._schema_shortcuts[schema] + rest + url = self._schema_shortcuts[schema] + rest else: return None else: - return self._schema_shortcuts['github'] + d + url = self._schema_shortcuts['github'] + d else: return None + return {'@id': url} + def normalize_bugs(self, d): - return '{url}'.format(**d) + return {'@id': '{url}'.format(**d)} _parse_author = re.compile(r'^ *' r'(?P.*?)' r'( +<(?P.*)>)?' r'( +\((?P.*)\))?' 
r' *$') def normalize_author(self, d): 'https://docs.npmjs.com/files/package.json' \ '#people-fields-author-contributors' author = {'@type': SCHEMA_URI+'Person'} if isinstance(d, dict): name = d.get('name', None) email = d.get('email', None) url = d.get('url', None) elif isinstance(d, str): match = self._parse_author.match(d) name = match.group('name') email = match.group('email') url = match.group('url') else: return None if name: author[SCHEMA_URI+'name'] = name if email: author[SCHEMA_URI+'email'] = email if url: - author[SCHEMA_URI+'url'] = url - return author + author[SCHEMA_URI+'url'] = {'@id': url} + return {"@list": [author]} + + def normalize_license(self, s): + return {"@id": "https://spdx.org/licenses/" + s} + + def normalize_homepage(self, s): + return {"@id": s} @register_mapping class CodemetaMapping(SingleFileMapping): """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ filename = b'codemeta.json' def translate(self, content): return self.normalize_translation(expand(json.loads(content.decode()))) @register_mapping class MavenMapping(DictMapping, SingleFileMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ filename = b'pom.xml' mapping = CROSSWALK_TABLE['Java (Maven)'] def translate(self, content): d = xmltodict.parse(content)['project'] metadata = self.translate_dict(d, normalize=False) metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) + metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) return self.normalize_translation(metadata) _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} def parse_repositories(self, d): """https://maven.apache.org/pom.html#Repositories""" if 'repositories' not in d: return [self.parse_repository(d, self._default_repository)] else: repositories = d['repositories'].get('repository', []) if not isinstance(repositories, list): repositories = [repositories] results = [] for repo in repositories: res = self.parse_repository(d, repo) if 
res: results.append(res) return results def parse_repository(self, d, repo): if repo.get('layout', 'default') != 'default': return # TODO ? url = repo['url'] if d['groupId']: url = os.path.join(url, *d['groupId'].split('.')) if d['artifactId']: url = os.path.join(url, d['artifactId']) - return url + return {"@id": url} + + def normalize_groupId(self, id_): + return {"@id": id_} + + def parse_licenses(self, d): + """https://maven.apache.org/pom.html#Licenses + + The origin XML has the form: + + + + Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + + + + Which was translated to a dict by xmltodict and is given as `d`: + + >>> d = { + ... # ... + ... "licenses": { + ... "license": { + ... "name": "Apache License, Version 2.0", + ... "url": + ... "https://www.apache.org/licenses/LICENSE-2.0.txt" + ... } + ... } + ... } + >>> MavenMapping().parse_licenses(d) + [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] + + or, if there are more than one license: + + >>> from pprint import pprint + >>> d = { + ... # ... + ... "licenses": { + ... "license": [ + ... { + ... "name": "Apache License, Version 2.0", + ... "url": + ... "https://www.apache.org/licenses/LICENSE-2.0.txt" + ... }, + ... { + ... "name": "MIT License, ", + ... "url": "https://opensource.org/licenses/MIT" + ... } + ... ] + ... } + ... } + >>> pprint(MavenMapping().parse_licenses(d)) + [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, + {'@id': 'https://opensource.org/licenses/MIT'}] + """ + + licenses = d.get('licenses', {}).get('license', []) + if isinstance(licenses, dict): + licenses = [licenses] + return [{"@id": license['url']} for license in licenses] + + +_normalize_pkginfo_key = str.lower + + +@register_mapping +class PythonPkginfoMapping(DictMapping, SingleFileMapping): + """Dedicated class for Python's PKG-INFO mapping and translation. 
+ + https://www.python.org/dev/peps/pep-0314/""" + filename = b'PKG-INFO' + mapping = {_normalize_pkginfo_key(k): v + for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} + + _parser = email.parser.BytesHeaderParser() + + def translate(self, content): + msg = self._parser.parsebytes(content) + d = {} + for (key, value) in msg.items(): + key = _normalize_pkginfo_key(key) + if value != 'UNKNOWN': + d.setdefault(key, []).append(value) + metadata = self.translate_dict(d, normalize=False) + if SCHEMA_URI+'author' in metadata or SCHEMA_URI+'email' in metadata: + metadata[SCHEMA_URI+'author'] = { + '@list': [{ + '@type': SCHEMA_URI+'Person', + SCHEMA_URI+'name': + metadata.pop(SCHEMA_URI+'author', [None])[0], + SCHEMA_URI+'email': + metadata.pop(SCHEMA_URI+'email', [None])[0], + }] + } + return self.normalize_translation(metadata) + + def translate_summary(self, translated_metadata, v): + k = self.mapping['summary'] + translated_metadata.setdefault(k, []).append(v) + + def translate_description(self, translated_metadata, v): + k = self.mapping['description'] + translated_metadata.setdefault(k, []).append(v) + + def normalize_home_page(self, urls): + return [{'@id': url} for url in urls] + + def normalize_license(self, licenses): + return [{'@id': license} for license in licenses] def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" result = MAPPINGS["NpmMapping"].translate(raw_content) result1 = MAPPINGS["MavenMapping"].translate(raw_content1) print(result) print(result1) if __name__ == "__main__": main() diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index 1877644..af957c3 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,153 +1,150 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 
3, or any later version # See top-level LICENSE file for more information import magic from swh.model import hashutil from .indexer import ContentIndexer, ContentRangeIndexer def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: - A dict with mimetype and encoding key and corresponding values + dict: mimetype and encoding key and corresponding values (as bytes). """ r = magic.detect_from_content(raw_content) return { 'mimetype': r.mime_type, 'encoding': r.encoding, } class MixinMimetypeIndexer: """Mixin mimetype indexer. - See :class:`ContentMimetypeIndexer` and :class:`MimetypeRangeIndexer` + See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer` """ ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }), 'write_batch_size': ('int', 1000), } CONFIG_BASE_FILENAME = 'indexer/mimetype' def prepare(self): super().prepare() self.tool = self.tools[0] def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: - A dict, representing a content_mimetype, with keys: + dict: content's mimetype; dict keys being - - id (bytes): content's identifier (sha1) - - mimetype (bytes): mimetype in bytes - - encoding (bytes): encoding in bytes + - **id** (bytes): content's identifier (sha1) + - **mimetype** (bytes): mimetype in bytes + - **encoding** (bytes): encoding in bytes """ try: properties = compute_mimetype_encoding(data) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) except TypeError: self.log.error('Detecting mimetype error for id %s' % ( hashutil.hash_to_hex(id), )) return None return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: - results ([dict]): list of content_mimetype, dict with the - following keys: - - - id (bytes): content's identifier (sha1) - - mimetype (bytes): mimetype in bytes - - encoding (bytes): encoding in bytes + results ([dict]): list of content's mimetype dicts + (see :meth:`.index`) policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them + respectively update duplicates or ignore them """ self.idx_storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) -class ContentMimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): +class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): """Mimetype Indexer working on list of content identifiers. It: - - (optionally) filters out content already indexed (cf. :callable:`filter`) + + - (optionally) filters out content already indexed (cf. + :meth:`.filter`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage - FIXME: - - 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer - - 2. Do we keep it afterwards? ~> i think this can be used with the journal - """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_mimetype_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer): """Mimetype Range Indexer working on range of content identifiers. It: - - (optionally) filters out content already indexed (cf :callable:`range`) + + - (optionally) filters out content already indexed (cf + :meth:`.indexed_contents_in_range`) - reads content from objstorage per the content's id (sha1) - computes {mimetype, encoding} from that content - stores result in storage """ def indexed_contents_in_range(self, start, end): """Retrieve indexed content id within range [start, end]. 
- Args - **start** (bytes): Starting bound from range identifier - **end** (bytes): End range identifier + Args: + start (bytes): Starting bound from range identifier + end (bytes): End range identifier Returns: - a dict with keys: + dict: a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any + this sha1 if any """ return self.idx_storage.content_mimetype_get_range( start, end, self.tool['id']) diff --git a/swh/indexer/rehash.py b/swh/indexer/rehash.py index d2697e0..b01b326 100644 --- a/swh/indexer/rehash.py +++ b/swh/indexer/rehash.py @@ -1,172 +1,172 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import itertools from collections import defaultdict from swh.core import utils from swh.core.config import SWHConfig from swh.model import hashutil from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.storage import get_storage class RecomputeChecksums(SWHConfig): """Class in charge of (re)computing content's hashes. Hashes to compute are defined across 2 configuration options: compute_checksums ([str]) list of hash algorithms that py:func:`swh.model.hashutil.MultiHash.from_data` function should be able to deal with. For variable-length checksums, a desired checksum length should also be provided. Their format is : e.g: blake2:512 recompute_checksums (bool) a boolean to notify that we also want to recompute potential existing hashes specified in compute_checksums. Default to False. 
""" DEFAULT_CONFIG = { # The storage to read from or update metadata to 'storage': ('dict', { 'cls': 'remote', 'args': { 'url': 'http://localhost:5002/' }, }), # The objstorage to read contents' data from 'objstorage': ('dict', { 'cls': 'pathslicing', 'args': { 'root': '/srv/softwareheritage/objects', 'slicing': '0:2/2:4/4:6', }, }), # the set of checksums that should be computed. # Examples: 'sha1_git', 'blake2b512', 'blake2s256' 'compute_checksums': ( 'list[str]', []), # whether checksums that already exist in the DB should be # recomputed/updated or left untouched 'recompute_checksums': ('bool', False), # Number of contents to retrieve blobs at the same time 'batch_size_retrieve_content': ('int', 10), # Number of contents to update at the same time 'batch_size_update': ('int', 100), } CONFIG_BASE_FILENAME = 'indexer/rehash' def __init__(self): self.config = self.parse_config_file() self.storage = get_storage(**self.config['storage']) self.objstorage = get_objstorage(**self.config['objstorage']) self.compute_checksums = self.config['compute_checksums'] self.recompute_checksums = self.config[ 'recompute_checksums'] self.batch_size_retrieve_content = self.config[ 'batch_size_retrieve_content'] self.batch_size_update = self.config[ 'batch_size_update'] self.log = logging.getLogger('swh.indexer.rehash') if not self.compute_checksums: raise ValueError('Checksums list should not be empty.') def _read_content_ids(self, contents): """Read the content identifiers from the contents. """ for c in contents: h = c['sha1'] if isinstance(h, str): h = hashutil.hash_to_bytes(h) yield h def get_new_contents_metadata(self, all_contents): """Retrieve raw contents and compute new checksums on the contents. Unknown or corrupted contents are skipped. 
Args: all_contents ([dict]): List of contents as dictionary with - the necessary primary keys + the necessary primary keys checksum_algorithms ([str]): List of checksums to compute Yields: - tuple of: content to update, list of checksums computed + tuple: tuple of (content to update, list of checksums computed) """ content_ids = self._read_content_ids(all_contents) for contents in utils.grouper(content_ids, self.batch_size_retrieve_content): contents_iter = itertools.tee(contents, 2) try: content_metadata = self.storage.content_get_metadata( [s for s in contents_iter[0]]) except Exception: self.log.exception( 'Problem when reading contents metadata.') continue for content in content_metadata: if self.recompute_checksums: # Recompute checksums provided # in compute_checksums options checksums_to_compute = list(self.compute_checksums) else: # Compute checksums provided in compute_checksums # options not already defined for that content checksums_to_compute = [h for h in self.compute_checksums if not content.get(h)] if not checksums_to_compute: # Nothing to recompute continue try: raw_content = self.objstorage.get(content['sha1']) except ObjNotFoundError: self.log.warning('Content %s not found in objstorage!' % content['sha1']) continue content_hashes = hashutil.MultiHash.from_data( raw_content, hash_names=checksums_to_compute).digest() content.update(content_hashes) yield content, checksums_to_compute def run(self, contents): """Given a list of content: - (re)compute a given set of checksums on contents available in our object storage - update those contents with the new metadata Args: contents (dict): contents as dictionary with necessary keys. key present in such dictionary should be the ones defined in the 'primary_key' option. 
""" for data in utils.grouper( self.get_new_contents_metadata(contents), self.batch_size_update): groups = defaultdict(list) for content, keys_to_update in data: keys = ','.join(keys_to_update) groups[keys].append(content) for keys_to_update, contents in groups.items(): keys = keys_to_update.split(',') try: self.storage.content_update(contents, keys=keys) except Exception: self.log.exception('Problem during update.') continue diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index 7f4e9eb..2548a8c 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,746 +1,745 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import psycopg2 from collections import defaultdict from swh.core.api import remote_api_endpoint from swh.storage.common import db_transaction_generator, db_transaction from swh.storage.exc import StorageDBError from .db import Db from . import converters INDEXER_CFG_KEY = 'indexer_storage' def get_indexer_storage(cls, args): """Get an indexer storage object of class `storage_class` with arguments `storage_args`. Args: cls (str): storage's class, either 'local' or 'remote' args (dict): dictionary of arguments passed to the storage class constructor Returns: an instance of swh.indexer's storage (either local or remote) Raises: ValueError if passed an unknown storage class. """ if cls == 'remote': from .api.client import RemoteStorage as IndexerStorage elif cls == 'local': from . 
import IndexerStorage elif cls == 'memory': from .in_memory import IndexerStorage else: raise ValueError('Unknown indexer storage class `%s`' % cls) return IndexerStorage(**args) class IndexerStorage: """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) @remote_api_endpoint('check_config') def check_config(self, *, check_write): """Check that the storage is configured and ready to go.""" # Check permissions on one of the tables with self.get_db().transaction() as cur: if check_write: check = 'INSERT' else: check = 'SELECT' cur.execute( "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa (check,) ) return cur.fetchone()[0] return True @remote_api_endpoint('content_mimetype/missing') @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): """Generate mimetypes missing from storage. Args: mimetypes (iterable): iterable of dict with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: tuple (id, indexer_configuration_id): missing id """ for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] def _content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, db=None, cur=None): """Retrieve ids of type content_type within range [start, end] bound by limit. Args: **content_type** (str): content's type (mimetype, language, etc...) 
**start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) **with_textual_data** (bool): Deal with only textual content (True) or all content (all contents by defaults, False) Raises: ValueError for; - limit to None - wrong content_type provided Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ if limit is None: raise ValueError('Development error: limit should not be None') if content_type not in db.content_indexer_names: err = 'Development error: Wrong type. Should be one of [%s]' % ( ','.join(db.content_indexer_names)) raise ValueError(err) ids = [] next_id = None for counter, obj in enumerate(db.content_get_range( content_type, start, end, indexer_configuration_id, limit=limit+1, with_textual_data=with_textual_data, cur=cur)): _id = obj[0] if counter >= limit: next_id = _id break ids.append(_id) return { 'ids': ids, 'next': next_id } @remote_api_endpoint('content_mimetype/range') @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): """Retrieve mimetypes within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. 
- **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self._content_get_range('mimetype', start, end, indexer_configuration_id, limit=limit, db=db, cur=cur) @remote_api_endpoint('content_mimetype/add') @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): """Add mimetypes not present in storage. Args: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **indexer_configuration_id** (int): tool's id used to compute the results - **conflict_update** (bool): Flag to determine if we want to overwrite (``True``) or skip duplicates (``False``, the default) """ db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], cur) db.content_mimetype_add_from_temp(conflict_update, cur) @remote_api_endpoint('content_mimetype') @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): """Retrieve full content mimetype per ids. Args: ids (iterable): sha1 identifier Yields: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **tool** (dict): Tool used to compute the language """ for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @remote_api_endpoint('content_language/missing') @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): """List languages missing from storage. 
Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] @remote_api_endpoint('content_language') @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): """Retrieve full content language per ids. Args: ids (iterable): sha1 identifier Yields: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **lang** (bytes): raw content's language - **tool** (dict): Tool used to compute the language """ for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @remote_api_endpoint('content_language/add') @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): """Add languages not present in storage. Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 - **lang** (bytes): language detected conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ({ 'id': l['id'], 'lang': 'unknown' if not l['lang'] else l['lang'], 'indexer_configuration_id': l['indexer_configuration_id'], } for l in languages), 'tmp_content_language', ['id', 'lang', 'indexer_configuration_id'], cur) db.content_language_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/ctags/missing') @db_transaction_generator() def content_ctags_missing(self, ctags, db=None, cur=None): """List ctags missing from storage. 
Args: ctags (iterable): dicts with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] @remote_api_endpoint('content/ctags') @db_transaction_generator() def content_ctags_get(self, ids, db=None, cur=None): """Retrieve ctags per id. Args: ids (iterable): sha1 checksums Yields: Dictionaries with keys: - **id** (bytes): content's identifier - **name** (str): symbol's name - **kind** (str): symbol's kind - - **language** (str): language for that content + - **lang** (str): language for that content - **tool** (dict): tool used to compute the ctags' info """ for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) @remote_api_endpoint('content/ctags/add') @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): """Add ctags not present in storage Args: ctags (iterable): dictionaries with keys: - **id** (bytes): sha1 - **ctags** ([list): List of dictionary with keys: name, kind, - line, language + line, lang """ def _convert_ctags(__ctags): """Convert ctags dict to list of ctags. """ for ctags in __ctags: yield from converters.ctags_to_db(ctags) db.mktemp_content_ctags(cur) db.copy_to(list(_convert_ctags(ctags)), tblname='tmp_content_ctags', columns=['id', 'name', 'kind', 'line', 'lang', 'indexer_configuration_id'], cur=cur) db.content_ctags_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/ctags/search') @db_transaction_generator() def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): """Search through content's raw ctags symbols. Args: expression (str): Expression to search for limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). 
Yields: rows of ctags including id, name, lang, kind, line, etc... """ for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) @remote_api_endpoint('content/fossology_license') @db_transaction_generator() def content_fossology_license_get(self, ids, db=None, cur=None): """Retrieve licenses per id. Args: ids (iterable): sha1 checksums Yields: - list: dictionaries with the following keys: + `{id: facts}` where `facts` is a dict with the following keys: - - **id** (bytes) - **licenses** ([str]): associated licenses for that content - **tool** (dict): Tool used to compute the license """ d = defaultdict(list) for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) id_ = license['id'] d[id_].append(converters.db_to_fossology_license(license)) for id_, facts in d.items(): yield {id_: facts} @remote_api_endpoint('content/fossology_license/add') @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): """Add licenses not present in storage. 
Args: licenses (iterable): dictionaries with keys: - **id**: sha1 - - **license** ([bytes]): List of licenses associated to sha1 + - **licenses** ([bytes]): List of licenses associated to sha1 - **tool** (str): nomossa conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) Returns: list: content_license entries which failed due to unknown licenses """ # Then, we add the correct ones db.mktemp_content_fossology_license(cur) db.copy_to( ({ 'id': sha1['id'], 'indexer_configuration_id': sha1['indexer_configuration_id'], 'license': license, } for sha1 in licenses for license in sha1['licenses']), tblname='tmp_content_fossology_license', columns=['id', 'license', 'indexer_configuration_id'], cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) @remote_api_endpoint('content/fossology_license/range') @db_transaction() def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): """Retrieve licenses within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ return self._content_get_range('fossology_license', start, end, indexer_configuration_id, limit=limit, with_textual_data=True, db=db, cur=cur) @remote_api_endpoint('content_metadata/missing') @db_transaction_generator() def content_metadata_missing(self, metadata, db=None, cur=None): """List metadata missing from storage. 
Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for obj in db.content_metadata_missing_from_list(metadata, cur): yield obj[0] @remote_api_endpoint('content_metadata') @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): """Retrieve metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: id (bytes) translated_metadata (str): associated metadata tool (dict): tool used to compute metadata """ for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) @remote_api_endpoint('content_metadata/add') @db_transaction() def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1 - **translated_metadata**: arbitrary dict conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_content_metadata(cur) db.copy_to(metadata, 'tmp_content_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('revision_metadata/missing') @db_transaction_generator() def revision_metadata_missing(self, metadata, db=None, cur=None): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1_git revision identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing ids """ for obj in db.revision_metadata_missing_from_list(metadata, cur): yield obj[0] @remote_api_endpoint('revision_metadata') @db_transaction_generator() def revision_metadata_get(self, ids, db=None, cur=None): """Retrieve revision metadata per id. 
Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - **id** (bytes) - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata """ for c in db.revision_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_metadata_cols, c))) @remote_api_endpoint('revision_metadata/add') @db_transaction() def revision_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_revision_metadata(cur) db.copy_to(metadata, 'tmp_revision_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.revision_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('origin_intrinsic_metadata') @db_transaction_generator() def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): """Retrieve origin metadata per id. Args: ids (iterable): origin identifiers Yields: list: dictionaries with the following keys: - - **id** (int) - - **translated_metadata** (str): associated metadata + - **origin_id** (int) + - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata """ for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('origin_intrinsic_metadata/add') @db_transaction() def origin_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): """Add origin metadata not present in storage. 
Args: metadata (iterable): dictionaries with keys: - **origin_id**: origin identifier - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', ['origin_id', 'metadata', 'indexer_configuration_id', 'from_revision'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') @db_transaction_generator() def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100, db=None, cur=None): """Returns the list of origins whose metadata contain all the terms. Args: conjunction (List[str]): List of terms to be searched for. limit (int): The maximum number of results to return Yields: list: dictionaries with the following keys: - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata """ for c in db.origin_intrinsic_metadata_search_fulltext( conjunction, limit=limit, cur=cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @remote_api_endpoint('indexer_configuration/add') @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to insert in the db. Dictionary with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: List of dict inserted in the db (holding the id key as well). The order of the list is not guaranteed to match the order of the initial list. 
""" db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], cur) tools = db.indexer_configuration_add_from_temp(cur) for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) @remote_api_endpoint('indexer_configuration/data') @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: The same dictionary with an `id` key, None otherwise. """ tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get(tool['tool_name'], tool['tool_version'], tool_conf) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py index 14a358a..8a90dad 100644 --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -1,75 +1,75 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import click from swh.core import config from swh.core.api import (SWHServerAPIApp, error_handler, encode_data_server as encode_data) from swh.indexer.storage import ( get_indexer_storage, INDEXER_CFG_KEY, IndexerStorage ) DEFAULT_CONFIG_PATH = 'storage/indexer' DEFAULT_CONFIG = { INDEXER_CFG_KEY: ('dict', { 'cls': 'local', 'args': { 'db': 'dbname=softwareheritage-indexer-dev', }, }) } def get_storage(): global storage if not storage: storage = get_indexer_storage(**app.config[INDEXER_CFG_KEY]) return storage app = 
SWHServerAPIApp(__name__, backend_class=IndexerStorage, backend_factory=get_storage) storage = None @app.errorhandler(Exception) def my_error_handler(exception): return error_handler(exception, encode_data) @app.route('/') def index(): return 'SWH Indexer Storage API server' def run_from_webserver(environ, start_response, config_path=DEFAULT_CONFIG_PATH): """Run the WSGI app from the webserver, loading the configuration.""" cfg = config.load_named_config(config_path, DEFAULT_CONFIG) app.config.update(cfg) handler = logging.StreamHandler() app.logger.addHandler(handler) return app(environ, start_response) @click.command() +@click.argument('config-path', required=1) @click.option('--host', default='0.0.0.0', help="Host to run the server") @click.option('--port', default=5007, type=click.INT, help="Binding port of the server") @click.option('--debug/--nodebug', default=True, help="Indicates if the server should run in debug mode") -def launch(host, port, debug): - cfg = config.load_named_config(DEFAULT_CONFIG_PATH, DEFAULT_CONFIG) - app.config.update(cfg) +def launch(config_path, host, port, debug): + app.config.update(config.read(config_path, DEFAULT_CONFIG)) app.run(host, port=int(port), debug=bool(debug)) if __name__ == '__main__': launch() diff --git a/swh/indexer/storage/converters.py b/swh/indexer/storage/converters.py index 65859fc..177dd53 100644 --- a/swh/indexer/storage/converters.py +++ b/swh/indexer/storage/converters.py @@ -1,138 +1,140 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def ctags_to_db(ctags): """Convert a ctags entry into a ready ctags entry. 
Args: ctags (dict): ctags entry with the following keys: - id (bytes): content's identifier - tool_id (int): tool id used to compute ctags - ctags ([dict]): List of dictionary with the following keys: - name (str): symbol's name - kind (str): symbol's kind - line (int): symbol's line in the content - language (str): language Returns: list: list of ctags entries as dicts with the following keys: - - id (bytes): content's identifier - - name (str): symbol's name - - kind (str): symbol's kind - - language (str): language for that content - - tool_id (int): tool id used to compute ctags + - id (bytes): content's identifier + - name (str): symbol's name + - kind (str): symbol's kind + - language (str): language for that content + - tool_id (int): tool id used to compute ctags """ id = ctags['id'] tool_id = ctags['indexer_configuration_id'] for ctag in ctags['ctags']: yield { 'id': id, 'name': ctag['name'], 'kind': ctag['kind'], 'line': ctag['line'], 'lang': ctag['lang'], 'indexer_configuration_id': tool_id, } def db_to_ctags(ctag): """Convert a ctags entry into a ready ctags entry. 
Args: ctags (dict): ctags entry with the following keys: - - id (bytes): content's identifier - - ctags ([dict]): List of dictionary with the following keys: - - name (str): symbol's name - - kind (str): symbol's kind - - line (int): symbol's line in the content - - language (str): language + + - id (bytes): content's identifier + - ctags ([dict]): List of dictionary with the following keys: + - name (str): symbol's name + - kind (str): symbol's kind + - line (int): symbol's line in the content + - language (str): language Returns: - List of ctags ready entry (dict with the following keys): + list: list of ctags ready entry (dict with the following keys): + - id (bytes): content's identifier - name (str): symbol's name - kind (str): symbol's kind - language (str): language for that content - tool (dict): tool used to compute the ctags """ return { 'id': ctag['id'], 'name': ctag['name'], 'kind': ctag['kind'], 'line': ctag['line'], 'lang': ctag['lang'], 'tool': { 'id': ctag['tool_id'], 'name': ctag['tool_name'], 'version': ctag['tool_version'], 'configuration': ctag['tool_configuration'] } } def db_to_mimetype(mimetype): """Convert a ctags entry into a ready ctags output. """ return { 'id': mimetype['id'], 'encoding': mimetype['encoding'], 'mimetype': mimetype['mimetype'], 'tool': { 'id': mimetype['tool_id'], 'name': mimetype['tool_name'], 'version': mimetype['tool_version'], 'configuration': mimetype['tool_configuration'] } } def db_to_language(language): """Convert a language entry into a ready language output. """ return { 'id': language['id'], 'lang': language['lang'], 'tool': { 'id': language['tool_id'], 'name': language['tool_name'], 'version': language['tool_version'], 'configuration': language['tool_configuration'] } } def db_to_metadata(metadata): """Convert a metadata entry into a ready metadata output. 
""" metadata['tool'] = { 'id': metadata['tool_id'], 'name': metadata['tool_name'], 'version': metadata['tool_version'], 'configuration': metadata['tool_configuration'] } del metadata['tool_id'], metadata['tool_configuration'] del metadata['tool_version'], metadata['tool_name'] return metadata def db_to_fossology_license(license): return { 'licenses': license['licenses'], 'tool': { 'id': license['tool_id'], 'name': license['tool_name'], 'version': license['tool_version'], 'configuration': license['tool_configuration'], } } diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py index 0fea30c..68ee6c9 100644 --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -1,396 +1,397 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.model import hashutil from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes from swh.storage.db import line_to_bytes, execute_values_to_bytes class Db(BaseDb): """Proxy to the SWH Indexer DB, with wrappers around stored procedures """ content_mimetype_hash_keys = ['id', 'indexer_configuration_id'] def _missing_from_list(self, table, data, hash_keys, cur=None): """Read from table the data with hash_keys that are missing. Args: table (str): Table name (e.g content_mimetype, content_language, - etc...) + etc...) data (dict): Dict of data to read from hash_keys ([str]): List of keys to read in the data dict. Yields: The data which is missing from the db. 
""" cur = self._cursor(cur) keys = ', '.join(hash_keys) equality = ' AND '.join( ('t.%s = c.%s' % (key, key)) for key in hash_keys ) yield from execute_values_to_bytes( cur, """ select %s from (values %%s) as t(%s) where not exists ( select 1 from %s c where %s ) """ % (keys, keys, table, equality), (tuple(m[k] for k in hash_keys) for m in data) ) def content_mimetype_missing_from_list(self, mimetypes, cur=None): """List missing mimetypes. """ yield from self._missing_from_list( 'content_mimetype', mimetypes, self.content_mimetype_hash_keys, cur=cur) content_mimetype_cols = [ 'id', 'mimetype', 'encoding', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_mimetype') def mktemp_content_mimetype(self, cur=None): pass def content_mimetype_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)", (conflict_update, )) def _convert_key(self, key, main_table='c'): """Convert keys according to specific use in the module. + Args: key (str): Key expression to change according to the alias - used in the query + used in the query main_table (str): Alias to use for the main table. Default - to c for content_{something}. + to c for content_{something}. Expected: Tables content_{something} being aliased as 'c' (something in {language, mimetype, ...}), table indexer_configuration being aliased as 'i'. """ if key == 'id': return '%s.id' % main_table elif key == 'tool_id': return 'i.id as tool_id' elif key == 'licenses': return ''' array(select name from fossology_license where id = ANY( array_agg(%s.license_id))) as licenses''' % main_table return key def _get_from_list(self, table, ids, cols, cur=None, id_col='id'): """Fetches entries from the `table` such that their `id` field (or whatever is given to `id_col`) is in `ids`. Returns the columns `cols`. The `cur`sor is used to connect to the database. 
""" cur = self._cursor(cur) keys = map(self._convert_key, cols) query = """ select {keys} from (values %s) as t(id) inner join {table} c on c.{id_col}=t.id inner join indexer_configuration i on c.indexer_configuration_id=i.id; """.format( keys=', '.join(keys), id_col=id_col, table=table) yield from execute_values_to_bytes( cur, query, ((_id,) for _id in ids) ) content_indexer_names = { 'mimetype': 'content_mimetype', 'fossology_license': 'content_fossology_license', } def content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, cur=None): """Retrieve contents with content_type, within range [start, end] bound by limit and associated to the given indexer configuration id. When asking to work on textual content, that filters on the mimetype table with any mimetype that is not binary. """ cur = self._cursor(cur) table = self.content_indexer_names[content_type] if with_textual_data: extra = """inner join content_mimetype cm on (t.id=cm.id and cm.mimetype like 'text/%%')""" else: extra = "" query = """select t.id from %s t inner join indexer_configuration ic on t.indexer_configuration_id=ic.id %s where ic.id=%%s and %%s <= t.id and t.id <= %%s order by t.indexer_configuration_id, t.id limit %%s""" % (table, extra) cur.execute(query, (indexer_configuration_id, start, end, limit)) yield from cursor_to_bytes(cur) def content_mimetype_get_from_list(self, ids, cur=None): yield from self._get_from_list( 'content_mimetype', ids, self.content_mimetype_cols, cur=cur) content_language_hash_keys = ['id', 'indexer_configuration_id'] def content_language_missing_from_list(self, languages, cur=None): """List missing languages. 
""" yield from self._missing_from_list( 'content_language', languages, self.content_language_hash_keys, cur=cur) content_language_cols = [ 'id', 'lang', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_language') def mktemp_content_language(self, cur=None): pass def content_language_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_language_add(%s)", (conflict_update, )) def content_language_get_from_list(self, ids, cur=None): yield from self._get_from_list( 'content_language', ids, self.content_language_cols, cur=cur) content_ctags_hash_keys = ['id', 'indexer_configuration_id'] def content_ctags_missing_from_list(self, ctags, cur=None): """List missing ctags. """ yield from self._missing_from_list( 'content_ctags', ctags, self.content_ctags_hash_keys, cur=cur) content_ctags_cols = [ 'id', 'name', 'kind', 'line', 'lang', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_ctags') def mktemp_content_ctags(self, cur=None): pass def content_ctags_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)", (conflict_update, )) def content_ctags_get_from_list(self, ids, cur=None): cur = self._cursor(cur) keys = map(self._convert_key, self.content_ctags_cols) yield from execute_values_to_bytes( cur, """ select %s from (values %%s) as t(id) inner join content_ctags c on c.id=t.id inner join indexer_configuration i on c.indexer_configuration_id=i.id order by line """ % ', '.join(keys), ((_id,) for _id in ids) ) def content_ctags_search(self, expression, last_sha1, limit, cur=None): cur = self._cursor(cur) if not last_sha1: query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s)""" % ( ','.join(self.content_ctags_cols)) cur.execute(query, (expression, limit)) else: if last_sha1 and isinstance(last_sha1, bytes): last_sha1 = '\\x%s' % hashutil.hash_to_hex(last_sha1) elif last_sha1: 
last_sha1 = '\\x%s' % last_sha1 query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( ','.join(self.content_ctags_cols)) cur.execute(query, (expression, limit, last_sha1)) yield from cursor_to_bytes(cur) content_fossology_license_cols = [ 'id', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration', 'licenses'] @stored_procedure('swh_mktemp_content_fossology_license') def mktemp_content_fossology_license(self, cur=None): pass def content_fossology_license_add_from_temp(self, conflict_update, cur=None): """Add new licenses per content. """ self._cursor(cur).execute( "SELECT swh_content_fossology_license_add(%s)", (conflict_update, )) def content_fossology_license_get_from_list(self, ids, cur=None): """Retrieve licenses per id. """ cur = self._cursor(cur) keys = map(self._convert_key, self.content_fossology_license_cols) yield from execute_values_to_bytes( cur, """ select %s from (values %%s) as t(id) inner join content_fossology_license c on t.id=c.id inner join indexer_configuration i on i.id=c.indexer_configuration_id group by c.id, i.id, i.tool_name, i.tool_version, i.tool_configuration; """ % ', '.join(keys), ((_id,) for _id in ids) ) content_metadata_hash_keys = ['id', 'indexer_configuration_id'] def content_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata. 
""" yield from self._missing_from_list( 'content_metadata', metadata, self.content_metadata_hash_keys, cur=cur) content_metadata_cols = [ 'id', 'translated_metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_metadata') def mktemp_content_metadata(self, cur=None): pass def content_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", (conflict_update, )) def content_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( 'content_metadata', ids, self.content_metadata_cols, cur=cur) revision_metadata_hash_keys = ['id', 'indexer_configuration_id'] def revision_metadata_missing_from_list(self, metadata, cur=None): """List missing metadata. """ yield from self._missing_from_list( 'revision_metadata', metadata, self.revision_metadata_hash_keys, cur=cur) revision_metadata_cols = [ 'id', 'translated_metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_revision_metadata') def mktemp_revision_metadata(self, cur=None): pass def revision_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", (conflict_update, )) def revision_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) origin_intrinsic_metadata_cols = [ 'origin_id', 'metadata', 'from_revision', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' """The dictionary used to normalize 'metadata' and queries. 'pg_catalog.simple' provides no stopword, so it should be suitable for proper names and non-English content. 
When updating this value, make sure to add a new index on origin_intrinsic_metadata.metadata.""" @stored_procedure('swh_mktemp_origin_intrinsic_metadata') def mktemp_origin_intrinsic_metadata(self, cur=None): pass def origin_intrinsic_metadata_add_from_temp( self, conflict_update, cur=None): cur = self._cursor(cur) cur.execute( "SELECT swh_origin_intrinsic_metadata_add(%s)", (conflict_update, )) def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None): yield from self._get_from_list( 'origin_intrinsic_metadata', orig_ids, self.origin_intrinsic_metadata_cols, cur=cur, id_col='origin_id') def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, cur=None): regconfig = self.origin_intrinsic_metadata_regconfig tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig for _ in terms) tsquery_args = [(term,) for term in terms] keys = map(self._convert_key, self.origin_intrinsic_metadata_cols) query = ("SELECT {keys} FROM origin_intrinsic_metadata AS oim " "INNER JOIN indexer_configuration AS i " "ON oim.indexer_configuration_id=i.id " "JOIN LATERAL (SELECT {tsquery_template}) AS s(tsq) ON true " "WHERE to_tsvector('{regconfig}', metadata) @@ tsq " "ORDER BY ts_rank(oim.metadata_tsvector, tsq, 1) DESC " "LIMIT %s;" ).format(keys=', '.join(keys), regconfig=regconfig, tsquery_template=tsquery_template) cur.execute(query, tsquery_args + [limit]) yield from cursor_to_bytes(cur) indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_indexer_configuration') def mktemp_indexer_configuration(self, cur=None): pass def indexer_configuration_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("SELECT %s from swh_indexer_configuration_add()" % ( ','.join(self.indexer_configuration_cols), )) yield from cursor_to_bytes(cur) def indexer_configuration_get(self, tool_name, tool_version, tool_configuration, cur=None): cur = self._cursor(cur) cur.execute('''select %s from 
indexer_configuration where tool_name=%%s and tool_version=%%s and tool_configuration=%%s''' % ( ','.join(self.indexer_configuration_cols)), (tool_name, tool_version, tool_configuration)) data = cur.fetchone() if not data: return None return line_to_bytes(data) diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py index 1398330..35d63f5 100644 --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -1,249 +1,712 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from collections import defaultdict +import bisect +from collections import defaultdict, Counter +import itertools import json +import operator +import math +import re +SHA1_DIGEST_SIZE = 160 -class MetadataStorage: - """Implements missing/get/add logic for both content_metadata and - revision_metadata.""" + +def _transform_tool(tool): + return { + 'id': tool['id'], + 'name': tool['tool_name'], + 'version': tool['tool_version'], + 'configuration': tool['tool_configuration'], + } + + +class SubStorage: + """Implements common missing/get/add logic for each indexer type.""" def __init__(self, tools): self._tools = tools - self._metadata = {} # map (id_, tool_id) -> metadata_dict + self._sorted_ids = [] + self._data = {} # map (id_, tool_id) -> metadata_dict self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] - def _transform_tool(self, tool): - return { - 'id': tool['id'], - 'name': tool['tool_name'], - 'version': tool['tool_version'], - 'configuration': tool['tool_configuration'], - } - def missing(self, ids): - """List metadata missing from storage. + """List data missing from storage. 
Args: - metadata (iterable): dictionaries with keys: + data (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for id_ in ids: tool_id = id_['indexer_configuration_id'] id_ = id_['id'] if tool_id not in self._tools_per_id.get(id_, set()): yield id_ def get(self, ids): - """Retrieve metadata per id. + """Retrieve data per id. Args: ids (iterable): sha1 checksums Yields: - dictionaries with the following keys: + dict: dictionaries with the following keys: - id (bytes) - translated_metadata (str): associated metadata - tool (dict): tool used to compute metadata + - **id** (bytes) + - **tool** (dict): tool used to compute metadata + - arbitrary data (as provided to `add`) """ for id_ in ids: for tool_id in self._tools_per_id.get(id_, set()): key = (id_, tool_id) yield { 'id': id_, - 'tool': self._transform_tool(self._tools[tool_id]), - 'translated_metadata': self._metadata[key], + 'tool': _transform_tool(self._tools[tool_id]), + **self._data[key], } - def add(self, metadata, conflict_update): - """Add metadata not present in storage. + def get_all(self): + yield from self.get(list(self._tools_per_id)) + + def get_range(self, start, end, indexer_configuration_id, limit): + """Retrieve data within range [start, end] bound by limit. 
Args: - metadata (iterable): dictionaries with keys: + **start** (bytes): Starting identifier range (expected smaller + than end) + **end** (bytes): Ending identifier range (expected larger + than start) + **indexer_configuration_id** (int): The tool used to index data + **limit** (int): Limit result - - **id**: sha1 - - **translated_metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute the - results + Raises: + ValueError for limit to None - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false) + Returns: + a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. + - **next** (Optional[bytes]): The next range of sha1 starts at + this sha1 if any """ - for item in metadata: - tool_id = item['indexer_configuration_id'] - data = item['translated_metadata'] - id_ = item['id'] + if limit is None: + raise ValueError('Development error: limit should not be None') + from_index = bisect.bisect_left(self._sorted_ids, start) + to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index) + if to_index - from_index >= limit: + return { + 'ids': self._sorted_ids[from_index:from_index+limit], + 'next': self._sorted_ids[from_index+limit], + } + else: + return { + 'ids': self._sorted_ids[from_index:to_index], + 'next': None, + } + + def add(self, data, conflict_update): + """Add data not present in storage. 
+ + Args: + data (iterable): dictionaries with keys: + + - **id**: sha1 + - **indexer_configuration_id**: tool used to compute the + results + - arbitrary data + + conflict_update (bool): Flag to determine if we want to overwrite + (true) or skip duplicates (false) + + """ + for item in data: + item = item.copy() + tool_id = item.pop('indexer_configuration_id') + id_ = item.pop('id') + data = item if not conflict_update and \ tool_id in self._tools_per_id.get(id_, set()): # Duplicate, should not be updated continue key = (id_, tool_id) - self._metadata[key] = data + self._data[key] = data self._tools_per_id[id_].add(tool_id) + if id_ not in self._sorted_ids: + bisect.insort(self._sorted_ids, id_) + + def add_merge(self, new_data, conflict_update, merged_key): + for new_item in new_data: + id_ = new_item['id'] + tool_id = new_item['indexer_configuration_id'] + if conflict_update: + all_subitems = [] + else: + existing = list(self.get([id_])) + all_subitems = [ + old_subitem + for existing_item in existing + if existing_item['tool']['id'] == tool_id + for old_subitem in existing_item[merged_key] + ] + for new_subitem in new_item[merged_key]: + if new_subitem not in all_subitems: + all_subitems.append(new_subitem) + self.add([ + { + 'id': id_, + 'indexer_configuration_id': tool_id, + merged_key: all_subitems, + } + ], conflict_update=True) + if id_ not in self._sorted_ids: + bisect.insort(self._sorted_ids, id_) class IndexerStorage: """In-memory SWH indexer storage.""" def __init__(self): self._tools = {} - self._content_metadata = MetadataStorage(self._tools) - self._revision_metadata = MetadataStorage(self._tools) + self._mimetypes = SubStorage(self._tools) + self._languages = SubStorage(self._tools) + self._content_ctags = SubStorage(self._tools) + self._licenses = SubStorage(self._tools) + self._content_metadata = SubStorage(self._tools) + self._revision_metadata = SubStorage(self._tools) + self._origin_intrinsic_metadata = SubStorage(self._tools) - def 
content_metadata_missing(self, metadata): - """List metadata missing from storage. + def content_mimetype_missing(self, mimetypes): + """Generate mimetypes missing from storage. Args: - metadata (iterable): dictionaries with keys: + mimetypes (iterable): iterable of dict with keys: + + - **id** (bytes): sha1 identifier + - **indexer_configuration_id** (int): tool used to compute the + results + + Yields: + tuple (id, indexer_configuration_id): missing id + + """ + yield from self._mimetypes.missing(mimetypes) + + def content_mimetype_get_range( + self, start, end, indexer_configuration_id, limit=1000): + """Retrieve mimetypes within range [start, end] bound by limit. + + Args: + **start** (bytes): Starting identifier range (expected smaller + than end) + **end** (bytes): Ending identifier range (expected larger + than start) + **indexer_configuration_id** (int): The tool used to index data + **limit** (int): Limit result (default to 1000) + + Raises: + ValueError for limit to None + + Returns: + a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. + - **next** (Optional[bytes]): The next range of sha1 starts at + this sha1 if any + + """ + return self._mimetypes.get_range( + start, end, indexer_configuration_id, limit) + + def content_mimetype_add(self, mimetypes, conflict_update=False): + """Add mimetypes not present in storage. 
+ + Args: + mimetypes (iterable): dictionaries with keys: + + - **id** (bytes): sha1 identifier + - **mimetype** (bytes): raw content's mimetype + - **encoding** (bytes): raw content's encoding + - **indexer_configuration_id** (int): tool's id used to + compute the results + - **conflict_update** (bool): Flag to determine if we want to + overwrite (``True``) or skip duplicates (``False``, the + default) + + """ + if not all(isinstance(x['id'], bytes) for x in mimetypes): + raise TypeError('identifiers must be bytes.') + self._mimetypes.add(mimetypes, conflict_update) + + def content_mimetype_get(self, ids, db=None, cur=None): + """Retrieve full content mimetype per ids. + + Args: + ids (iterable): sha1 identifier + + Yields: + mimetypes (iterable): dictionaries with keys: + + - **id** (bytes): sha1 identifier + - **mimetype** (bytes): raw content's mimetype + - **encoding** (bytes): raw content's encoding + - **tool** (dict): Tool used to compute the language + + """ + yield from self._mimetypes.get(ids) + + def content_language_missing(self, languages): + """List languages missing from storage. + + Args: + languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) + + """ + yield from self._languages.missing(languages) + + def content_language_get(self, ids): + """Retrieve full content language per ids. + + Args: + ids (iterable): sha1 identifier + + Yields: + languages (iterable): dictionaries with keys: + + - **id** (bytes): sha1 identifier + - **lang** (bytes): raw content's language + - **tool** (dict): Tool used to compute the language + + """ + yield from self._languages.get(ids) + + def content_language_add(self, languages, conflict_update=False): + """Add languages not present in storage. 
+ + Args: + languages (iterable): dictionaries with keys: + + - **id** (bytes): sha1 + - **lang** (bytes): language detected + + conflict_update (bool): Flag to determine if we want to + overwrite (true) or skip duplicates (false, the + default) + + """ + if not all(isinstance(x['id'], bytes) for x in languages): + raise TypeError('identifiers must be bytes.') + self._languages.add(languages, conflict_update) + + def content_ctags_missing(self, ctags): + """List ctags missing from storage. + + Args: + ctags (iterable): dicts with keys: + + - **id** (bytes): sha1 identifier + - **indexer_configuration_id** (int): tool used to compute + the results + + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) + + """ + yield from self._content_ctags.missing(ctags) + + def content_ctags_get(self, ids): + """Retrieve ctags per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + Dictionaries with keys: + + - **id** (bytes): content's identifier + - **name** (str): symbol's name + - **kind** (str): symbol's kind + - **lang** (str): language for that content + - **tool** (dict): tool used to compute the ctags' info + + + """ + for item in self._content_ctags.get(ids): + for item_ctags_item in item['ctags']: + yield { + 'id': item['id'], + 'tool': item['tool'], + **item_ctags_item + } + + def content_ctags_add(self, ctags, conflict_update=False): + """Add ctags not present in storage + + Args: + ctags (iterable): dictionaries with keys: + + - **id** (bytes): sha1 + - **ctags** ([list): List of dictionary with keys: name, kind, + line, lang + - **indexer_configuration_id**: tool used to compute the + results + + """ + if not all(isinstance(x['id'], bytes) for x in ctags): + raise TypeError('identifiers must be bytes.') + self._content_ctags.add_merge(ctags, conflict_update, 'ctags') + + def content_ctags_search(self, expression, + limit=10, last_sha1=None, db=None, cur=None): + """Search through content's raw ctags symbols. 
+ + Args: + expression (str): Expression to search for + limit (int): Number of rows to return (default to 10). + last_sha1 (str): Offset from which retrieving data (default to ''). + + Yields: + rows of ctags including id, name, lang, kind, line, etc... + + """ + nb_matches = 0 + for ((id_, tool_id), item) in \ + sorted(self._content_ctags._data.items()): + if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): + continue + for ctags_item in item['ctags']: + if ctags_item['name'] != expression: + continue + nb_matches += 1 + yield { + 'id': id_, + 'tool': _transform_tool(self._tools[tool_id]), + **ctags_item + } + if nb_matches >= limit: + return + + def content_fossology_license_get(self, ids): + """Retrieve licenses per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + `{id: facts}` where `facts` is a dict with the following keys: + + - **licenses** ([str]): associated licenses for that content + - **tool** (dict): Tool used to compute the license + + """ + # Rewrites the output of SubStorage.get from the old format to + # the new one. SubStorage.get should be updated once all other + # *_get methods use the new format. + # See: https://forge.softwareheritage.org/T1433 + res = {} + for d in self._licenses.get(ids): + res.setdefault(d.pop('id'), []).append(d) + for (id_, facts) in res.items(): + yield {id_: facts} + + def content_fossology_license_add(self, licenses, conflict_update=False): + """Add licenses not present in storage. 
+ + Args: + licenses (iterable): dictionaries with keys: + + - **id**: sha1 + - **licenses** ([bytes]): List of licenses associated to sha1 + - **tool** (str): nomossa + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + Returns: + list: content_license entries which failed due to unknown licenses + + """ + if not all(isinstance(x['id'], bytes) for x in licenses): + raise TypeError('identifiers must be bytes.') + self._licenses.add_merge(licenses, conflict_update, 'licenses') + + def content_fossology_license_get_range( + self, start, end, indexer_configuration_id, limit=1000): + """Retrieve licenses within range [start, end] bound by limit. + + Args: + **start** (bytes): Starting identifier range (expected smaller + than end) + **end** (bytes): Ending identifier range (expected larger + than start) + **indexer_configuration_id** (int): The tool used to index data + **limit** (int): Limit result (default to 1000) + + Raises: + ValueError for limit to None + + Returns: + a dict with keys: + - **ids** [bytes]: iterable of content ids within the range. + - **next** (Optional[bytes]): The next range of sha1 starts at + this sha1 if any + + """ + return self._licenses.get_range( + start, end, indexer_configuration_id, limit) + + def content_metadata_missing(self, metadata): + """List metadata missing from storage. + + Args: + metadata (iterable): dictionaries with keys: + + - **id** (bytes): sha1 identifier + - **indexer_configuration_id** (int): tool used to compute + the results + Yields: missing sha1s """ yield from self._content_metadata.missing(metadata) def content_metadata_get(self, ids): """Retrieve metadata per id. 
Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - id (bytes) - translated_metadata (str): associated metadata - tool (dict): tool used to compute metadata + - **id** (bytes) + - **translated_metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata """ yield from self._content_metadata.get(ids) def content_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - - **id**: sha1 - - **translated_metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute the - results + - **id**: sha1 + - **translated_metadata**: arbitrary dict + - **indexer_configuration_id**: tool used to compute the + results conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ + if not all(isinstance(x['id'], bytes) for x in metadata): + raise TypeError('identifiers must be bytes.') self._content_metadata.add(metadata, conflict_update) def revision_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - - **id** (bytes): sha1_git revision identifier - - **indexer_configuration_id** (int): tool used to compute - the results + - **id** (bytes): sha1_git revision identifier + - **indexer_configuration_id** (int): tool used to compute + the results Yields: missing ids """ yield from self._revision_metadata.missing(metadata) def revision_metadata_get(self, ids): """Retrieve revision metadata per id. 
Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: - - **id** (bytes) - - **translated_metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata + - **id** (bytes) + - **translated_metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata """ yield from self._revision_metadata.get(ids) def revision_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - - **id**: sha1_git of revision - - **translated_metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata + - **id**: sha1_git of revision + - **translated_metadata**: arbitrary dict + - **indexer_configuration_id**: tool used to compute metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ + if not all(isinstance(x['id'], bytes) for x in metadata): + raise TypeError('identifiers must be bytes.') self._revision_metadata.add(metadata, conflict_update) + def origin_intrinsic_metadata_get(self, ids): + """Retrieve origin metadata per id. + + Args: + ids (iterable): origin identifiers + + Yields: + list: dictionaries with the following keys: + + - **origin_id** (int) + - **translated_metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata + + """ + for item in self._origin_intrinsic_metadata.get(ids): + item['origin_id'] = item.pop('id') + yield item + + def origin_intrinsic_metadata_add(self, metadata, + conflict_update=False): + """Add origin metadata not present in storage. + + Args: + metadata (iterable): dictionaries with keys: + + - **origin_id**: origin identifier + - **from_revision**: sha1 id of the revision used to generate + these metadata. 
+ - **metadata**: arbitrary dict + - **indexer_configuration_id**: tool used to compute metadata + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + + for item in metadata: + item = item.copy() + item['id'] = item.pop('origin_id') + self._origin_intrinsic_metadata.add([item], conflict_update) + + def origin_intrinsic_metadata_search_fulltext( + self, conjunction, limit=100): + """Returns the list of origins whose metadata contain all the terms. + + Args: + conjunction (List[str]): List of terms to be searched for. + limit (int): The maximum number of results to return + + Yields: + list: dictionaries with the following keys: + + - **id** (int) + - **metadata** (str): associated metadata + - **tool** (dict): tool used to compute metadata + + """ + # A very crude fulltext search implementation, but that's enough + # to work on English metadata + tokens_re = re.compile('[a-zA-Z0-9]+') + search_tokens = list(itertools.chain( + *map(tokens_re.findall, conjunction))) + + def rank(data): + # Tokenize the metadata + text = json.dumps(data['metadata']) + text_tokens = tokens_re.findall(text) + text_token_occurences = Counter(text_tokens) + + # Count the number of occurences of search tokens in the text + score = 0 + for search_token in search_tokens: + if text_token_occurences[search_token] == 0: + # Search token is not in the text. 
+ return 0 + score += text_token_occurences[search_token] + + # Normalize according to the text's length + return score / math.log(len(text_tokens)) + + results = [(rank(data), data) + for data in self._origin_intrinsic_metadata.get_all()] + results = [(rank_, data) for (rank_, data) in results if rank_ > 0] + results.sort(key=operator.itemgetter(0), # Don't try to order 'data' + reverse=True) + for (rank_, result) in results[:limit]: + result = result.copy() + result['origin_id'] = result.pop('id') + yield result + def indexer_configuration_add(self, tools): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to - insert in the db. Dictionary with the following keys: + insert in the db. Dictionary with the following keys: - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) + - **tool_name** (str): tool's name + - **tool_version** (str): tool's version + - **tool_configuration** (dict): tool's configuration + (free form dict) Returns: - List of dict inserted in the db (holding the id key as - well). The order of the list is not guaranteed to match + list: List of dict inserted in the db (holding the id key as + well). The order of the list is not guaranteed to match the order of the initial list. """ inserted = [] for tool in tools: tool = tool.copy() id_ = self._tool_key(tool) tool['id'] = id_ self._tools[id_] = tool inserted.append(tool) return inserted def indexer_configuration_get(self, tool): """Retrieve tool information. 
Args: tool (dict): Dictionary representing a tool with the - following keys: + following keys: - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) + - **tool_name** (str): tool's name + - **tool_version** (str): tool's version + - **tool_configuration** (dict): tool's configuration + (free form dict) Returns: The same dictionary with an `id` key, None otherwise. """ return self._tools.get(self._tool_key(tool)) def _tool_key(self, tool): return (tool['tool_name'], tool['tool_version'], json.dumps(tool['tool_configuration'], sort_keys=True)) diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py index 6a6919e..6b7372f 100644 --- a/swh/indexer/tasks.py +++ b/swh/indexer/tasks.py @@ -1,119 +1,119 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.scheduler.task import Task as SchedulerTask -from .mimetype import ContentMimetypeIndexer, MimetypeRangeIndexer -from .language import ContentLanguageIndexer +from .mimetype import MimetypeIndexer, MimetypeRangeIndexer +from .language import LanguageIndexer from .ctags import CtagsIndexer from .fossology_license import ( - ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer + FossologyLicenseIndexer, FossologyLicenseRangeIndexer ) from .rehash import RecomputeChecksums from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer from .origin_head import OriginHeadIndexer logging.basicConfig(level=logging.INFO) class Task(SchedulerTask): """Task whose results is needed for other computations. 
""" def run_task(self, *args, **kwargs): indexer = self.Indexer().run(*args, **kwargs) if hasattr(indexer, 'results'): # indexer tasks return indexer.results return indexer class StatusTask(SchedulerTask): """Task which returns a status either eventful or uneventful. """ def run_task(self, *args, **kwargs): results = self.Indexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} class RevisionMetadata(Task): task_queue = 'swh_indexer_revision_metadata' serializer = 'msgpack' Indexer = RevisionMetadataIndexer class OriginMetadata(Task): task_queue = 'swh_indexer_origin_intrinsic_metadata' Indexer = OriginMetadataIndexer class OriginHead(Task): task_queue = 'swh_indexer_origin_head' Indexer = OriginHeadIndexer class ContentMimetype(StatusTask): """Compute (mimetype, encoding) on a list of sha1s' content. """ task_queue = 'swh_indexer_content_mimetype' - Indexer = ContentMimetypeIndexer + Indexer = MimetypeIndexer class ContentRangeMimetype(StatusTask): """Compute (mimetype, encoding) on a range of sha1s. """ task_queue = 'swh_indexer_content_mimetype_range' Indexer = MimetypeRangeIndexer class ContentLanguage(Task): """Task which computes the language from the sha1's content. """ task_queue = 'swh_indexer_content_language' - Indexer = ContentLanguageIndexer + Indexer = LanguageIndexer class Ctags(Task): """Task which computes ctags from the sha1's content. """ task_queue = 'swh_indexer_content_ctags' Indexer = CtagsIndexer class ContentFossologyLicense(Task): """Compute fossology licenses on a list of sha1s' content. """ task_queue = 'swh_indexer_content_fossology_license' - Indexer = ContentFossologyLicenseIndexer + Indexer = FossologyLicenseIndexer class ContentRangeFossologyLicense(StatusTask): """Compute fossology license on a range of sha1s. """ task_queue = 'swh_indexer_content_fossology_license_range' Indexer = FossologyLicenseRangeIndexer class RecomputeChecksums(Task): """Task which recomputes hashes and possibly new ones. 
""" task_queue = 'swh_indexer_content_rehash' Indexer = RecomputeChecksums diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py index 4dd7af4..8992bff 100644 --- a/swh/indexer/tests/storage/test_in_memory.py +++ b/swh/indexer/tests/storage/test_in_memory.py @@ -1,143 +1,19 @@ from unittest import TestCase -import pytest from .test_storage import CommonTestStorage class IndexerTestInMemoryStorage(CommonTestStorage, TestCase): def setUp(self): self.storage_config = { 'cls': 'memory', 'args': { }, } super().setUp() - @pytest.mark.xfail - def test_check_config(self): - pass - - @pytest.mark.xfail - def test_content_mimetype_missing(self): - pass - - @pytest.mark.xfail - def test_content_mimetype_add__drop_duplicate(self): - pass - - @pytest.mark.xfail - def test_content_mimetype_add__update_in_place_duplicate(self): - pass - - @pytest.mark.xfail - def test_content_mimetype_get(self): - pass - - @pytest.mark.xfail - def test_content_language_missing(self): - pass - - @pytest.mark.xfail - def test_content_language_get(self): - pass - - @pytest.mark.xfail - def test_content_language_add__drop_duplicate(self): - pass - - @pytest.mark.xfail - def test_content_language_add__update_in_place_duplicate(self): - pass - - @pytest.mark.xfail - def test_content_ctags_missing(self): - pass - - @pytest.mark.xfail - def test_content_ctags_get(self): - pass - - @pytest.mark.xfail - def test_content_ctags_search(self): - pass - - @pytest.mark.xfail - def test_content_ctags_search_no_result(self): - pass - - @pytest.mark.xfail - def test_content_ctags_add__add_new_ctags_added(self): - pass - - @pytest.mark.xfail - def test_content_ctags_add__update_in_place(self): - pass - - @pytest.mark.xfail - def test_content_fossology_license_get(self): - pass + def reset_storage_tables(self): + self.storage = self.storage.__class__() - @pytest.mark.xfail - def test_content_fossology_license_add__new_license_added(self): - pass - - @pytest.mark.xfail 
- def test_content_fossology_license_add__update_in_place_duplicate(self): - pass - - @pytest.mark.xfail - def test_origin_intrinsic_metadata_get(self): - pass - - @pytest.mark.xfail - def test_origin_intrinsic_metadata_add_drop_duplicate(self): - pass - - @pytest.mark.xfail - def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self): - pass - - @pytest.mark.xfail - def test_origin_intrinsic_metadata_search_fulltext(self): - pass - - @pytest.mark.xfail - def test_origin_intrinsic_metadata_search_fulltext_rank(self): - pass - - @pytest.mark.xfail - def test_indexer_configuration_metadata_get_missing_context(self): - pass - - @pytest.mark.xfail - def test_indexer_configuration_metadata_get(self): - pass - - @pytest.mark.xfail - def test_generate_content_mimetype_get_range_limit_none(self): - pass - - @pytest.mark.xfail - def test_generate_content_mimetype_get_range_no_limit(self, mimetypes): - pass - - @pytest.mark.xfail - def test_generate_content_mimetype_get_range_limit(self, mimetypes): - pass - - @pytest.mark.xfail - def test_generate_content_fossology_license_get_range_limit_none(self): - pass - - @pytest.mark.xfail - def test_generate_content_fossology_license_get_range_no_limit(self): - pass - - @pytest.mark.xfail - def test_generate_content_fossology_license_get_range_no_limit_with_filter( - self): - pass - - @pytest.mark.xfail - def test_generate_fossology_license_get_range_limit(self): + def test_check_config(self): pass diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index ecd696c..707ecf8 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,2015 +1,2049 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import 
unittest from hypothesis import given from swh.model.hashutil import hash_to_bytes from swh.indexer.storage import get_indexer_storage from swh.core.tests.db_testing import SingleDbTestFixture from swh.indexer.tests.storage.generate_data_test import ( gen_content_mimetypes, gen_content_fossology_licenses ) from swh.indexer.tests.storage import SQL_DIR TOOLS = [ { 'tool_name': 'universal-ctags', 'tool_version': '~git7859817b', 'tool_configuration': { "command_line": "ctags --fields=+lnz --sort=no --links=no " "--output-format=json "} }, { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, }, { 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { "type": "local", "context": ["NpmMapping", "CodemetaMapping"]}, }, { 'tool_name': 'file', 'tool_version': '5.22', 'tool_configuration': {"command_line": "file --mime "}, }, { 'tool_name': 'pygments', 'tool_version': '2.0.1+dfsg-1.1+deb8u1', 'tool_configuration': { "type": "library", "debian-package": "python3-pygments"}, }, { 'tool_name': 'pygments', 'tool_version': '2.0.1+dfsg-1.1+deb8u1', 'tool_configuration': { "type": "library", "debian-package": "python3-pygments", "max_content_size": 10240 }, }, { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } ] @pytest.mark.db class BasePgTestStorage(SingleDbTestFixture): """Base test class for most indexer tests. It adds support for Storage testing to the SingleDbTestFixture class. It will also build the database from the swh-indexed/sql/*.sql files. 
""" TEST_DB_NAME = 'softwareheritage-test-indexer' TEST_DB_DUMP = os.path.join(SQL_DIR, '*.sql') def setUp(self): super().setUp() self.storage_config = { 'cls': 'local', 'args': { 'db': 'dbname=%s' % self.TEST_DB_NAME, }, } def tearDown(self): self.reset_storage_tables() self.storage = None super().tearDown() def reset_storage_tables(self): excluded = {'indexer_configuration'} self.reset_db_tables(self.TEST_DB_NAME, excluded=excluded) db = self.test_db[self.TEST_DB_NAME] db.conn.commit() class CommonTestStorage: """Base class for Indexer Storage testing. """ def setUp(self): super().setUp() self.storage = get_indexer_storage(**self.storage_config) tools = self.storage.indexer_configuration_add(TOOLS) self.tools = {} for tool in tools: tool_name = tool['tool_name'] while tool_name in self.tools: tool_name += '_' self.tools[tool_name] = { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689') self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7') self.revision_id_1 = hash_to_bytes( '7026b7c1a2af56521e951c01ed20f255fa054238') self.revision_id_2 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904321') self.origin_id_1 = 54974445 self.origin_id_2 = 44434342 def test_check_config(self): self.assertTrue(self.storage.check_config(check_write=True)) self.assertTrue(self.storage.check_config(check_write=False)) def test_content_mimetype_missing(self): # given tool_id = self.tools['file']['id'] mimetypes = [ { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }] # when actual_missing = self.storage.content_mimetype_missing(mimetypes) # then self.assertEqual(list(actual_missing), [ self.sha1_1, self.sha1_2, ]) # given self.storage.content_mimetype_add([{ 'id': self.sha1_2, 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': 
tool_id, }]) # when actual_missing = self.storage.content_mimetype_missing(mimetypes) # then self.assertEqual(list(actual_missing), [self.sha1_1]) def test_content_mimetype_add__drop_duplicate(self): # given tool_id = self.tools['file']['id'] mimetype_v1 = { 'id': self.sha1_2, 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': tool_id, } # given self.storage.content_mimetype_add([mimetype_v1]) # when actual_mimetypes = list(self.storage.content_mimetype_get( [self.sha1_2])) # then expected_mimetypes_v1 = [{ 'id': self.sha1_2, 'mimetype': 'text/plain', 'encoding': 'utf-8', 'tool': self.tools['file'], }] self.assertEqual(actual_mimetypes, expected_mimetypes_v1) # given mimetype_v2 = mimetype_v1.copy() mimetype_v2.update({ 'mimetype': 'text/html', 'encoding': 'us-ascii', }) self.storage.content_mimetype_add([mimetype_v2]) actual_mimetypes = list(self.storage.content_mimetype_get( [self.sha1_2])) # mimetype did not change as the v2 was dropped. self.assertEqual(actual_mimetypes, expected_mimetypes_v1) def test_content_mimetype_add__update_in_place_duplicate(self): # given tool_id = self.tools['file']['id'] mimetype_v1 = { 'id': self.sha1_2, 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': tool_id, } # given self.storage.content_mimetype_add([mimetype_v1]) # when actual_mimetypes = list(self.storage.content_mimetype_get( [self.sha1_2])) expected_mimetypes_v1 = [{ 'id': self.sha1_2, 'mimetype': 'text/plain', 'encoding': 'utf-8', 'tool': self.tools['file'], }] # then self.assertEqual(actual_mimetypes, expected_mimetypes_v1) # given mimetype_v2 = mimetype_v1.copy() mimetype_v2.update({ 'mimetype': 'text/html', 'encoding': 'us-ascii', }) self.storage.content_mimetype_add([mimetype_v2], conflict_update=True) actual_mimetypes = list(self.storage.content_mimetype_get( [self.sha1_2])) expected_mimetypes_v2 = [{ 'id': self.sha1_2, 'mimetype': 'text/html', 'encoding': 'us-ascii', 'tool': { 'id': self.tools['file']['id'], 'name': 
'file', 'version': '5.22', 'configuration': {'command_line': 'file --mime '} } }] # mimetype did change as the v2 was used to overwrite v1 self.assertEqual(actual_mimetypes, expected_mimetypes_v2) def test_content_mimetype_get(self): # given tool_id = self.tools['file']['id'] mimetypes = [self.sha1_2, self.sha1_1] mimetype1 = { 'id': self.sha1_2, 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': tool_id, } # when self.storage.content_mimetype_add([mimetype1]) # then actual_mimetypes = list(self.storage.content_mimetype_get(mimetypes)) # then expected_mimetypes = [{ 'id': self.sha1_2, 'mimetype': 'text/plain', 'encoding': 'utf-8', 'tool': self.tools['file'] }] self.assertEqual(actual_mimetypes, expected_mimetypes) def test_content_language_missing(self): # given tool_id = self.tools['pygments']['id'] languages = [ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, } ] # when actual_missing = list(self.storage.content_language_missing(languages)) # then self.assertEqual(list(actual_missing), [ self.sha1_2, self.sha1_1, ]) # given self.storage.content_language_add([{ 'id': self.sha1_2, 'lang': 'haskell', 'indexer_configuration_id': tool_id, }]) # when actual_missing = list(self.storage.content_language_missing(languages)) # then self.assertEqual(actual_missing, [self.sha1_1]) def test_content_language_get(self): # given tool_id = self.tools['pygments']['id'] language1 = { 'id': self.sha1_2, 'lang': 'common-lisp', 'indexer_configuration_id': tool_id, } # when self.storage.content_language_add([language1]) # then actual_languages = list(self.storage.content_language_get( [self.sha1_2, self.sha1_1])) # then expected_languages = [{ 'id': self.sha1_2, 'lang': 'common-lisp', 'tool': self.tools['pygments'] }] self.assertEqual(actual_languages, expected_languages) def test_content_language_add__drop_duplicate(self): # given tool_id = self.tools['pygments']['id'] language_v1 = { 'id': 
self.sha1_2, 'lang': 'emacslisp', 'indexer_configuration_id': tool_id, } # given self.storage.content_language_add([language_v1]) # when actual_languages = list(self.storage.content_language_get( [self.sha1_2])) # then expected_languages_v1 = [{ 'id': self.sha1_2, 'lang': 'emacslisp', 'tool': self.tools['pygments'] }] self.assertEqual(actual_languages, expected_languages_v1) # given language_v2 = language_v1.copy() language_v2.update({ 'lang': 'common-lisp', }) self.storage.content_language_add([language_v2]) actual_languages = list(self.storage.content_language_get( [self.sha1_2])) # language did not change as the v2 was dropped. self.assertEqual(actual_languages, expected_languages_v1) def test_content_language_add__update_in_place_duplicate(self): # given tool_id = self.tools['pygments']['id'] language_v1 = { 'id': self.sha1_2, 'lang': 'common-lisp', 'indexer_configuration_id': tool_id, } # given self.storage.content_language_add([language_v1]) # when actual_languages = list(self.storage.content_language_get( [self.sha1_2])) # then expected_languages_v1 = [{ 'id': self.sha1_2, 'lang': 'common-lisp', 'tool': self.tools['pygments'] }] self.assertEqual(actual_languages, expected_languages_v1) # given language_v2 = language_v1.copy() language_v2.update({ 'lang': 'emacslisp', }) self.storage.content_language_add([language_v2], conflict_update=True) actual_languages = list(self.storage.content_language_get( [self.sha1_2])) # language did not change as the v2 was dropped. 
expected_languages_v2 = [{ 'id': self.sha1_2, 'lang': 'emacslisp', 'tool': self.tools['pygments'] }] # language did change as the v2 was used to overwrite v1 self.assertEqual(actual_languages, expected_languages_v2) def test_content_ctags_missing(self): # given tool_id = self.tools['universal-ctags']['id'] ctags = [ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, } ] # when actual_missing = self.storage.content_ctags_missing(ctags) # then self.assertEqual(list(actual_missing), [ self.sha1_2, self.sha1_1 ]) # given self.storage.content_ctags_add([ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 119, 'lang': 'OCaml', }] }, ]) # when actual_missing = self.storage.content_ctags_missing(ctags) # then self.assertEqual(list(actual_missing), [self.sha1_1]) def test_content_ctags_get(self): # given tool_id = self.tools['universal-ctags']['id'] ctags = [self.sha1_2, self.sha1_1] ctag1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', }] } # when self.storage.content_ctags_add([ctag1]) # then actual_ctags = list(self.storage.content_ctags_get(ctags)) # then expected_ctags = [ { 'id': self.sha1_2, 'tool': self.tools['universal-ctags'], 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'id': self.sha1_2, 'tool': self.tools['universal-ctags'], 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', } ] self.assertEqual(actual_ctags, expected_ctags) def test_content_ctags_search(self): # 1. 
given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag1 = { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }, + { + 'name': 'hello', + 'kind': 'variable', + 'line': 210, + 'lang': 'Python', + }, ] } ctag2 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, + { + 'name': 'result', + 'kind': 'variable', + 'line': 120, + 'lang': 'C', + }, ] } self.storage.content_ctags_add([ctag1, ctag2]) # 1. when actual_ctags = list(self.storage.content_ctags_search('hello', limit=1)) # 1. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', } ]) # 2. when actual_ctags = list(self.storage.content_ctags_search( 'hello', limit=1, last_sha1=ctag1['id'])) # 2. then self.assertEqual(actual_ctags, [ { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', } ]) # 3. when actual_ctags = list(self.storage.content_ctags_search('hello')) # 3. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, + { + 'id': ctag1['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'variable', + 'line': 210, + 'lang': 'Python', + }, { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, ]) # 4. when actual_ctags = list(self.storage.content_ctags_search('counter')) # then self.assertEqual(actual_ctags, [{ 'id': ctag1['id'], 'tool': tool, 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }]) + # 5. 
when + actual_ctags = list(self.storage.content_ctags_search('result', + limit=1)) + + # then + self.assertEqual(actual_ctags, [{ + 'id': ctag2['id'], + 'tool': tool, + 'name': 'result', + 'kind': 'variable', + 'line': 120, + 'lang': 'C', + }]) + def test_content_ctags_search_no_result(self): actual_ctags = list(self.storage.content_ctags_search('counter')) self.assertEqual(actual_ctags, []) def test_content_ctags_add__add_new_ctags_added(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) self.storage.content_ctags_add([ctag_v1]) # conflict does nothing # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [{ 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2]) expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) self.assertEqual(actual_ctags, expected_ctags) def test_content_ctags_add__update_in_place(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [ { 'id': 
self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool } ] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }, { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2], conflict_update=True) actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # ctag did change as the v2 was used to overwrite v1 expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] self.assertEqual(actual_ctags, expected_ctags) def test_content_fossology_license_get(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license1 = { 'id': self.sha1_1, 'licenses': ['GPL-2.0+'], 'indexer_configuration_id': tool_id, } # when self.storage.content_fossology_license_add([license1]) # then actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_2, self.sha1_1])) expected_license = { self.sha1_1: [{ 'licenses': ['GPL-2.0+'], 'tool': tool, }] } # then self.assertEqual(actual_licenses, [expected_license]) def test_content_fossology_license_add__new_license_added(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': self.sha1_1, 'licenses': ['Apache-2.0'], 'indexer_configuration_id': tool_id, } # given self.storage.content_fossology_license_add([license_v1]) # conflict does nothing self.storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # then expected_license = { self.sha1_1: [{ 'licenses': ['Apache-2.0'], 'tool': tool, }] } self.assertEqual(actual_licenses, [expected_license]) # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': 
['BSD-2-Clause'], }) self.storage.content_fossology_license_add([license_v2]) actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) expected_license = { self.sha1_1: [{ 'licenses': ['Apache-2.0', 'BSD-2-Clause'], 'tool': tool }] } # license did not change as the v2 was dropped. self.assertEqual(actual_licenses, [expected_license]) def test_content_fossology_license_add__update_in_place_duplicate(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': self.sha1_1, 'licenses': ['CECILL'], 'indexer_configuration_id': tool_id, } # given self.storage.content_fossology_license_add([license_v1]) # conflict does nothing self.storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # then expected_license = { self.sha1_1: [{ 'licenses': ['CECILL'], 'tool': tool, }] } self.assertEqual(actual_licenses, [expected_license]) # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': ['CECILL-2.0'] }) self.storage.content_fossology_license_add([license_v2], conflict_update=True) actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # license did change as the v2 was used to overwrite v1 expected_license = { self.sha1_1: [{ 'licenses': ['CECILL-2.0'], 'tool': tool, }] } self.assertEqual(actual_licenses, [expected_license]) def test_content_metadata_missing(self): # given tool_id = self.tools['swh-metadata-translator']['id'] metadata = [ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, } ] # when actual_missing = list(self.storage.content_metadata_missing(metadata)) # then self.assertEqual(list(actual_missing), [ self.sha1_2, self.sha1_1, ]) # given self.storage.content_metadata_add([{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 
'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'indexer_configuration_id': tool_id }]) # when actual_missing = list(self.storage.content_metadata_missing(metadata)) # then self.assertEqual(actual_missing, [self.sha1_1]) def test_content_metadata_get(self): # given tool_id = self.tools['swh-metadata-translator']['id'] metadata1 = { 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'indexer_configuration_id': tool_id, } # when self.storage.content_metadata_add([metadata1]) # then actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2, self.sha1_1])) expected_metadata = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'tool': self.tools['swh-metadata-translator'] }] self.assertEqual(actual_metadata, expected_metadata) def test_content_metadata_add_drop_duplicate(self): # given tool_id = self.tools['swh-metadata-translator']['id'] metadata_v1 = { 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'indexer_configuration_id': tool_id, } # given self.storage.content_metadata_add([metadata_v1]) # when actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) expected_metadata_v1 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'tool': self.tools['swh-metadata-translator'] }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'translated_metadata': { 'other': {}, 'name': 'test_drop_duplicated_metadata', 'version': 
'0.0.1' }, }) self.storage.content_metadata_add([metadata_v2]) # then actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # metadata did not change as the v2 was dropped. self.assertEqual(actual_metadata, expected_metadata_v1) def test_content_metadata_add_update_in_place_duplicate(self): # given tool_id = self.tools['swh-metadata-translator']['id'] metadata_v1 = { 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'indexer_configuration_id': tool_id, } # given self.storage.content_metadata_add([metadata_v1]) # when actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # then expected_metadata_v1 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'tool': self.tools['swh-metadata-translator'] }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'translated_metadata': { 'other': {}, 'name': 'test_update_duplicated_metadata', 'version': '0.0.1' }, }) self.storage.content_metadata_add([metadata_v2], conflict_update=True) actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # language did not change as the v2 was dropped. 
expected_metadata_v2 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_update_duplicated_metadata', 'version': '0.0.1' }, 'tool': self.tools['swh-metadata-translator'] }] # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) def test_revision_metadata_missing(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = [ { 'id': self.revision_id_1, 'indexer_configuration_id': tool_id, }, { 'id': self.revision_id_2, 'indexer_configuration_id': tool_id, } ] # when actual_missing = list(self.storage.revision_metadata_missing( metadata)) # then self.assertEqual(list(actual_missing), [ self.revision_id_1, self.revision_id_2, ]) # given self.storage.revision_metadata_add([{ 'id': self.revision_id_1, 'translated_metadata': { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None }, 'indexer_configuration_id': tool_id }]) # when actual_missing = list(self.storage.revision_metadata_missing( metadata)) # then self.assertEqual(actual_missing, [self.revision_id_2]) def test_revision_metadata_get(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_rev = { 'id': self.revision_id_2, 'translated_metadata': { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None }, 'indexer_configuration_id': tool_id } # when self.storage.revision_metadata_add([metadata_rev]) # then actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2, self.revision_id_1])) expected_metadata = 
[{ 'id': self.revision_id_2, 'translated_metadata': metadata_rev['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] self.assertEqual(actual_metadata, expected_metadata) def test_revision_metadata_add_drop_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'id': self.revision_id_1, 'translated_metadata': { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None }, 'indexer_configuration_id': tool_id, } # given self.storage.revision_metadata_add([metadata_v1]) # when actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_1])) expected_metadata_v1 = [{ 'id': self.revision_id_1, 'translated_metadata': metadata_v1['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'translated_metadata': { 'name': 'test_metadata', 'author': 'MG', }, }) self.storage.revision_metadata_add([metadata_v2]) # then actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_1])) # metadata did not change as the v2 was dropped. 
self.assertEqual(actual_metadata, expected_metadata_v1) def test_revision_metadata_add_update_in_place_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'id': self.revision_id_2, 'translated_metadata': { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None }, 'indexer_configuration_id': tool_id, } # given self.storage.revision_metadata_add([metadata_v1]) # when actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2])) # then expected_metadata_v1 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v1['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'translated_metadata': { 'name': 'test_update_duplicated_metadata', 'author': 'MG' }, }) self.storage.revision_metadata_add([metadata_v2], conflict_update=True) actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2])) expected_metadata_v2 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v2['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) def test_origin_intrinsic_metadata_get(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata = { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None, } metadata_rev = { 'id': self.revision_id_2, 
'translated_metadata': metadata, 'indexer_configuration_id': tool_id, } metadata_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([metadata_origin]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, 42])) expected_metadata = [{ 'origin_id': self.origin_id_1, 'metadata': metadata, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, }] self.assertEqual(actual_metadata, expected_metadata) def test_origin_intrinsic_metadata_add_drop_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None } metadata_rev_v1 = { 'id': self.revision_id_1, 'translated_metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'origin_id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } # given self.storage.revision_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1, 42])) expected_metadata_v1 = [{ 'origin_id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = 
metadata_origin_v1.copy() metadata_rev_v2['translated_metadata'] = metadata_v2 metadata_origin_v2['translated_metadata'] = metadata_v2 self.storage.revision_metadata_add([metadata_rev_v2]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v2]) # then actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) # metadata did not change as the v2 was dropped. self.assertEqual(actual_metadata, expected_metadata_v1) def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None } metadata_rev_v1 = { 'id': self.revision_id_2, 'translated_metadata': metadata_v1, 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'origin_id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # given self.storage.revision_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) # then expected_metadata_v1 = [{ 'origin_id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, }] self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_update_duplicated_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['translated_metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 self.storage.revision_metadata_add([metadata_rev_v2], 
conflict_update=True) self.storage.origin_intrinsic_metadata_add([metadata_origin_v2], conflict_update=True) actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) expected_metadata_v2 = [{ 'origin_id': self.origin_id_1, 'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, }] # metadata did change as the v2 was used to overwrite v1 self.assertEqual(actual_metadata, expected_metadata_v2) def test_origin_intrinsic_metadata_search_fulltext(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata1 = { 'author': 'John Doe', } metadata1_rev = { 'id': self.revision_id_1, 'translated_metadata': metadata1, 'indexer_configuration_id': tool_id, } metadata1_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata1, 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } metadata2 = { 'author': 'Jane Doe', } metadata2_rev = { 'id': self.revision_id_2, 'translated_metadata': metadata2, 'indexer_configuration_id': tool_id, } metadata2_origin = { 'origin_id': self.origin_id_2, 'metadata': metadata2, 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertCountEqual( [res['origin_id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( [res['origin_id'] for res in search(['John', 'Doe'])], [self.origin_id_1]) self.assertEqual( [res['origin_id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( [res['origin_id'] for res in search(['John', 'Jane'])], []) def test_origin_intrinsic_metadata_search_fulltext_rank(self): # given tool_id = 
self.tools['swh-metadata-detector']['id'] # The following authors have "Random Person" to add some more content # to the JSON data, to work around normalization quirks when there # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words # for small values of nb_words). metadata1 = { 'author': [ 'Random Person', 'John Doe', 'Jane Doe', ] } metadata1_rev = { 'id': self.revision_id_1, 'translated_metadata': metadata1, 'indexer_configuration_id': tool_id, } metadata1_origin = { 'origin_id': self.origin_id_1, 'metadata': metadata1, 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_1, } metadata2 = { 'author': [ 'Random Person', 'Jane Doe', ] } metadata2_rev = { 'id': self.revision_id_2, 'translated_metadata': metadata2, 'indexer_configuration_id': tool_id, } metadata2_origin = { 'origin_id': self.origin_id_2, 'metadata': metadata2, 'indexer_configuration_id': tool_id, 'from_revision': self.revision_id_2, } # when self.storage.revision_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) self.storage.revision_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertEqual( [res['origin_id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( [res['origin_id'] for res in search(['Doe'], limit=1)], [self.origin_id_1]) self.assertEqual( [res['origin_id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( [res['origin_id'] for res in search(['Jane'])], [self.origin_id_2, self.origin_id_1]) self.assertEqual( [res['origin_id'] for res in search(['John', 'Jane'])], [self.origin_id_1]) def test_indexer_configuration_add(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tool = self.storage.indexer_configuration_get(tool) 
self.assertIsNone(actual_tool) # does not exist # add it actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) actual_tool = actual_tools[0] self.assertIsNotNone(actual_tool) # now it exists new_id = actual_tool.pop('id') self.assertEqual(actual_tool, tool) actual_tools2 = list(self.storage.indexer_configuration_add([tool])) actual_tool2 = actual_tools2[0] self.assertIsNotNone(actual_tool2) # now it exists new_id2 = actual_tool2.pop('id') self.assertEqual(new_id, new_id2) self.assertEqual(actual_tool, actual_tool2) def test_indexer_configuration_add_multiple(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) new_tools = [tool, { 'tool_name': 'yet-another-tool', 'tool_version': 'version', 'tool_configuration': {}, }] actual_tools = list(self.storage.indexer_configuration_add(new_tools)) self.assertEqual(len(actual_tools), 2) # order not guaranteed, so we iterate over results to check for tool in actual_tools: _id = tool.pop('id') self.assertIsNotNone(_id) self.assertIn(tool, new_tools) def test_indexer_configuration_get_missing(self): tool = { 'tool_name': 'unknown-tool', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) def test_indexer_configuration_get(self): tool = { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } self.storage.indexer_configuration_add([tool]) actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() del actual_tool['id'] self.assertEqual(expected_tool, actual_tool) def test_indexer_configuration_metadata_get_missing_context(self): tool = { 'tool_name': 
'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"context": "unknown-context"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) def test_indexer_configuration_metadata_get(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, } self.storage.indexer_configuration_add([tool]) actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() expected_tool['id'] = actual_tool['id'] self.assertEqual(expected_tool, actual_tool) @pytest.mark.property_based def test_generate_content_mimetype_get_range_limit_none(self): """mimetype_get_range call with wrong limit input should fail""" with self.assertRaises(ValueError) as e: self.storage.content_mimetype_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) self.assertEqual(e.exception.args, ( 'Development error: limit should not be None',)) @pytest.mark.property_based @given(gen_content_mimetypes(min_size=1, max_size=4)) def test_generate_content_mimetype_get_range_no_limit(self, mimetypes): """mimetype_get_range returns mimetypes within range provided""" self.reset_storage_tables() # add mimetypes to storage self.storage.content_mimetype_add(mimetypes) # All ids from the db content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = self.storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(mimetypes), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_mimetypes(min_size=4, max_size=4)) def test_generate_content_mimetype_get_range_limit(self, mimetypes): """mimetype_get_range paginates results if 
limit exceeded""" self.reset_storage_tables() # add mimetypes to storage self.storage.content_mimetype_add(mimetypes) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes limited to 3 results limited_results = len(mimetypes) - 1 tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = self.storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(limited_results, len(actual_ids)) self.assertIsNotNone(actual_next) self.assertEqual(actual_next, content_ids[-1]) expected_mimetypes = content_ids[:-1] self.assertEqual(expected_mimetypes, actual_ids) # retrieve next part actual_results2 = self.storage.content_mimetype_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] self.assertIsNone(actual_next2) expected_mimetypes2 = [content_ids[-1]] self.assertEqual(expected_mimetypes2, actual_ids2) @pytest.mark.property_based def test_generate_content_fossology_license_get_range_limit_none(self): """license_get_range call with wrong limit input should fail""" with self.assertRaises(ValueError) as e: self.storage.content_fossology_license_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) self.assertEqual(e.exception.args, ( 'Development error: limit should not be None',)) @pytest.mark.property_based def prepare_mimetypes_from(self, fossology_licenses): """Fossology license needs some consistent data in db to run. 
""" mimetypes = [] for c in fossology_licenses: mimetypes.append({ 'id': c['id'], 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': c['indexer_configuration_id'], }) return mimetypes @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=1, max_size=4)) def test_generate_content_fossology_license_get_range_no_limit( self, fossology_licenses): """license_get_range returns licenses within range provided""" self.reset_storage_tables() # craft some consistent mimetypes mimetypes = self.prepare_mimetypes_from(fossology_licenses) self.storage.content_mimetype_add(mimetypes) # add fossology_licenses to storage self.storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(fossology_licenses), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=1, max_size=4), gen_content_mimetypes(min_size=1, max_size=1)) def test_generate_content_fossology_license_get_range_no_limit_with_filter( self, fossology_licenses, mimetypes): """This filters non textual, then returns results within range""" self.reset_storage_tables() # craft some consistent mimetypes _mimetypes = self.prepare_mimetypes_from(fossology_licenses) # add binary mimetypes which will get filtered out in results for m in mimetypes: _mimetypes.append({ 'mimetype': 'binary', **m, }) self.storage.content_mimetype_add(_mimetypes) # add fossology_licenses to storage self.storage.content_fossology_license_add(fossology_licenses) # All ids from the db 
content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(len(fossology_licenses), len(actual_ids)) self.assertIsNone(actual_next) self.assertEqual(content_ids, actual_ids) @pytest.mark.property_based @given(gen_content_fossology_licenses(min_size=4, max_size=4)) def test_generate_fossology_license_get_range_limit( self, fossology_licenses): """fossology_license_get_range paginates results if limit exceeded""" self.reset_storage_tables() # craft some consistent mimetypes mimetypes = self.prepare_mimetypes_from(fossology_licenses) # add fossology_licenses to storage self.storage.content_mimetype_add(mimetypes) self.storage.content_fossology_license_add(fossology_licenses) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses limited to 3 results limited_results = len(fossology_licenses) - 1 tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = self.storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] self.assertEqual(limited_results, len(actual_ids)) self.assertIsNotNone(actual_next) self.assertEqual(actual_next, content_ids[-1]) expected_fossology_licenses = content_ids[:-1] self.assertEqual(expected_fossology_licenses, actual_ids) # retrieve next part actual_results2 = self.storage.content_fossology_license_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] 
self.assertIsNone(actual_next2) expected_fossology_licenses2 = [content_ids[-1]] self.assertEqual(expected_fossology_licenses2, actual_ids2) @pytest.mark.db class IndexerTestStorage(CommonTestStorage, BasePgTestStorage, unittest.TestCase): """Running the tests locally. For the client api tests (remote storage), see `class`:swh.indexer.storage.test_api_client:TestRemoteStorage class. """ pass diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py index e0ad775..eb1fa17 100644 --- a/swh/indexer/tests/test_ctags.py +++ b/swh/indexer/tests/test_ctags.py @@ -1,152 +1,192 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import json import unittest - from unittest.mock import patch + +import swh.indexer.ctags from swh.indexer.ctags import ( CtagsIndexer, run_ctags ) from swh.indexer.tests.test_utils import ( - BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest, + CommonContentIndexerTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, - SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG + SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG, + OBJ_STORAGE_DATA, fill_storage, fill_obj_storage ) class BasicTest(unittest.TestCase): @patch('swh.indexer.ctags.subprocess') def test_run_ctags(self, mock_subprocess): """Computing licenses from a raw content should return results """ output0 = """ {"name":"defun","kind":"function","line":1,"language":"scheme"} {"name":"name","kind":"symbol","line":5,"language":"else"}""" output1 = """ {"name":"let","kind":"var","line":10,"language":"something"}""" expected_result0 = [ { 'name': 'defun', 'kind': 'function', 'line': 1, 'lang': 'scheme' }, { 'name': 'name', 'kind': 'symbol', 'line': 5, 'lang': 'else' } ] expected_result1 = [ { 'name': 'let', 'kind': 'var', 'line': 10, 'lang': 'something' } ] for path, 
lang, intermediary_result, expected_result in [ (b'some/path', 'lisp', output0, expected_result0), (b'some/path/2', 'markdown', output1, expected_result1) ]: mock_subprocess.check_output.return_value = intermediary_result actual_result = list(run_ctags(path, lang=lang)) self.assertEqual(actual_result, expected_result) class InjectCtagsIndexer: """Override ctags computations. """ def compute_ctags(self, path, lang): """Inject fake ctags given path (sha1 identifier). """ return { 'lang': lang, **SHA1_TO_CTAGS.get(path) } class CtagsIndexerTest(NoDiskIndexer, InjectCtagsIndexer, CtagsIndexer): """Specific language whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no ''' ''' --links=no ''', 'max_content_size': 1000, }, }, 'languages': { 'python': 'python', 'haskell': 'haskell', 'bar': 'bar', }, 'workdir': '/nowhere', } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - self.objstorage = MockObjStorage() - self.tool_config = self.config['tools']['configuration'] - class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase): """Ctags indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ + + legacy_get_format = True + + def get_indexer_results(self, ids): + yield from self.idx_storage.content_ctags_get(ids) + def setUp(self): + super().setUp() self.indexer = CtagsIndexerTest() + self.idx_storage = self.indexer.idx_storage + fill_storage(self.indexer.storage) + fill_obj_storage(self.indexer.objstorage) # Prepare test input self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = 'd4c647f0fc257591cc9ba1722484229780d1c607' self.id2 = '688a5ef812c53907562fe379d4b3851e69c7cb15' - tool_id = self.indexer.tool['id'] + tool = 
{k.replace('tool_', ''): v + for (k, v) in self.indexer.tool.items()} + self.expected_results = { self.id0: { 'id': self.id0, - 'indexer_configuration_id': tool_id, - 'ctags': SHA1_TO_CTAGS[self.id0], + 'tool': tool, + **SHA1_TO_CTAGS[self.id0][0], }, self.id1: { 'id': self.id1, - 'indexer_configuration_id': tool_id, - 'ctags': SHA1_TO_CTAGS[self.id1], + 'tool': tool, + **SHA1_TO_CTAGS[self.id1][0], }, self.id2: { 'id': self.id2, - 'indexer_configuration_id': tool_id, - 'ctags': SHA1_TO_CTAGS[self.id2], + 'tool': tool, + **SHA1_TO_CTAGS[self.id2][0], } } + self._set_mocks() + + def _set_mocks(self): + def find_ctags_for_content(raw_content): + for (sha1, ctags) in SHA1_TO_CTAGS.items(): + if OBJ_STORAGE_DATA[sha1] == raw_content: + return ctags + else: + raise ValueError(('%r not found in objstorage, can\'t mock ' + 'its ctags.') % raw_content) + + def fake_language(raw_content, *args, **kwargs): + ctags = find_ctags_for_content(raw_content) + return {'lang': ctags[0]['lang']} + self._real_compute_language = swh.indexer.ctags.compute_language + swh.indexer.ctags.compute_language = fake_language + + def fake_check_output(cmd, *args, **kwargs): + print(cmd) + id_ = cmd[-1] # when using NoDiskIndexer, path is replaced by id + return '\n'.join( + json.dumps({'language': ctag['lang'], **ctag}) + for ctag in SHA1_TO_CTAGS[id_]) + self._real_check_output = swh.indexer.ctags.subprocess.check_output + swh.indexer.ctags.subprocess.check_output = fake_check_output + + def tearDown(self): + swh.indexer.ctags.compute_language = self._real_compute_language + swh.indexer.ctags.subprocess.check_output = self._real_check_output + super().tearDown() + class CtagsIndexerUnknownToolTestStorage( CommonIndexerNoTool, CtagsIndexerTest): """Fossology license indexer with wrong configuration""" class TestCtagsIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = CtagsIndexerUnknownToolTestStorage 
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py index 1d2fa72..0a61ed3 100644 --- a/swh/indexer/tests/test_fossology_license.py +++ b/swh/indexer/tests/test_fossology_license.py @@ -1,206 +1,194 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest -import logging from unittest.mock import patch from swh.indexer.fossology_license import ( - ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer, + FossologyLicenseIndexer, FossologyLicenseRangeIndexer, compute_license ) from swh.indexer.tests.test_utils import ( - MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer, - BASE_TEST_CONFIG + BASE_TEST_CONFIG, fill_storage, fill_obj_storage ) class BasicTest(unittest.TestCase): @patch('swh.indexer.fossology_license.subprocess') def test_compute_license(self, mock_subprocess): """Computing licenses from a raw content should return results """ for path, intermediary_result, output in [ (b'some/path', None, []), (b'some/path/2', [], []), (b'other/path', ' contains license(s) GPL,AGPL', ['GPL', 'AGPL'])]: mock_subprocess.check_output.return_value = intermediary_result actual_result = compute_license(path, log=None) self.assertEqual(actual_result, { 'licenses': output, 'path': path, }) class InjectLicenseIndexer: """Override license computations. 
""" def compute_license(self, path, log=None): """path is the content identifier """ if isinstance(id, bytes): path = path.decode('utf-8') return { 'licenses': SHA1_TO_LICENSES.get(path) } class FossologyLicenseTestIndexer( - NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer): + NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseIndexer): """Specific fossology license whose configuration is enough to satisfy the indexing checks. """ def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'workdir': '/nowhere', 'tools': { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }, } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - self.log = logging.getLogger('swh.indexer') - self.objstorage = MockObjStorage() - class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): """Language indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ + + def get_indexer_results(self, ids): + yield from self.idx_storage.content_fossology_license_get(ids) + def setUp(self): + super().setUp() self.indexer = FossologyLicenseTestIndexer() + self.idx_storage = self.indexer.idx_storage + fill_storage(self.indexer.storage) + fill_obj_storage(self.indexer.objstorage) self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content - tool_id = self.indexer.tool['id'] + + tool = {k.replace('tool_', ''): v + for (k, v) in self.indexer.tool.items()} # then self.expected_results = { self.id0: { - 'id': self.id0, - 'indexer_configuration_id': tool_id, + 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id0], }, self.id1: { - 'id': self.id1, - 'indexer_configuration_id': tool_id, + 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id1], }, self.id2: { - 'id': self.id2, 
- 'indexer_configuration_id': tool_id, + 'tool': tool, 'licenses': SHA1_TO_LICENSES[self.id2], } } class FossologyLicenseRangeIndexerTest( NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer): """Testing the range indexer on fossology license. """ def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'workdir': '/nowhere', 'tools': { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }, 'write_batch_size': 100, } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - self.log = logging.getLogger('swh.indexer') - # this hardcodes some contents, will use this to setup the storage - self.objstorage = MockObjStorage() - contents = [{'sha1': c_id} for c_id in self.objstorage] - self.storage = BasicMockStorage(contents) - class TestFossologyLicenseRangeIndexer( CommonContentIndexerRangeTest, unittest.TestCase): """Range Fossology License Indexer tests. - new data within range are indexed - no data outside a range are indexed - with filtering existing indexed data prior to compute new index - without filtering existing indexed data prior to compute new index """ def setUp(self): + super().setUp() self.indexer = FossologyLicenseRangeIndexerTest() - # will play along with the objstorage's mocked contents for now - self.contents = sorted(self.indexer.objstorage) - # FIXME: leverage swh.objstorage.in_memory_storage's - # InMemoryObjStorage, swh.storage.tests's gen_contents, and - # hypothesis to generate data to actually run indexer on those + fill_storage(self.indexer.storage) + fill_obj_storage(self.indexer.objstorage) self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'id': self.id0, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id0] }, self.id1: { 
'id': self.id1, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id1] }, self.id2: { 'id': self.id2, 'indexer_configuration_id': tool_id, 'licenses': SHA1_TO_LICENSES[self.id2] } } class FossologyLicenseIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseTestIndexer): """Fossology license indexer with wrong configuration""" class FossologyLicenseRangeIndexerUnknownToolTestStorage( CommonIndexerNoTool, FossologyLicenseRangeIndexerTest): """Fossology license range indexer with wrong configuration""" class TestFossologyLicenseIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = FossologyLicenseIndexerUnknownToolTestStorage RangeIndexer = FossologyLicenseRangeIndexerUnknownToolTestStorage diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py index dbe1e57..4ba2c3e 100644 --- a/swh/indexer/tests/test_language.py +++ b/swh/indexer/tests/test_language.py @@ -1,98 +1,102 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.indexer import language -from swh.indexer.language import ContentLanguageIndexer +from swh.indexer.language import LanguageIndexer from swh.indexer.tests.test_utils import ( - BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest, - CommonIndexerWithErrorsTest, CommonIndexerNoTool, BASE_TEST_CONFIG + CommonContentIndexerTest, CommonIndexerWithErrorsTest, + CommonIndexerNoTool, BASE_TEST_CONFIG, fill_storage, fill_obj_storage ) -class LanguageTestIndexer(ContentLanguageIndexer): +class LanguageTestIndexer(LanguageIndexer): """Specific language whose configuration is enough to satisfy the indexing tests. 
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'pygments', 'version': '2.0.1+dfsg-1.1+deb8u1', 'configuration': { 'type': 'library', 'debian-package': 'python3-pygments', 'max_content_size': 10240, }, } } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - self.objstorage = MockObjStorage() - self.tool_config = self.config['tools']['configuration'] - class Language(unittest.TestCase): """Tests pygments tool for language detection """ def test_compute_language_none(self): # given self.content = "" self.declared_language = { 'lang': None } # when result = language.compute_language(self.content) # then self.assertEqual(self.declared_language, result) class TestLanguageIndexer(CommonContentIndexerTest, unittest.TestCase): """Language indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ + + legacy_get_format = True + + def get_indexer_results(self, ids): + yield from self.indexer.idx_storage.content_language_get(ids) + def setUp(self): self.indexer = LanguageTestIndexer() + fill_storage(self.indexer.storage) + fill_obj_storage(self.indexer.objstorage) self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069' self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6' self.id2 = 'd4c647f0fc257591cc9ba1722484229780d1c607' - tool_id = self.indexer.tool['id'] + + tool = {k.replace('tool_', ''): v + for (k, v) in self.indexer.tool.items()} self.expected_results = { self.id0: { 'id': self.id0, - 'indexer_configuration_id': tool_id, + 'tool': tool, 'lang': 'python', }, self.id1: { 'id': self.id1, - 'indexer_configuration_id': tool_id, + 'tool': tool, 'lang': 'c' }, self.id2: { 'id': self.id2, - 'indexer_configuration_id': tool_id, + 'tool': tool, 'lang': 'text-only' } } class LanguageIndexerUnknownToolTestStorage( CommonIndexerNoTool, LanguageTestIndexer): """Fossology license indexer with wrong configuration""" class 
TestLanguageIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = LanguageIndexerUnknownToolTestStorage diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 91b6b35..85630d9 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,506 +1,657 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.model.hashutil import hash_to_bytes from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS from swh.indexer.metadata_detector import ( detect_metadata, extract_minimal_metadata_dict ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .test_utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' - def prepare(self): - super().prepare() - class RevisionMetadataTestIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. 
""" ContentMetadataIndexer = ContentMetadataTestIndexer def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': TRANSLATOR_TOOL, } - def prepare(self): - super().prepare() - self.tools = list(self.register_tools(self.config['tools'])) - class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 'homepage': 'http://schema.org/url' }) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 
'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', - 'schema:codeRepository': + 'codeRepository': 'git+https://github.com/moranegg/metadata_test', - 'schema:author': { + 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', - }, + }], } # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', - 'schema:codeRepository': + 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', - 'schema:codeRepository': + 'codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', - 'schema:author': 'moranegg', + 'author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], - "schema:author": 'moranegg', - "schema:codeRepository": + "author": ['moranegg'], + "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), 
hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping metadata_indexer = ContentMetadataTestIndexer( tool=TRANSLATOR_TOOL, config=BASE_TEST_CONFIG.copy()) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', - 'schema:codeRepository': + 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5') }, { 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', - 'codemeta:issueTracker': + 'issueTracker': 'https://github.com/npm/npm/issues', - 'schema:author': { + 'author': [{ 'type': 'Person', 'name': 'Isaac Z. 
Schlueter', 'email': 'i@izs.me', - 'schema:url': 'http://blog.izs.me', - }, - 'schema:codeRepository': + 'url': 'http://blog.izs.me', + }], + 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', - 'schema:license': 'Artistic-2.0', + 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], - 'schema:url': 'https://docs.npmjs.com/' + 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') }, { 'translated_metadata': None, 'id': hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", 
"author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": 
"cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } result = MAPPINGS["CodemetaMapping"].translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false + + + Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', - 'schema:identifier': 'com.mycompany.app', + 'identifier': 'com.mycompany.app', 'version': '1.2.3', - 'schema:codeRepository': + 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', + 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', - }) + }) + + def test_compute_metadata_maven_minimal(self): + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + """ + result = MAPPINGS["MavenMapping"].translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'name': 'Maven Default Project', + 'identifier': 'com.mycompany.app', + 'version': '1.2.3', + 'codeRepository': + 
'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', + 'license': [], + }) + + def test_compute_metadata_maven_multiple(self): + '''Tests when there are multiple code repos and licenses.''' + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + + + central + Maven Repository Switchboard + default + http://repo1.maven.org/maven2 + + false + + + + example + Example Maven Repo + default + http://example.org/maven2 + + + + + Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + MIT license + https://opensource.org/licenses/MIT + + + """ + result = MAPPINGS["MavenMapping"].translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'name': 'Maven Default Project', + 'identifier': 'com.mycompany.app', + 'version': '1.2.3', + 'license': [ + 'https://www.apache.org/licenses/LICENSE-2.0.txt', + 'https://opensource.org/licenses/MIT', + ], + 'codeRepository': [ + 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', + 'http://example.org/maven2/com/mycompany/app/my-app', + ] + }) + + def test_compute_metadata_pkginfo(self): + raw_content = (b"""\ +Metadata-Version: 2.1 +Name: swh.core +Version: 0.0.49 +Summary: Software Heritage core utilities +Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ +Author: Software Heritage developers +Author-email: swh-devel@inria.fr +License: UNKNOWN +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Funding, https://www.softwareheritage.org/donate +Project-URL: Source, https://forge.softwareheritage.org/source/swh-core +Description: swh-core + ======== + + core library for swh's modules: + - config parser + - hash computations + - serialization + - logging mechanism + +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Intended Audience :: Developers 
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) +Classifier: Operating System :: OS Independent +Classifier: Development Status :: 5 - Production/Stable +Description-Content-Type: text/markdown +Provides-Extra: testing +""") # noqa + result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) + self.assertCountEqual(result['description'], [ + 'Software Heritage core utilities', # note the comma here + 'swh-core\n' + ' ========\n' + ' \n' + " core library for swh's modules:\n" + ' - config parser\n' + ' - hash computations\n' + ' - serialization\n' + ' - logging mechanism\n' + ' '], + result) + del result['description'] + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'url': 'https://forge.softwareheritage.org/diffusion/DCORE/', + 'name': 'swh.core', + 'author': [{ + 'type': 'Person', + 'name': 'Software Heritage developers', + 'email': 'swh-devel@inria.fr', + }], + 'version': '0.0.49', + }) + + def test_compute_metadata_pkginfo_license(self): + raw_content = (b"""\ +Metadata-Version: 2.1 +Name: foo +License: MIT +""") # noqa + result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'name': 'foo', + 'license': 'MIT', + }) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataTestIndexer() fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', - 'codemeta:issueTracker': + 'issueTracker': 
'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', - 'schema:author': 'Andrew Nesbitt', + 'author': ['Andrew Nesbitt'], 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], - 'schema:codeRepository': + 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list(metadata_indexer.idx_storage.revision_metadata_get( sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', - 'schema:codeRepository': + 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', - 'schema:author': 'Andrew Nesbitt', + 'author': ['Andrew Nesbitt'], 'license': 'AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', - 'codemeta:issueTracker': + 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], }, }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index c621717..5eea4b9 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,192 +1,184 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information 
import unittest -import logging from unittest.mock import patch from swh.indexer.mimetype import ( - ContentMimetypeIndexer, MimetypeRangeIndexer, compute_mimetype_encoding + MimetypeIndexer, MimetypeRangeIndexer, compute_mimetype_encoding ) from swh.indexer.tests.test_utils import ( - MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, CommonContentIndexerTest, CommonContentIndexerRangeTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, - BASE_TEST_CONFIG + BASE_TEST_CONFIG, fill_storage, fill_obj_storage ) class FakeMagicResult: def __init__(self, mimetype, encoding): self.mime_type = mimetype self.encoding = encoding class BasicTest(unittest.TestCase): @patch('swh.indexer.mimetype.magic') def test_compute_mimetype_encoding(self, mock_magic): """Compute mimetype encoding should return results""" for _input, _mimetype, _encoding in [ (b'some-content', 'text/plain', 'utf-8'), (b'raw-content', 'application/json', 'ascii')]: mock_magic.detect_from_content.return_value = FakeMagicResult( _mimetype, _encoding) actual_result = compute_mimetype_encoding(_input) self.assertEqual(actual_result, { 'mimetype': _mimetype, 'encoding': _encoding }) -class MimetypeTestIndexer(ContentMimetypeIndexer): +class MimetypeTestIndexer(MimetypeIndexer): """Specific mimetype indexer instance whose configuration is enough to satisfy the indexing tests. 
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - self.log = logging.getLogger('swh.indexer') - self.objstorage = MockObjStorage() - class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): """Mimetype indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ + legacy_get_format = True + + def get_indexer_results(self, ids): + yield from self.idx_storage.content_mimetype_get(ids) + def setUp(self): self.indexer = MimetypeTestIndexer() + self.idx_storage = self.indexer.idx_storage + fill_storage(self.indexer.storage) + fill_obj_storage(self.indexer.objstorage) self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' - tool_id = self.indexer.tool['id'] + + tool = {k.replace('tool_', ''): v + for (k, v) in self.indexer.tool.items()} + self.expected_results = { self.id0: { 'id': self.id0, - 'indexer_configuration_id': tool_id, + 'tool': tool, 'mimetype': 'text/plain', 'encoding': 'us-ascii', }, self.id1: { 'id': self.id1, - 'indexer_configuration_id': tool_id, + 'tool': tool, 'mimetype': 'text/plain', 'encoding': 'us-ascii', }, self.id2: { 'id': self.id2, - 'indexer_configuration_id': tool_id, + 'tool': tool, 'mimetype': 'application/x-empty', 'encoding': 'binary', } } class MimetypeRangeIndexerTest(MimetypeRangeIndexer): """Specific mimetype whose configuration is enough to satisfy the indexing tests. 
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, 'write_batch_size': 100, } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - # this hardcodes some contents, will use this to setup the storage - self.objstorage = MockObjStorage() - # sync objstorage and storage - contents = [{'sha1': c_id} for c_id in self.objstorage] - self.storage = BasicMockStorage(contents) - class TestMimetypeRangeIndexer( CommonContentIndexerRangeTest, unittest.TestCase): """Range Mimetype Indexer tests. - new data within range are indexed - no data outside a range are indexed - with filtering existing indexed data prior to compute new index - without filtering existing indexed data prior to compute new index """ def setUp(self): + super().setUp() self.indexer = MimetypeRangeIndexerTest() - # will play along with the objstorage's mocked contents for now - self.contents = sorted(self.indexer.objstorage) - # FIXME: leverage swh.objstorage.in_memory_storage's - # InMemoryObjStorage, swh.storage.tests's gen_contents, and - # hypothesis to generate data to actually run indexer on those + fill_storage(self.indexer.storage) + fill_obj_storage(self.indexer.objstorage) self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' tool_id = self.indexer.tool['id'] self.expected_results = { self.id0: { 'encoding': 'us-ascii', 'id': self.id0, 'indexer_configuration_id': tool_id, 'mimetype': 'text/plain'}, self.id1: { 'encoding': 'us-ascii', 'id': self.id1, 'indexer_configuration_id': tool_id, 'mimetype': 'text/x-python'}, self.id2: { 'encoding': 'us-ascii', 'id': self.id2, 'indexer_configuration_id': tool_id, 'mimetype': 'text/plain'} } class MimetypeIndexerUnknownToolTestStorage( CommonIndexerNoTool, 
MimetypeTestIndexer): """Mimetype indexer with wrong configuration""" class MimetypeRangeIndexerUnknownToolTestStorage( CommonIndexerNoTool, MimetypeRangeIndexerTest): """Mimetype range indexer with wrong configuration""" class TestMimetypeIndexersErrors( CommonIndexerWithErrorsTest, unittest.TestCase): """Test the indexer raise the right errors when wrongly initialized""" Indexer = MimetypeIndexerUnknownToolTestStorage RangeIndexer = MimetypeRangeIndexerUnknownToolTestStorage diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index 1558eb3..5252b21 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -1,86 +1,95 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.tests.test_utils import ( - MockIndexerStorage, MockStorage, BASE_TEST_CONFIG + BASE_TEST_CONFIG, fill_storage ) class OriginHeadTestIndexer(OriginHeadIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. 
""" def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'origin-metadata', 'version': '0.0.1', 'configuration': {}, }, 'tasks': { 'revision_metadata': None, 'origin_intrinsic_metadata': None, } } - def prepare(self): - super().prepare() - self.storage = MockStorage() - self.idx_storage = MockIndexerStorage() - def persist_index_computations(self, results, policy_update): self.results = results class OriginHead(unittest.TestCase): + def setUp(self): + self.indexer = OriginHeadTestIndexer() + fill_storage(self.indexer.storage) + + def _get_origin_id(self, type_, url): + origin = self.indexer.storage.origin_get({ + 'type': type_, 'url': url}) + return origin['id'] + def test_git(self): - indexer = OriginHeadTestIndexer() - indexer.run( + self.indexer.run( ['git+https://github.com/SoftwareHeritage/swh-storage']) - self.assertEqual(indexer.results, [{ + origin_id = self._get_origin_id( + 'git', 'https://github.com/SoftwareHeritage/swh-storage') + self.assertEqual(self.indexer.results, [{ 'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{' b'\xd7}\xac\xefrm', - 'origin_id': 52189575}]) + 'origin_id': origin_id}]) def test_ftp(self): - indexer = OriginHeadTestIndexer() - indexer.run( + self.indexer.run( ['ftp+rsync://ftp.gnu.org/gnu/3dldf']) - self.assertEqual(indexer.results, [{ + origin_id = self._get_origin_id( + 'ftp', 'rsync://ftp.gnu.org/gnu/3dldf') + self.assertEqual(self.indexer.results, [{ 'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', - 'origin_id': 4423668}]) + 'origin_id': origin_id}]) def test_deposit(self): - indexer = OriginHeadTestIndexer() - indexer.run( + self.indexer.run( ['deposit+https://forge.softwareheritage.org/source/' 'jesuisgpl/']) - self.assertEqual(indexer.results, [{ + origin_id = self._get_origin_id( + 'deposit', 'https://forge.softwareheritage.org/source/jesuisgpl/') + self.assertEqual(self.indexer.results, [{ 'revision_id': 
b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', - 'origin_id': 77775770}]) + 'origin_id': origin_id}]) def test_pypi(self): - indexer = OriginHeadTestIndexer() - indexer.run( + self.indexer.run( ['pypi+https://pypi.org/project/limnoria/']) - self.assertEqual(indexer.results, [{ + origin_id = self._get_origin_id( + 'pypi', 'https://pypi.org/project/limnoria/') + self.assertEqual(self.indexer.results, [{ 'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', - 'origin_id': 85072327}]) + 'origin_id': origin_id}]) def test_svn(self): - indexer = OriginHeadTestIndexer() - indexer.run( + self.indexer.run( ['svn+http://0-512-md.googlecode.com/svn/']) - self.assertEqual(indexer.results, [{ + origin_id = self._get_origin_id( + 'svn', 'http://0-512-md.googlecode.com/svn/') + self.assertEqual(self.indexer.results, [{ 'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', - 'origin_id': 49908349}]) + 'origin_id': origin_id}]) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index b11665f..5053bd1 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,156 +1,172 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import time import unittest from celery import task from swh.model.hashutil import hash_to_bytes +from swh.storage.in_memory import Storage from swh.indexer.metadata import ( OriginMetadataIndexer, RevisionMetadataIndexer ) +from swh.indexer.storage.in_memory import IndexerStorage +from swh.objstorage.objstorage_in_memory import InMemoryObjStorage + from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture from .test_utils import ( - MockObjStorage, MockStorage, 
MockIndexerStorage, - BASE_TEST_CONFIG + BASE_TEST_CONFIG, fill_storage, fill_obj_storage ) from .test_origin_head import OriginHeadTestIndexer from .test_metadata import ContentMetadataTestIndexer class RevisionMetadataTestIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ ContentMetadataIndexer = ContentMetadataTestIndexer def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } } - def prepare(self): - super().prepare() - self.idx_storage = MockIndexerStorage() - self.storage = MockStorage() - self.objstorage = MockObjStorage() - @task def revision_metadata_test_task(*args, **kwargs): indexer = RevisionMetadataTestIndexer() indexer.run(*args, **kwargs) return indexer.results class OriginMetadataTestIndexer(OriginMetadataIndexer): def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': [] } - def prepare(self): - super().prepare() - self.storage = MockStorage() - self.objstorage = MockObjStorage() - self.idx_storage = MockIndexerStorage() - @task def origin_intrinsic_metadata_test_task(*args, **kwargs): indexer = OriginMetadataTestIndexer() indexer.run(*args, **kwargs) return indexer.results class OriginHeadTestIndexer(OriginHeadTestIndexer): def prepare(self): super().prepare() self.config['tasks'] = { 'revision_metadata': 'revision_metadata_test_task', 'origin_intrinsic_metadata': 'origin_intrinsic_metadata_test_task', } class TestOriginMetadata(SchedulerTestFixture, unittest.TestCase): def setUp(self): super().setUp() self.maxDiff = None - # FIXME: Improve mock indexer storage reset behavior - MockIndexerStorage.added_data = [] - MockIndexerStorage.revision_metadata = {} self.add_scheduler_task_type( 'revision_metadata_test_task', 'swh.indexer.tests.test_origin_metadata.' 
'revision_metadata_test_task') self.add_scheduler_task_type( 'origin_intrinsic_metadata_test_task', 'swh.indexer.tests.test_origin_metadata.' 'origin_intrinsic_metadata_test_task') RevisionMetadataTestIndexer.scheduler = self.scheduler def tearDown(self): del RevisionMetadataTestIndexer.scheduler super().tearDown() - def test_pipeline(self): - indexer = OriginHeadTestIndexer() - indexer.scheduler = self.scheduler - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) - - self.run_ready_tasks() # Run the first task - time.sleep(0.1) # Give it time to complete and schedule the 2nd one - self.run_ready_tasks() # Run the second task + @unittest.mock.patch('swh.indexer.storage.in_memory.IndexerStorage') + @unittest.mock.patch('swh.storage.in_memory.Storage') + def test_pipeline(self, storage_mock, idx_storage_mock): + # Always returns the same instance of the idx storage, because + # this function is called by each of the three indexers. + objstorage = InMemoryObjStorage() + storage = Storage() + idx_storage = IndexerStorage() + + storage_mock.return_value = storage + idx_storage_mock.return_value = idx_storage + + fill_obj_storage(objstorage) + fill_storage(storage) + + # TODO: find a better way to share the ContentMetadataIndexer use + # the same objstorage instance. 
+ import swh.objstorage + old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory'] + swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage + try: + indexer = OriginHeadTestIndexer() + indexer.scheduler = self.scheduler + indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + + self.run_ready_tasks() # Run the first task + # Give it time to complete and schedule the 2nd one + time.sleep(0.1) + self.run_ready_tasks() # Run the second task + finally: + swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage + + origin = storage.origin_get({ + 'type': 'git', + 'url': 'https://github.com/librariesio/yarn-parser'}) + rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', - 'schema:codeRepository': - 'git+https://github.com/librariesio/yarn-parser.git', - 'schema:author': 'Andrew Nesbitt', - 'license': 'AGPL-3.0', + 'codeRepository': + 'git+git+https://github.com/librariesio/yarn-parser.git', + 'author': [{ + 'type': 'Person', + 'name': 'Andrew Nesbitt' + }], + 'license': 'https://spdx.org/licenses/AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', - 'codemeta:issueTracker': + 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], } rev_metadata = { - 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), + 'id': rev_id, 'translated_metadata': metadata, } origin_metadata = { - 'origin_id': 54974445, - 'from_revision': hash_to_bytes( - '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), + 'origin_id': origin['id'], + 'from_revision': rev_id, 'metadata': metadata, } - expected_results = [ - ('revision_metadata', True, [rev_metadata]), - ('origin_intrinsic_metadata', True, [origin_metadata]), - ] - results = list(indexer.idx_storage.added_data) + results = 
list(indexer.idx_storage.revision_metadata_get([rev_id])) for result in results: - metadata = result[2] - for item in metadata: - # cannot check those (generated ids) - del item['indexer_configuration_id'] + del result['tool'] + self.assertEqual(results, [rev_metadata]) - self.assertCountEqual(expected_results, results) + results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ + origin['id']])) + for result in results: + del result['tool'] + self.assertEqual(results, [origin_metadata]) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py index 0358415..8fdb308 100644 --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,806 +1,667 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import abc import datetime +import hashlib +import random -from swh.objstorage.exc import ObjNotFoundError from swh.model import hashutil -from swh.model.hashutil import hash_to_bytes +from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG = { 'storage': { 'cls': 'memory', 'args': { }, }, 'objstorage': { 'cls': 'memory', 'args': { }, }, INDEXER_CFG_KEY: { 'cls': 'memory', 'args': { }, }, } ORIGINS = [ { 'id': 52189575, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, { 'id': 4423668, 'lister': None, 'project': None, 'type': 'ftp', 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, { 'id': 77775770, 'lister': None, 'project': None, 'type': 'deposit', 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, { 'id': 85072327, 'lister': None, 'project': None, 'type': 'pypi', 'url': 'https://pypi.org/project/limnoria/'}, { 'id': 49908349, 'lister': None, 'project': None, 'type': 'svn', 'url': 
'http://0-512-md.googlecode.com/svn/'}, { 'id': 54974445, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}, ] SNAPSHOTS = { 52189575: { 'branches': { b'refs/heads/add-revision-origin-cache': { 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' b's\xe7/\xe9l\x1e', 'target_type': 'revision'}, b'HEAD': { 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' b'\xac\xefrm', 'target_type': 'revision'}, b'refs/tags/v0.0.103': { 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b'\x0f\xdd', 'target_type': 'release'}, }}, 4423668: { 'branches': { b'3DLDF-1.1.4.tar.gz': { 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' b'"G\x99\x11', 'target_type': 'revision'}, b'3DLDF-2.0.2.tar.gz': { 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', 'target_type': 'revision'}, b'3DLDF-2.0.3-examples.tar.gz': { 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' b'\xfe\xadZ\x80\x80\xc1\x83\xff', 'target_type': 'revision'}, b'3DLDF-2.0.3.tar.gz': { 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', 'target_type': 'revision'}, b'3DLDF-2.0.tar.gz': { 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' b'\xd3\xd1m', b'target_type': 'revision'} }}, 77775770: { 'branches': { b'master': { 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', 'target_type': 'revision'} }, 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r "}, 85072327: { 'branches': { b'HEAD': { 'target': b'releases/2018.09.09', 'target_type': 'alias'}, b'releases/2018.09.01': { 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' b'\xbb\xdfF\xfdw\xcf', 'target_type': 'revision'}, b'releases/2018.09.09': { 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', 'target_type': 'revision'}}, 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' b'\x12\x9e\xd6\xb3'}, 49908349: { 'branches': { b'master': { 'target': 
b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', 'target_type': 'revision'}}, 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' b'\x05\xea\xb8\x1f\xc4H\xf4s'}, 54974445: { 'branches': { b'HEAD': { 'target': hash_to_bytes( '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'target_type': 'revision'}}} } REVISIONS = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'committer': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] DIRECTORY_ID = b'10' DIRECTORY = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'sha1': b'cde' }, { 'target': b'11', 'type': 'dir', 'length': None, 'name': b'.github', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None } ] SHA1_TO_LICENSES = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], } SHA1_TO_CTAGS = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{ 'name': 'foo', 'kind': 'str', 'line': 10, 'lang': 'bar', }], 'd4c647f0fc257591cc9ba1722484229780d1c607': [{ 'name': 'let', 'kind': 'int', 'line': 100, 'lang': 'haskell', }], '688a5ef812c53907562fe379d4b3851e69c7cb15': [{ 'name': 'symbol', 'kind': 'float', 'line': 99, 'lang': 'python', }], } OBJ_STORAGE_DATA = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet 
another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging - from swh.indexer.mimetype import ContentMimetypeIndexer + from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. 
Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', + '636465': b""" + { + "name": "yarn-parser", + "version": "1.0.0", + "description": "Tiny web service for parsing yarn.lock files", + "main": "index.js", + "scripts": { + "start": "node index.js", + "test": "mocha" + }, + "engines": { + "node": "9.8.0" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/librariesio/yarn-parser.git" + }, + "keywords": [ + "yarn", + "parse", + "lock", + "dependencies" + ], + "author": "Andrew Nesbitt", + "license": "AGPL-3.0", + "bugs": { + "url": "https://github.com/librariesio/yarn-parser/issues" + }, + "homepage": "https://github.com/librariesio/yarn-parser#readme", + "dependencies": { + "@yarnpkg/lockfile": "^1.0.0", + "body-parser": "^1.15.2", + "express": "^4.14.0" + }, + "devDependencies": { + "chai": "^4.1.2", + "mocha": "^5.2.0", + "request": "^2.87.0", + "test": "^0.6.0" + } + } +""" } CONTENT_METADATA = [{ 'tool': { 'configuration': { 'type': 'local', 'context': 'NpmMapping' }, 'version': '0.0.1', 'id': 6, 'name': 'swh-metadata-translator' }, 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codemeta:issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', 'schema:author': 'Andrew Nesbitt', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 
'schema:codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }] def fill_obj_storage(obj_storage): """Add some content in an object storage.""" for (obj_id, content) in OBJ_STORAGE_DATA.items(): obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) -class MockObjStorage: - """Mock an swh-objstorage objstorage with predefined contents. - - """ - data = {} - - def __init__(self): - self.data = OBJ_STORAGE_DATA.copy() - - def __iter__(self): - yield from self.data.keys() - - def __contains__(self, sha1): - return self.data.get(sha1) is not None - - def get(self, sha1): - raw_content = self.data.get(sha1) - if raw_content is None: - raise ObjNotFoundError(sha1) - return raw_content - - -class MockIndexerStorage(): - """Mock an swh-indexer storage. - - """ - added_data = [] - revision_metadata = {} - tools = {} - - def indexer_configuration_add(self, tools): - results = [] - for tool in tools: - results.append(self._indexer_configuration_add_one(tool)) - return results - - def _indexer_configuration_add_one(self, tool): - if tool['tool_name'] == 'swh-metadata-translator': - tool2 = { - 'id': 30, - 'tool_name': 'swh-metadata-translator', - 'tool_version': '0.0.1', - 'tool_configuration': { - 'type': 'local', - 'context': 'NpmMapping' - }, - } - elif tool['tool_name'] == 'swh-metadata-detector': - tool2 = { - 'id': 7, - 'tool_name': 'swh-metadata-detector', - 'tool_version': '0.0.1', - 'tool_configuration': { - 'type': 'local', - 'context': 'NpmMapping' - }, - } - elif tool['tool_name'] == 'origin-metadata': - tool2 = { - 'id': 8, - 'tool_name': 'origin-metadata', - 'tool_version': '0.0.1', - 'tool_configuration': {}, - } - else: - assert False, 'Unknown tool {tool_name}'.format(**tool) - - self.tools[tool2['id']] = tool2 - return tool2 - - def content_metadata_missing(self, sha1s): - yield from [] - - def content_metadata_add(self, metadata, conflict_update=None): - 
self.added_data.append( - ('content_metadata', conflict_update, metadata)) - - def revision_metadata_add(self, metadata, conflict_update=None): - assert conflict_update - self.added_data.append( - ('revision_metadata', conflict_update, metadata)) - for item in metadata: - assert isinstance(item['id'], bytes) - self.revision_metadata.setdefault(item['id'], []).append(item) - - def revision_metadata_get(self, ids): - for id_ in ids: - assert isinstance(id_, bytes) - for item in self.revision_metadata.get(id_): - item = item.copy() - tool_id = item.pop('indexer_configuration_id') - if tool_id in self.tools: - item['tool'] = self.tools[tool_id].copy() - else: # HACK: this needs to be removed altogether - item['tool'] = { - 'id': tool_id, - 'name': tool_id[0], - 'version': tool_id[1], - 'configuration': tool_id[2], - } - yield item - - def origin_intrinsic_metadata_add(self, metadata, conflict_update=None): - self.added_data.append( - ('origin_intrinsic_metadata', conflict_update, metadata)) - - def content_metadata_get(self, sha1s): - assert sha1s == [b'cde'] - return CONTENT_METADATA - - def fill_storage(storage): for origin in ORIGINS: origin = origin.copy() del origin['id'] - last_origin_id = storage.origin_add_one(origin) - visit = storage.origin_visit_add(last_origin_id, datetime.datetime.now()) - for (snap_id, snap_branches) in SNAPSHOTS.items(): - storage.snapshot_add(last_origin_id, visit['visit'], { + storage.origin_add_one(origin) + for (orig_pseudo_id, snap) in SNAPSHOTS.items(): + for orig in ORIGINS: + if orig_pseudo_id == orig['id']: + origin_id = storage.origin_get( + {'type': orig['type'], 'url': orig['url']})['id'] + break + else: + assert False + visit = storage.origin_visit_add(origin_id, datetime.datetime.now()) + snap_id = snap.get('id') or \ + bytes([random.randint(0, 255) for _ in range(32)]) + storage.snapshot_add(origin_id, visit['visit'], { 'id': snap_id, - 'branches': snap_branches + 'branches': snap['branches'] }) 
storage.revision_add(REVISIONS) storage.directory_add([{ 'id': DIRECTORY_ID, 'entries': DIRECTORY, }]) - - -class MockStorage(): - """Mock a real swh-storage storage to simplify reading indexers' - outputs. - - """ - def origin_get(self, id_): - for origin in ORIGINS: - for (k, v) in id_.items(): - if origin[k] != v: - break - else: - # This block is run iff we didn't break, ie. if all supplied - # parts of the id are set to the expected value. - return origin - assert False, id_ - - def snapshot_get_latest(self, origin_id): - if origin_id in SNAPSHOTS: - return SNAPSHOTS[origin_id] + for (obj_id, content) in OBJ_STORAGE_DATA.items(): + # TODO: use MultiHash + if hasattr(hashlib, 'blake2s'): + blake2s256 = hashlib.blake2s(content, digest_size=32).digest() else: - assert False, origin_id - - def revision_get(self, revisions): - return REVISIONS.copy() - - def directory_ls(self, directory, recursive=False, cur=None): - assert directory == DIRECTORY_ID - return DIRECTORY - - -class BasicMockStorage(): - """In memory implementation to fake the content_get_range api. - - FIXME: To remove when the actual in-memory lands. - - """ - contents = [] - - def __init__(self, contents): - self.contents = contents - - def content_get_range(self, start, end, limit=1000): - # to make input test data consilient with actual runtime the - # other way of doing properly things would be to rewrite all - # tests (that's another task entirely so not right now) - if isinstance(start, bytes): - start = hashutil.hash_to_hex(start) - if isinstance(end, bytes): - end = hashutil.hash_to_hex(end) - results = [] - _next_id = None - counter = 0 - for c in self.contents: - _id = c['sha1'] - if start <= _id and _id <= end: - results.append(c) - if counter >= limit: - break - counter += 1 - - return { - 'contents': results, - 'next': _next_id - } - - -class BasicMockIndexerStorage(): - """Mock Indexer storage to simplify reading indexers' outputs. 
- - """ - state = [] - - def _internal_add(self, data, conflict_update=None): - """All content indexer have the same structure. So reuse `data` as the - same data. It's either mimetype, language, - fossology_license, etc... - - """ - self.state = data - self.conflict_update = conflict_update - - def content_mimetype_add(self, data, conflict_update=None): - self._internal_add(data, conflict_update=conflict_update) - - def content_fossology_license_add(self, data, conflict_update=None): - self._internal_add(data, conflict_update=conflict_update) - - def content_language_add(self, data, conflict_update=None): - self._internal_add(data, conflict_update=conflict_update) - - def content_ctags_add(self, data, conflict_update=None): - self._internal_add(data, conflict_update=conflict_update) - - def _internal_get_range(self, start, end, - indexer_configuration_id, limit=1000): - """Same logic as _internal_add, we retrieve indexed data given an - identifier. So the code here does not change even though - the underlying data does. 
- - """ - # to make input test data consilient with actual runtime the - # other way of doing properly things would be to rewrite all - # tests (that's another task entirely so not right now) - if isinstance(start, bytes): - start = hashutil.hash_to_hex(start) - if isinstance(end, bytes): - end = hashutil.hash_to_hex(end) - results = [] - _next = None - counter = 0 - for m in self.state: - _id = m['id'] - _tool_id = m['indexer_configuration_id'] - if (start <= _id and _id <= end and - _tool_id == indexer_configuration_id): - results.append(_id) - if counter >= limit: - break - counter += 1 - - return { - 'ids': results, - 'next': _next - } - - def content_mimetype_get_range( - self, start, end, indexer_configuration_id, limit=1000): - return self._internal_get_range( - start, end, indexer_configuration_id, limit=limit) - - def content_fossology_license_get_range( - self, start, end, indexer_configuration_id, limit=1000): - return self._internal_get_range( - start, end, indexer_configuration_id, limit=limit) - - def indexer_configuration_add(self, tools): - return [{ - 'id': 10, - }] + # fallback for Python <3.6 + blake2s256 = bytes([random.randint(0, 255) for _ in range(32)]) + storage.content_add([{ + 'data': content, + 'length': len(content), + 'status': 'visible', + 'sha1': hash_to_bytes(obj_id), + 'sha1_git': hash_to_bytes(obj_id), + 'sha256': hashlib.sha256(content).digest(), + 'blake2s256': blake2s256 + }]) class CommonIndexerNoTool: """Mixin to wronly initialize content indexer""" def prepare(self): super().prepare() self.tools = None class CommonIndexerWithErrorsTest: """Test indexer configuration checks. 
""" Indexer = None RangeIndexer = None def test_wrong_unknown_configuration_tool(self): """Indexer with unknown configuration tool fails check""" with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): print('indexer: %s' % self.Indexer) self.Indexer() def test_wrong_unknown_configuration_tool_range(self): """Range Indexer with unknown configuration tool fails check""" if self.RangeIndexer is not None: with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): self.RangeIndexer() -class CommonContentIndexerTest: - def assert_results_ok(self, actual_results, expected_results=None): +class CommonContentIndexerTest(metaclass=abc.ABCMeta): + legacy_get_format = False + """True iff the tested indexer uses the legacy format. + see: https://forge.softwareheritage.org/T1433""" + + def get_indexer_results(self, ids): + """Override this for indexers that don't have a mock storage.""" + return self.indexer.idx_storage.state + + def assert_legacy_results_ok(self, sha1s, expected_results=None): + # XXX old format, remove this when all endpoints are + # updated to the new one + # see: https://forge.softwareheritage.org/T1433 + sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) + for sha1 in sha1s] + actual_results = list(self.get_indexer_results(sha1s)) + if expected_results is None: expected_results = self.expected_results + self.assertEqual(len(expected_results), len(actual_results), + (expected_results, actual_results)) for indexed_data in actual_results: _id = indexed_data['id'] - self.assertEqual(indexed_data, expected_results[_id]) - _tool_id = indexed_data['indexer_configuration_id'] - self.assertEqual(_tool_id, self.indexer.tool['id']) + expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() + expected_data['id'] = _id + self.assertEqual(indexed_data, expected_data) + + def assert_results_ok(self, sha1s, expected_results=None): + if self.legacy_get_format: + self.assert_legacy_results_ok(sha1s, expected_results) + return + + 
sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) + for sha1 in sha1s] + actual_results = list(self.get_indexer_results(sha1s)) + + if expected_results is None: + expected_results = self.expected_results + + self.assertEqual(len(expected_results), len(actual_results), + (expected_results, actual_results)) + for indexed_data in actual_results: + (_id, indexed_data) = list(indexed_data.items())[0] + expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() + expected_data = [expected_data] + self.assertEqual(indexed_data, expected_data) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update='update-dups') - actual_results = self.indexer.idx_storage.state - self.assertTrue(self.indexer.idx_storage.conflict_update) - self.assert_results_ok(actual_results) + self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update='ignore-dups') - self.assertFalse(self.indexer.idx_storage.conflict_update) - self.assert_results_ok(actual_results) + self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [self.id1, '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') - actual_results = self.indexer.idx_storage.state # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } - self.assert_results_ok(actual_results, expected_results) + self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" + def setUp(self): + self.contents = sorted(OBJ_STORAGE_DATA) + def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results + actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data['id'] - self.assertEqual(indexed_data, expected_results[_id]) - self.assertTrue(start <= _id and _id <= end) + assert isinstance(_id, bytes) + indexed_data = indexed_data.copy() + indexed_data['id'] = hash_to_hex(indexed_data['id']) + self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) + self.assertTrue(start <= _id <= end) _tool_id = indexed_data['indexer_configuration_id'] self.assertEqual(_tool_id, self.indexer.tool['id']) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ - start, end = [self.contents[0], self.contents[2]] # output hex ids + _start, _end = [self.contents[0], self.contents[2]] # output hex ids + start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = list(self.indexer._index_contents( start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ - start, end = [self.contents[0], self.contents[2]] # output hex ids + _start, _end = [self.contents[0], self.contents[2]] # output hex ids + start, end = map(hashutil.hash_to_bytes, (_start, _end)) data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( - start, end, indexed=set(data_indexed)) + start, end, indexed=set(map(hash_to_bytes, data_indexed))) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok( start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing 
should result in indexed data """ - start, end = [self.contents[0], self.contents[2]] # output hex ids + _start, _end = [self.contents[0], self.contents[2]] # output hex ids + start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end) # then self.assertTrue(actual_results) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( # checks the bytes input this time start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertTrue(actual_results) def test_generate_content_get_no_result(self): """No result indexed returns False""" - start, end = ['0000000000000000000000000000000000000000', - '0000000000000000000000000000000000000001'] + _start, _end = ['0000000000000000000000000000000000000000', + '0000000000000000000000000000000000000001'] + start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run( start, end, incremental=False) # then self.assertFalse(actual_results) class NoDiskIndexer: """Mixin to override the DiskIndexer behavior avoiding side-effects in tests. 
""" def write_to_temp(self, filename, data): # noop return filename def cleanup(self, content_path): # noop return None diff --git a/tox.ini b/tox.ini deleted file mode 100644 index a2d8b63..0000000 --- a/tox.ini +++ /dev/null @@ -1,33 +0,0 @@ -[tox] -envlist=flake8,py3 - -[testenv:py3] -deps = - .[testing] - pytest-cov - pifpaf -commands = - pifpaf run postgresql -- pytest --hypothesis-profile=fast --cov=swh --cov-branch {posargs} - -[testenv:py3-slow] -deps = - .[testing] - pytest-cov - pifpaf -commands = - pifpaf run postgresql -- pytest --hypothesis-profile=slow --cov=swh --cov-branch {posargs} - -[testenv:py3-prop] -deps = - .[testing] - pytest-cov - pifpaf -commands = - pifpaf run postgresql -- pytest --hypothesis-profile=fast -m property_based --disable-warnings - -[testenv:flake8] -skip_install = true -deps = - flake8 -commands = - {envpython} -m flake8 diff --git a/version.txt b/version.txt index 2c9dcbb..c71589e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.118-0-gf2da005 \ No newline at end of file +v0.0.124-0-ga9cff24 \ No newline at end of file