diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c279bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+*.pyc
+*.sw?
+*~
+/.coverage
+/.coverage.*
+.eggs/
+__pycache__
+*.egg-info/
+build/
+dist/
+version.txt
+/sql/createdb-stamp
+/sql/filldb-stamp
+.tox/
+.hypothesis/
+.mypy_cache/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..d1f84e3
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,40 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v2.4.0
+  hooks:
+  - id: trailing-whitespace
+  - id: flake8
+  - id: check-json
+  - id: check-yaml
+
+- repo: https://github.com/codespell-project/codespell
+  rev: v1.16.0
+  hooks:
+  - id: codespell
+    exclude: ^(swh/indexer/data/codemeta/crosswalk.csv)$
+- repo: local
+  hooks:
+  - id: mypy
+    name: mypy
+    entry: mypy
+    args: [swh]
+    pass_filenames: false
+    language: system
+    types: [python]
+
+- repo: https://github.com/python/black
+  rev: 19.10b0
+  hooks:
+  - id: black
+
+# unfortunately, we are far from being able to enable this...
+# - repo: https://github.com/PyCQA/pydocstyle.git
+#   rev: 4.0.0
+#   hooks:
+#   - id: pydocstyle
+#     name: pydocstyle
+#     description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions.
+#     entry: pydocstyle --convention=google
+#     language: python
+#     types: [python]
+
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..27d038e
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,3 @@
+Copyright (C) 2015-2017 The Software Heritage developers
+
+See http://www.softwareheritage.org/ for more information.
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..0ad22b5
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,78 @@
+# Software Heritage Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as Software
+Heritage contributors and maintainers pledge to making participation in our
+project and our community a harassment-free experience for everyone, regardless
+of age, body size, disability, ethnicity, sex characteristics, gender identity
+and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity and
+orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at `conduct@softwareheritage.org`. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an
+incident. Further details of specific enforcement policies may be posted
+separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..a1a7b45
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,2 @@
+Siddharth Ravikumar
+Thibault Allançon
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc.
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program>  Copyright (C) <year>  <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/MANIFEST.in b/MANIFEST.in
index c6e3a9a..d5bc305 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,8 +1,9 @@
include README.md
include Makefile
include requirements*.txt
include version.txt
+include conftest.py
recursive-include sql *
recursive-include swh/indexer/sql *.sql
recursive-include swh/indexer/data *
recursive-include swh py.typed
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
index 0000000..c163514
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1 @@
+TESTFLAGS=--hypothesis-profile=fast
diff --git a/PKG-INFO b/PKG-INFO
index a2920a6..06fbd34 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,71 @@
Metadata-Version: 2.1
Name: swh.indexer
-Version: 0.1.0
+Version: 0.1.1
Summary: Software Heritage Content Indexer
Home-page: https://forge.softwareheritage.org/diffusion/78/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
Description: swh-indexer
============
Tools to compute multiple indexes on SWH's raw contents:
- content:
- mimetype
- ctags
- language
- fossology-license
- metadata
- revision:
- metadata
An indexer is in charge of:
- looking up objects
- extracting information from those objects
- storing that information in the swh-indexer db
There are multiple indexers working on different object types:
- content indexer: works with content sha1 hashes
- revision indexer: works with revision sha1 hashes
- origin indexer: works with origin identifiers
Indexation procedure:
- receive a batch of ids
- retrieve the associated data depending on object type
- compute some index for that object
- store the result in swh's storage
Current content indexers:
- mimetype (queue swh_indexer_content_mimetype): detect the encoding
and mimetype
- language (queue swh_indexer_content_language): detect the
programming language
- ctags (queue swh_indexer_content_ctags): compute tags information
- fossology-license (queue swh_indexer_fossology_license): compute the
license
- metadata: translates a file into a translated_metadata dict
Current revision indexers:
- metadata: detects files containing metadata and retrieves translated_metadata
from the content_metadata table in storage, or runs the content indexer to
translate files.
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
diff --git a/codemeta.json b/codemeta.json
new file mode 100644
index 0000000..8eaf5cc
--- /dev/null
+++ b/codemeta.json
@@ -0,0 +1,39 @@
+{
+ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld",
+ "@type": "SoftwareSourceCode",
+ "identifier": "5682a72dc61f86ae69f2841c2184d6159c0b6d5d",
+ "description": "Software Heritage Indexer for revisions and contents",
+ "name": "swh-indexer",
+ "isPartOf": {
+ "@type": "SoftwareSourceCode",
+ "name": "swh-environment",
+ "identifier": "83e766feafde91242883be1bf369ed3e6865824f"
+ },
+ "codeRepository": "https://forge.softwareheritage.org/diffusion/78/",
+ "issueTracker": "https://forge.softwareheritage.org/maniphest/",
+ "license": "https://spdx.org/licenses/GPL-3.0.html",
+ "version": "0.0.35",
+ "author": [
+ {
+ "@type": "Organization",
+ "name": "Software Heritage",
+ "url": "https://www.softwareheritage.org",
+ "email": "swh-devel@inria.fr"
+ }
+ ],
+ "developmentStatus": "active",
+ "keywords": [
+ "indexer",
+ "software",
+ "mimetype",
+ "ctags",
+ "language",
+ "fossology-license",
+ "metadata",
+ "metadata-detector",
+ "metadata-translator"
+ ],
+ "dateCreated":"2017-06-12",
+ "datePublished":"2017-06-12",
+ "programmingLanguage": "Python"
+}
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..de31662
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from hypothesis import settings
+
+# define tests profile. Full documentation is at:
+# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
+settings.register_profile("fast", max_examples=5, deadline=5000)
+settings.register_profile("slow", max_examples=20, deadline=5000)
+
+# Ignore the following modules because the wsgi module fails to load when no
+# configuration file is found (--doctest-modules forces the module
+# loading)
+collect_ignore = ["swh/indexer/storage/api/wsgi.py"]
+
+# we use the swh_scheduler fixture
+pytest_plugins = ["swh.scheduler.pytest_plugin"]
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..58a761e
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,3 @@
+_build/
+apidoc/
+*-stamp
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..59d8b80
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,3 @@
+include ../../swh-docs/Makefile.sphinx
+-include Makefile.local
+
diff --git a/docs/Makefile.local b/docs/Makefile.local
new file mode 100644
index 0000000..cd07101
--- /dev/null
+++ b/docs/Makefile.local
@@ -0,0 +1,11 @@
+sphinx/html: images
+sphinx/clean: clean-images
+assets: images
+
+images:
+	make -C images/
+clean-images:
+	make -C images/ clean
+
+.PHONY: images clean-images
+
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..f4f2481
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,49 @@
+swh-indexer
+============
+
+Tools to compute multiple indexes on SWH's raw contents:
+- content:
+ - mimetype
+ - ctags
+ - language
+ - fossology-license
+ - metadata
+- revision:
+ - metadata
+
+An indexer is in charge of:
+- looking up objects
+- extracting information from those objects
+- storing that information in the swh-indexer db
+
+There are multiple indexers working on different object types:
+ - content indexer: works with content sha1 hashes
+ - revision indexer: works with revision sha1 hashes
+ - origin indexer: works with origin identifiers
+
+Indexation procedure:
+- receive a batch of ids
+- retrieve the associated data depending on object type
+- compute some index for that object
+- store the result in swh's storage
+
+Current content indexers:
+
+- mimetype (queue swh_indexer_content_mimetype): detect the encoding
+ and mimetype
+
+- language (queue swh_indexer_content_language): detect the
+ programming language
+
+- ctags (queue swh_indexer_content_ctags): compute tags information
+
+- fossology-license (queue swh_indexer_fossology_license): compute the
+ license
+
+- metadata: translates a file into a translated_metadata dict
+
+Current revision indexers:
+
+- metadata: detects files containing metadata and retrieves translated_metadata
+  from the content_metadata table in storage, or runs the content indexer to
+  translate files.
diff --git a/docs/_static/.placeholder b/docs/_static/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/docs/_templates/.placeholder b/docs/_templates/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..190deb7
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1 @@
+from swh.docs.sphinx.conf import * # NoQA
diff --git a/docs/dev-info.rst b/docs/dev-info.rst
new file mode 100644
index 0000000..493b102
--- /dev/null
+++ b/docs/dev-info.rst
@@ -0,0 +1,206 @@
+Hacking on swh-indexer
+======================
+
+This tutorial will guide you through hacking on swh-indexer.
+If you do not have a local copy of the Software Heritage archive, go to the
+`getting started tutorial
+<https://docs.softwareheritage.org/devel/getting-started.html>`_ first.
+
+Configuration files
+-------------------
+You will need the following YAML configuration files to run the swh-indexer
+commands:
+
+- Orchestrator at
+  ``~/.config/swh/indexer/orchestrator.yml``
+
+.. code-block:: yaml
+
+    indexers:
+      mimetype:
+        check_presence: false
+        batch_size: 100
+
+- Orchestrator-text at
+  ``~/.config/swh/indexer/orchestrator-text.yml``
+
+.. code-block:: yaml
+
+    indexers:
+      # language:
+      #   batch_size: 10
+      #   check_presence: false
+      fossology_license:
+        batch_size: 10
+        check_presence: false
+      # ctags:
+      #   batch_size: 2
+      #   check_presence: false
+
+- Mimetype indexer at
+  ``~/.config/swh/indexer/mimetype.yml``
+
+.. code-block:: yaml
+
+    # storage to read sha1's metadata (path)
+    # storage:
+    #   cls: local
+    #   args:
+    #     db: "service=swh-dev"
+    #     objstorage:
+    #       cls: pathslicing
+    #       args:
+    #         root: /home/storage/swh-storage/
+    #         slicing: 0:1/1:5
+
+    storage:
+      cls: remote
+      args:
+        url: http://localhost:5002/
+
+    indexer_storage:
+      cls: remote
+      args:
+        url: http://localhost:5007/
+
+    # storage to read sha1's content
+    # adapt this to your need
+    # locally: this needs to match your storage's setup
+    objstorage:
+      cls: pathslicing
+      args:
+        slicing: 0:1/1:5
+        root: /home/storage/swh-storage/
+
+    destination_task: swh.indexer.tasks.SWHOrchestratorTextContentsTask
+    rescheduling_task: swh.indexer.tasks.SWHContentMimetypeTask
+
+
+- Fossology indexer at
+  ``~/.config/swh/indexer/fossology_license.yml``
+
+.. code-block:: yaml
+
+    # storage to read sha1's metadata (path)
+    # storage:
+    #   cls: local
+    #   args:
+    #     db: "service=swh-dev"
+    #     objstorage:
+    #       cls: pathslicing
+    #       args:
+    #         root: /home/storage/swh-storage/
+    #         slicing: 0:1/1:5
+
+    storage:
+      cls: remote
+      args:
+        url: http://localhost:5002/
+
+    indexer_storage:
+      cls: remote
+      args:
+        url: http://localhost:5007/
+
+    # storage to read sha1's content
+    # adapt this to your need
+    # locally: this needs to match your storage's setup
+    objstorage:
+      cls: pathslicing
+      args:
+        slicing: 0:1/1:5
+        root: /home/storage/swh-storage/
+
+    workdir: /tmp/swh/worker.indexer/license/
+
+    tools:
+      name: 'nomos'
+      version: '3.1.0rc2-31-ga2cbb8c'
+      configuration:
+        command_line: 'nomossa <filepath>'
+
+
+- Worker at
+  ``~/.config/swh/worker.yml``
+
+.. code-block:: yaml
+
+    task_broker: amqp://guest@localhost//
+    task_modules:
+      - swh.loader.svn.tasks
+      - swh.loader.tar.tasks
+      - swh.loader.git.tasks
+      - swh.storage.archiver.tasks
+      - swh.indexer.tasks
+      - swh.indexer.orchestrator
+    task_queues:
+      - swh_loader_svn
+      - swh_loader_tar
+      - swh_reader_git_to_azure_archive
+      - swh_storage_archive_worker_to_backend
+      - swh_indexer_orchestrator_content_all
+      - swh_indexer_orchestrator_content_text
+      - swh_indexer_content_mimetype
+      - swh_indexer_content_language
+      - swh_indexer_content_ctags
+      - swh_indexer_content_fossology_license
+      - swh_loader_svn_mount_and_load
+      - swh_loader_git_express
+      - swh_loader_git_archive
+      - swh_loader_svn_archive
+    task_soft_time_limit: 0
+
+
+Database
+--------
+
+swh-indexer uses a database to store the indexed content. The default
+database is expected to be called ``swh-indexer-dev``.
+
+Create or add ``swh-dev`` and ``swh-indexer-dev`` entries to the
+``~/.pg_service.conf`` and ``~/.pgpass`` files, which are PostgreSQL's
+configuration files.
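+
+For example, a minimal ``~/.pg_service.conf`` entry could look like this
+(host, port and user are illustrative; adapt them to your local setup)::
+
+    [swh-indexer-dev]
+    host=localhost
+    port=5432
+    dbname=swh-indexer-dev
+    user=postgres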
+
+Add data to local DB
+--------------------
+From within the ``swh-environment``, run the following command::
+
+    make rebuild-testdata
+
+and fetch some real data to work with, using::
+
+    python3 -m swh.loader.git.updater --origin-url <url>
+
+Then you can list all content files using this script::
+
+    #!/usr/bin/env bash
+
+    psql service=swh-dev -c "copy (select sha1 from content) to stdout" | sed -e 's/^\\\\x//g'
+
+Run the indexers
+-----------------
+Use the list of contents to feed the indexers with the
+following command::
+
+    ./list-sha1.sh | python3 -m swh.indexer.producer --batch 100 --task-name orchestrator_all
+
+Activate the workers
+--------------------
+To send messages to the different queues using RabbitMQ (which should
+already have been installed when setting up the dependencies), run the
+following command in a dedicated terminal::
+
+    python3 -m celery worker --app=swh.scheduler.celery_backend.config.app \
+        --pool=prefork \
+        --concurrency=1 \
+        -Ofair \
+        --loglevel=info \
+        --without-gossip \
+        --without-mingle \
+        --without-heartbeat 2>&1
+
+With this command, the workers will consume messages from RabbitMQ according
+to the worker configuration file.
+
+Note: the fossology_license indexer requires the fossology-nomossa package,
+which is available in our `public debian repository
+`_.
diff --git a/docs/images/.gitignore b/docs/images/.gitignore
new file mode 100644
index 0000000..d890b03
--- /dev/null
+++ b/docs/images/.gitignore
@@ -0,0 +1 @@
+tasks-metadata-indexers.svg
diff --git a/docs/images/Makefile b/docs/images/Makefile
new file mode 100644
index 0000000..3481956
--- /dev/null
+++ b/docs/images/Makefile
@@ -0,0 +1,11 @@
+
+UML_DIAGS_SRC = $(wildcard *.uml)
+UML_DIAGS = $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC))
+
+all: $(UML_DIAGS)
+
+%.svg: %.uml
+	DISPLAY="" plantuml -tsvg $<
+
+clean:
+	-rm -f $(DEP_GRAPHS) $(UML_DIAGS)
diff --git a/docs/images/tasks-metadata-indexers.uml b/docs/images/tasks-metadata-indexers.uml
new file mode 100644
index 0000000..954e079
--- /dev/null
+++ b/docs/images/tasks-metadata-indexers.uml
@@ -0,0 +1,84 @@
+@startuml
+ participant LOADERS as "Loaders"
+ participant JOURNAL as "Journal"
+ participant SCHEDULER as "Scheduler"
+ participant IDX_ORIG_HEAD as "Origin-Head Indexer"
+ participant IDX_REV_META as "Revision Metadata Indexer"
+ participant IDX_CONT_META as "Content Metadata Indexer"
+ participant IDX_ORIG_META as "Origin Metadata Indexer"
+ participant IDX_STORAGE as "Indexer Storage"
+ participant STORAGE as "Graph Storage"
+ participant OBJ_STORAGE as "Object Storage"
+
+ activate OBJ_STORAGE
+ activate IDX_STORAGE
+ activate STORAGE
+ activate JOURNAL
+ activate SCHEDULER
+
+ activate LOADERS
+
+ LOADERS->>JOURNAL: Origin 42 was added/revisited
+ deactivate LOADERS
+
+ JOURNAL->>SCHEDULER: run indexers on origin 42
+
+ SCHEDULER->>IDX_ORIG_HEAD: Find HEAD revision of 42
+ activate IDX_ORIG_HEAD
+
+ IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin=42)
+
+ STORAGE->>IDX_ORIG_HEAD: branches
+
+ IDX_ORIG_HEAD->>SCHEDULER: run Revision Metadata Indexer\non revision 42abcdef\n(head of origin 42)
+ deactivate IDX_ORIG_HEAD
+
+ SCHEDULER->>IDX_REV_META: Index revision 42abcdef\n(head of origin 42)
+ activate IDX_REV_META
+
+ IDX_REV_META->>STORAGE: revision_get(sha1=42abcdef)
+ STORAGE->>IDX_REV_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...}
+
+ IDX_REV_META->>STORAGE: directory_ls(sha1=456789ab)
+ STORAGE->>IDX_REV_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...]
+
+ IDX_REV_META->>IDX_REV_META: package.json is a metadata file
+
+ IDX_REV_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe)
+ IDX_STORAGE->>IDX_REV_META: none / {author: "Jane Doe", ...}
+
+ alt If the storage answered "none"
+ IDX_REV_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file
+ activate IDX_CONT_META
+
+ IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe
+
+ OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...'
+
+ IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author
+
+ IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...})
+ IDX_STORAGE->>IDX_CONT_META: ok
+
+ IDX_CONT_META->>IDX_REV_META: extracted: {author: "Jane Doe", ...}
+ deactivate IDX_CONT_META
+ end
+
+ IDX_REV_META->>IDX_STORAGE: revision_metadata_add(sha1=42abcdef, {author: "Jane Doe", ...})
+ IDX_STORAGE->>IDX_REV_META: ok
+
+ IDX_REV_META->>SCHEDULER: run Origin Metadata Indexer\non origin 42; the head is 42abcdef
+ deactivate IDX_REV_META
+
+ SCHEDULER->>IDX_ORIG_META: Index origin 42; the head is 42abcdef
+ activate IDX_ORIG_META
+
+ IDX_ORIG_META->>IDX_STORAGE: revision_metadata_get(sha1=42abcdef)
+ IDX_STORAGE->>IDX_ORIG_META: {author: "Jane Doe", ...}
+
+ IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id=42, {author: "Jane Doe", ...})
+ IDX_STORAGE->>IDX_ORIG_META: ok
+ deactivate IDX_ORIG_META
+
+
+@enduml
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..b80d6f4
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,25 @@
+.. _swh-indexer:
+
+Software Heritage - Indexer
+===========================
+
+Tools and workers used to mine the content of the archive and extract derived
+information from archive source code artifacts.
+
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Contents:
+
+ README.md
+ dev-info.rst
+ metadata-workflow.rst
+
+
+Reference Documentation
+-----------------------
+
+.. toctree::
+ :maxdepth: 2
+
+ /apidoc/swh.indexer
diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst
new file mode 100644
index 0000000..471ce8c
--- /dev/null
+++ b/docs/metadata-workflow.rst
@@ -0,0 +1,208 @@
+Metadata workflow
+=================
+
+Intrinsic metadata
+------------------
+
+Indexing :term:`intrinsic metadata` requires extracting information from the
+lowest levels of the :ref:`Merkle DAG <swh-merkle-dag>` (directories, files,
+and content blobs) and associating it with the highest ones (origins).
+In order to deduplicate the work between origins, we split this work between
+multiple indexers, which coordinate with each other and save their results
+at each step in the indexer storage.
+
+Indexer architecture
+--------------------
+
+.. thumbnail:: images/tasks-metadata-indexers.svg
+
+
+Origin-Head Indexer
+___________________
+
+First, the Origin-Head indexer is called externally, with an origin as
+argument (or multiple origins, which are handled sequentially).
+For now, its tasks are scheduled manually via recurring Scheduler tasks; in
+the near future, the :term:`journal` will be used to trigger it.
+
+It first looks up the last :term:`snapshot` and determines what the main
+branch of the origin is (the "Head branch") and what revision it points to
+(the "Head").
+Intrinsic metadata for that origin will be extracted from that revision.
+
+It schedules a Revision Metadata Indexer task for that revision, with a
+hint that the revision is the Head of that particular origin.
+
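+For illustration, resolving the Head from a snapshot amounts to something
+like the following sketch (the helper name and the single level of alias
+dereferencing are simplifying assumptions, not the actual implementation):
+
+.. code-block:: python
+
+    def find_head_revision(storage, origin_url):
+        """Return the sha1_git of the revision the HEAD branch points to."""
+        snapshot = storage.snapshot_get_latest(origin_url)
+        branch = snapshot["branches"].get(b"HEAD")
+        if branch and branch["target_type"] == "alias":
+            # HEAD is usually an alias to the default branch
+            branch = snapshot["branches"].get(branch["target"])
+        if branch and branch["target_type"] == "revision":
+            return branch["target"]
+        return None
+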
+
+Revision and Content Metadata Indexers
+______________________________________
+
+These two indexers do the hard part of the work. The Revision Metadata
+Indexer fetches the root directory associated with a revision, then extracts
+the metadata from that directory.
+
+To do so, it lists files in that directory, and looks for known names, such
+as `codemeta.json`, `package.json`, or `pom.xml`. If there are any, it
+runs the Content Metadata Indexer on them, which in turn fetches their
+contents and runs them through extraction dictionaries/mappings.
+See below for details.
+
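+As an illustration, the detection step boils down to matching file names
+against the names each mapping declares; a simplified sketch (not the actual
+``detect_metadata`` code, and the file-name table is abridged):
+
+.. code-block:: python
+
+    KNOWN_FILENAMES = {
+        b"codemeta.json": "CodemetaMapping",
+        b"package.json": "NpmMapping",
+        b"pom.xml": "MavenMapping",
+    }
+
+    def detect_metadata_files(entries):
+        """Map mapping names to the sha1s of the files they can translate."""
+        results = {}
+        for entry in entries:
+            mapping_name = KNOWN_FILENAMES.get(entry["name"])
+            if mapping_name is not None:
+                results.setdefault(mapping_name, []).append(entry["sha1"])
+        return results
+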
+Their results are saved in a database (the indexer storage), associated with
+the content and revision hashes.
+
+If it received a hint that this revision is the head of an origin, the
+Revision Metadata Indexer then schedules the Origin Metadata Indexer
+to run on that origin.
+
+
+Origin Metadata Indexer
+_______________________
+
+The job of this indexer is very simple: it takes an origin identifier and
+a revision hash, and copies the metadata of the latter to a new table, to
+associate it with the former.
+
+The reason for this is to be able to perform searches on metadata, and
+efficiently find out which origins match the pattern.
+Running that search on the `revision_intrinsic_metadata` table would require
+a reverse lookup from revisions to origins, which is costly.
+
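+Conceptually, the copy is as simple as the following sketch (using the
+simplified method names from the sequence diagram above):
+
+.. code-block:: python
+
+    def index_origin(idx_storage, origin_url, head_revision_id):
+        # Metadata was already computed and stored for the head revision
+        metadata = idx_storage.revision_metadata_get(head_revision_id)
+        # Associate it with the origin, so searches can return origins
+        idx_storage.origin_metadata_add(origin_url, metadata)
+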
+
+Translation from language-specific metadata to CodeMeta
+-------------------------------------------------------
+
+Intrinsic metadata are extracted from files provided with a project's source
+code, and translated using `CodeMeta`_'s `crosswalk table`_.
+
+All input formats supported so far are straightforward dictionaries (e.g.
+JSON) or can be accessed as such (e.g. XML); so the first part of the
+translation is to map their keys to terms in the CodeMeta vocabulary.
+This is done by parsing the crosswalk table's `CSV file`_ and using it as a
+map between these two vocabularies; this does not require any
+format-specific code in the indexers.
+
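+For instance, the crosswalk's NPM column is what tells the indexer that the
+``bugs`` key of a ``package.json`` maps to the CodeMeta term
+``issueTracker``. A minimal sketch of building such a key map from the CSV
+(column names are illustrative):
+
+.. code-block:: python
+
+    import csv
+
+    def load_crosswalk(csv_path, column):
+        """Map ecosystem-specific keys (one CSV column) to CodeMeta terms."""
+        with open(csv_path, newline="") as f:
+            return {
+                row[column]: row["Property"]
+                for row in csv.DictReader(f)
+                if row[column]
+            }
+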
+The second part is to normalize values. As language-specific metadata files
+each have their way(s) of formatting these values, we need to turn them into
+the data type required by CodeMeta.
+This normalization makes up most of the code of
+:py:mod:`swh.indexer.metadata_dictionary`.
+
+.. _CodeMeta: https://codemeta.github.io/
+.. _crosswalk table: https://codemeta.github.io/crosswalk/
+.. _CSV file: https://github.com/codemeta/codemeta/blob/master/crosswalk.csv
+
+
+Supported intrinsic metadata
+----------------------------
+
+The following sources of intrinsic metadata are supported:
+
+* CodeMeta's `codemeta.json`_,
+* Maven's `pom.xml`_,
+* NPM's `package.json`_,
+* Python's `PKG-INFO`_,
+* Ruby's `.gemspec`_
+
+.. _codemeta.json: https://codemeta.github.io/terms/
+.. _pom.xml: https://maven.apache.org/pom.html
+.. _package.json: https://docs.npmjs.com/files/package.json
+.. _PKG-INFO: https://www.python.org/dev/peps/pep-0314/
+.. _.gemspec: https://guides.rubygems.org/specification-reference/
+
+
+Supported CodeMeta terms
+------------------------
+
+The following terms may be found in the output of the metadata translation
+(other than the `codemeta` mapping, which is the identity function, and
+therefore supports all terms):
+
+.. program-output:: python3 -m swh.indexer.cli mapping list-terms --exclude-mapping codemeta
+ :nostderr:
+
+
+Adding support for additional ecosystem-specific metadata
+---------------------------------------------------------
+
+This section will guide you through adding code to the metadata indexer to
+detect and translate new metadata formats.
+
+First, you should start by picking one of the `CodeMeta crosswalks`_.
+Then create a new file in `swh-indexer/swh/indexer/metadata_dictionary/` that
+will contain your code, and create a new class that inherits from the helper
+classes, with some documentation about your indexer:
+
+.. code-block:: python
+
+ from .base import DictMapping, SingleFileMapping
+ from swh.indexer.codemeta import CROSSWALK_TABLE
+
+ class MyMapping(DictMapping, SingleFileMapping):
+ """Dedicated class for ..."""
+ name = 'my-mapping'
+ filename = b'the-filename'
+ mapping = CROSSWALK_TABLE['Name of the CodeMeta crosswalk']
+
+.. _CodeMeta crosswalks: https://github.com/codemeta/codemeta/tree/master/crosswalks
+
+Then, add a `string_fields` attribute, which is the list of all keys whose
+values are simple text values. For instance, to
+`translate Python PKG-INFO`_, it is:
+
+.. code-block:: python
+
+ string_fields = ['name', 'version', 'description', 'summary',
+ 'author', 'author-email']
+
+
+These values will be automatically added to the above list of
+supported terms.
+
+.. _translate Python PKG-INFO: https://forge.softwareheritage.org/source/swh-indexer/browse/master/swh/indexer/metadata_dictionary/python.py
+
+The last step to get your code working: add a `translate` method that takes
+a single byte string as argument, turns it into a Python dictionary whose
+keys are those of the input document, and passes that dictionary to
+`_translate_dict`.
+
+For instance, if the input document is in JSON, it can be as simple as:
+
+.. code-block:: python
+
+ def translate(self, raw_content):
+ raw_content = raw_content.decode() # bytes to str
+ content_dict = json.loads(raw_content) # str to dict
+ return self._translate_dict(content_dict) # convert to CodeMeta
+
+`_translate_dict` will do the heavy work: for each of the `string_fields`,
+it reads the corresponding value in `content_dict`, looks the key up in the
+crosswalk table, and builds a CodeMeta dictionary using the corresponding
+names from the crosswalk table.
+
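+For example, assuming a crosswalk that maps ``author-email`` to ``email``
+(hypothetical values, for illustration only), the result would be roughly:
+
+.. code-block:: python
+
+    MyMapping().translate(b'{"name": "foo", "author-email": "jane@example.com"}')
+    # {"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+    #  "type": "SoftwareSourceCode",
+    #  "name": "foo",
+    #  "email": "jane@example.com"}
+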
+One last thing is needed to run your code: add your class to the list in
+`swh-indexer/swh/indexer/metadata_dictionary/__init__.py`, so the rest of the
+code is aware of it.
+
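+The registration itself amounts to adding your class to the module's
+``MAPPINGS`` dict; schematically (the exact layout of ``__init__.py`` may
+differ):
+
+.. code-block:: python
+
+    from .my_mapping import MyMapping
+
+    MAPPINGS = {
+        # ... existing mappings ...
+        "MyMapping": MyMapping,
+    }
+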
+Now, you can run it:
+
+.. code-block:: shell
+
+ python3 -m swh.indexer.metadata_dictionary MyMapping path/to/input/file
+
+and it will (hopefully) return a CodeMeta object.
+
+If it works, well done!
+
+You can now improve your translation code further, by adding methods that
+will do more advanced conversions. For example, if there is a field named
+`license` containing an SPDX identifier, you must convert it to a URI,
+like this:
+
+.. code-block:: python
+
+ def normalize_license(self, s):
+ if isinstance(s, str):
+ return {"@id": "https://spdx.org/licenses/" + s}
+
+This method will automatically get called by `_translate_dict` when it
+finds a `license` field in `content_dict`.
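+
+With that method in place, a hypothetical input containing
+``"license": "MIT"`` would translate to:
+
+.. code-block:: python
+
+    MyMapping().translate(b'{"name": "foo", "license": "MIT"}')
+    # {"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+    #  "type": "SoftwareSourceCode",
+    #  "name": "foo",
+    #  "license": "https://spdx.org/licenses/MIT"}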
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..0df07a7
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,30 @@
+[mypy]
+namespace_packages = True
+warn_unused_ignores = True
+
+
+# 3rd party libraries without stubs (yet)
+
+[mypy-celery.*]
+ignore_missing_imports = True
+
+[mypy-confluent_kafka.*]
+ignore_missing_imports = True
+
+[mypy-magic.*]
+ignore_missing_imports = True
+
+[mypy-pkg_resources.*]
+ignore_missing_imports = True
+
+[mypy-psycopg2.*]
+ignore_missing_imports = True
+
+[mypy-pyld.*]
+ignore_missing_imports = True
+
+[mypy-pytest.*]
+ignore_missing_imports = True
+
+[mypy-xmltodict.*]
+ignore_missing_imports = True
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..4b8d2d3
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+addopts = -p no:pytest_swh_scheduler
+norecursedirs = docs
diff --git a/requirements-swh.txt b/requirements-swh.txt
index 32c8593..0363717 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,6 +1,6 @@
swh.core[db,http] >= 0.0.87
swh.model >= 0.0.15
swh.objstorage >= 0.0.43
swh.scheduler >= 0.0.47
-swh.storage >= 0.6.0
+swh.storage >= 0.8.0
swh.journal >= 0.1.0
diff --git a/requirements-test.txt b/requirements-test.txt
index 68bb694..ac0c1f0 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,5 @@
confluent-kafka
pytest
-pytest-postgresql
hypothesis>=3.11.0
-swh.storage>= 0.0.178
+swh.scheduler[testing] >= 0.5.0
+swh.storage[testing] >= 0.10.0
diff --git a/setup.py b/setup.py
index 1f6fd99..b0c777c 100755
--- a/setup.py
+++ b/setup.py
@@ -1,73 +1,73 @@
#!/usr/bin/env python3
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from setuptools import setup, find_packages
from os import path
from io import open
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file
with open(path.join(here, "README.md"), encoding="utf-8") as f:
long_description = f.read()
def parse_requirements(name=None):
if name:
reqf = "requirements-%s.txt" % name
else:
reqf = "requirements.txt"
requirements = []
if not path.exists(reqf):
return requirements
with open(reqf) as f:
for line in f.readlines():
line = line.strip()
if not line or line.startswith("#"):
continue
requirements.append(line)
return requirements
setup(
name="swh.indexer",
description="Software Heritage Content Indexer",
long_description=long_description,
long_description_content_type="text/markdown",
python_requires=">=3.7",
author="Software Heritage developers",
author_email="swh-devel@inria.fr",
url="https://forge.softwareheritage.org/diffusion/78/",
packages=find_packages(),
scripts=[],
install_requires=parse_requirements() + parse_requirements("swh"),
- setup_requires=["vcversioner"],
+ setup_requires=["setuptools-scm"],
+ use_scm_version=True,
extras_require={"testing": parse_requirements("test")},
- vcversioner={},
include_package_data=True,
entry_points="""
[console_scripts]
swh-indexer=swh.indexer.cli:main
[swh.cli.subcommands]
indexer=swh.indexer.cli:cli
""",
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 5 - Production/Stable",
],
project_urls={
"Bug Reports": "https://forge.softwareheritage.org/maniphest",
"Funding": "https://www.softwareheritage.org/donate",
"Source": "https://forge.softwareheritage.org/source/swh-indexer",
"Documentation": "https://docs.softwareheritage.org/devel/swh-indexer/",
},
)
diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO
index a2920a6..06fbd34 100644
--- a/swh.indexer.egg-info/PKG-INFO
+++ b/swh.indexer.egg-info/PKG-INFO
@@ -1,71 +1,71 @@
Metadata-Version: 2.1
Name: swh.indexer
-Version: 0.1.0
+Version: 0.1.1
Summary: Software Heritage Content Indexer
Home-page: https://forge.softwareheritage.org/diffusion/78/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
Description: swh-indexer
============
Tools to compute multiple indexes on SWH's raw contents:
- content:
- mimetype
- ctags
- language
- fossology-license
- metadata
- revision:
- metadata
An indexer is in charge of:
- looking up objects
- extracting information from those objects
- storing that information in the swh-indexer db
There are multiple indexers working on different object types:
- content indexer: works with content sha1 hashes
- revision indexer: works with revision sha1 hashes
- origin indexer: works with origin identifiers
Indexation procedure:
- receive a batch of ids
- retrieve the associated data depending on the object type
- compute an index for that object
- store the result in swh's storage
Current content indexers:
- mimetype (queue swh_indexer_content_mimetype): detect the encoding
and mimetype
- language (queue swh_indexer_content_language): detect the
programming language
- ctags (queue swh_indexer_content_ctags): compute tags information
- fossology-license (queue swh_indexer_fossology_license): compute the
license
- metadata: translate file into translated_metadata dict
Current revision indexers:
- metadata: detects files containing metadata, and retrieves translated_metadata
from the content_metadata table in storage, or runs the content indexer to
translate files.
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt
index 3f25a51..1dc3047 100644
--- a/swh.indexer.egg-info/SOURCES.txt
+++ b/swh.indexer.egg-info/SOURCES.txt
@@ -1,108 +1,133 @@
+.gitignore
+.pre-commit-config.yaml
+AUTHORS
+CODE_OF_CONDUCT.md
+CONTRIBUTORS
+LICENSE
MANIFEST.in
Makefile
+Makefile.local
README.md
+codemeta.json
+conftest.py
+mypy.ini
pyproject.toml
+pytest.ini
requirements-swh.txt
requirements-test.txt
requirements.txt
setup.cfg
setup.py
-version.txt
+tox.ini
+docs/.gitignore
+docs/Makefile
+docs/Makefile.local
+docs/README.md
+docs/conf.py
+docs/dev-info.rst
+docs/index.rst
+docs/metadata-workflow.rst
+docs/_static/.placeholder
+docs/_templates/.placeholder
+docs/images/.gitignore
+docs/images/Makefile
+docs/images/tasks-metadata-indexers.uml
sql/bin/db-upgrade
sql/bin/dot_add_content
+sql/doc/json
sql/doc/json/.gitignore
sql/doc/json/Makefile
sql/doc/json/indexer_configuration.tool_configuration.schema.json
sql/doc/json/revision_metadata.translated_metadata.json
sql/json/.gitignore
sql/json/Makefile
sql/json/indexer_configuration.tool_configuration.schema.json
sql/json/revision_metadata.translated_metadata.json
sql/upgrades/115.sql
sql/upgrades/116.sql
sql/upgrades/117.sql
sql/upgrades/118.sql
sql/upgrades/119.sql
sql/upgrades/120.sql
sql/upgrades/121.sql
sql/upgrades/122.sql
sql/upgrades/123.sql
sql/upgrades/124.sql
sql/upgrades/125.sql
sql/upgrades/126.sql
sql/upgrades/127.sql
sql/upgrades/128.sql
sql/upgrades/129.sql
sql/upgrades/130.sql
sql/upgrades/131.sql
sql/upgrades/132.sql
swh/__init__.py
swh.indexer.egg-info/PKG-INFO
swh.indexer.egg-info/SOURCES.txt
swh.indexer.egg-info/dependency_links.txt
swh.indexer.egg-info/entry_points.txt
swh.indexer.egg-info/requires.txt
swh.indexer.egg-info/top_level.txt
swh/indexer/__init__.py
swh/indexer/cli.py
swh/indexer/codemeta.py
swh/indexer/ctags.py
swh/indexer/fossology_license.py
swh/indexer/indexer.py
swh/indexer/journal_client.py
swh/indexer/metadata.py
swh/indexer/metadata_detector.py
swh/indexer/mimetype.py
swh/indexer/origin_head.py
swh/indexer/py.typed
swh/indexer/rehash.py
swh/indexer/tasks.py
swh/indexer/data/codemeta/CITATION
swh/indexer/data/codemeta/LICENSE
swh/indexer/data/codemeta/codemeta.jsonld
swh/indexer/data/codemeta/crosswalk.csv
swh/indexer/metadata_dictionary/__init__.py
swh/indexer/metadata_dictionary/base.py
swh/indexer/metadata_dictionary/codemeta.py
swh/indexer/metadata_dictionary/maven.py
swh/indexer/metadata_dictionary/npm.py
swh/indexer/metadata_dictionary/python.py
swh/indexer/metadata_dictionary/ruby.py
swh/indexer/sql/10-swh-init.sql
swh/indexer/sql/20-swh-enums.sql
swh/indexer/sql/30-swh-schema.sql
swh/indexer/sql/40-swh-func.sql
swh/indexer/sql/50-swh-data.sql
swh/indexer/sql/60-swh-indexes.sql
swh/indexer/storage/__init__.py
swh/indexer/storage/converters.py
swh/indexer/storage/db.py
swh/indexer/storage/exc.py
swh/indexer/storage/in_memory.py
swh/indexer/storage/interface.py
swh/indexer/storage/metrics.py
swh/indexer/storage/api/__init__.py
swh/indexer/storage/api/client.py
swh/indexer/storage/api/server.py
swh/indexer/tests/__init__.py
swh/indexer/tests/conftest.py
swh/indexer/tests/tasks.py
swh/indexer/tests/test_cli.py
swh/indexer/tests/test_codemeta.py
swh/indexer/tests/test_ctags.py
swh/indexer/tests/test_fossology_license.py
swh/indexer/tests/test_journal_client.py
swh/indexer/tests/test_metadata.py
swh/indexer/tests/test_mimetype.py
swh/indexer/tests/test_origin_head.py
swh/indexer/tests/test_origin_metadata.py
swh/indexer/tests/utils.py
swh/indexer/tests/storage/__init__.py
swh/indexer/tests/storage/conftest.py
swh/indexer/tests/storage/generate_data_test.py
swh/indexer/tests/storage/test_api_client.py
swh/indexer/tests/storage/test_converters.py
swh/indexer/tests/storage/test_in_memory.py
swh/indexer/tests/storage/test_metrics.py
swh/indexer/tests/storage/test_server.py
swh/indexer/tests/storage/test_storage.py
\ No newline at end of file
diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt
index 0d7adeb..69ab181 100644
--- a/swh.indexer.egg-info/requires.txt
+++ b/swh.indexer.egg-info/requires.txt
@@ -1,18 +1,18 @@
vcversioner
click
python-magic>=0.4.13
pyld
xmltodict
swh.core[db,http]>=0.0.87
swh.model>=0.0.15
swh.objstorage>=0.0.43
swh.scheduler>=0.0.47
-swh.storage>=0.6.0
+swh.storage>=0.8.0
swh.journal>=0.1.0
[testing]
confluent-kafka
pytest
-pytest-postgresql
hypothesis>=3.11.0
-swh.storage>=0.0.178
+swh.scheduler[testing]>=0.5.0
+swh.storage[testing]>=0.10.0
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 0fdb0db..0f28355 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,383 +1,381 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from copy import deepcopy
from typing import Any, Callable, Dict, Iterator, List, Tuple
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.storage import INDEXER_CFG_KEY
from swh.model import hashutil
REVISION_GET_BATCH_SIZE = 10
ORIGIN_GET_BATCH_SIZE = 10
def call_with_batches(
f: Callable[[List[Dict[str, Any]]], Dict[str, Any]],
args: List[Dict[str, str]],
batch_size: int,
) -> Iterator[str]:
"""Calls a function with batches of args, and concatenates the results.
"""
groups = grouper(args, batch_size)
for group in groups:
yield from f(list(group))
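# Illustrative use, as in OriginMetadataIndexer.index_list below:
#     origins = list(call_with_batches(
#         self.storage.origin_get, origin_urls, ORIGIN_GET_BATCH_SIZE))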
class ContentMetadataIndexer(ContentIndexer):
"""Content-level indexer
This indexer is in charge of:
- filtering out content already indexed in content_metadata
- reading content from objstorage with the content's id sha1
- computing metadata for the given context
- using the metadata_dictionary as the 'swh-metadata-translator' tool
- storing the result in the content_metadata table
"""
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_metadata_missing(
({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids)
)
def index(self, id, data, log_suffix="unknown revision"):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
dict: dictionary representing a content_metadata. If the
translation wasn't successful the metadata keys will
be returned as None
"""
result = {
"id": id,
"indexer_configuration_id": self.tool["id"],
"metadata": None,
}
try:
mapping_name = self.tool["tool_configuration"]["context"]
log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id)
result["metadata"] = MAPPINGS[mapping_name](log_suffix).translate(data)
except Exception:
self.log.exception(
"Problem during metadata translation "
"for content %s" % hashutil.hash_to_hex(id)
)
if result["metadata"] is None:
return None
return result
def persist_index_computations(
self, results: List[Dict], policy_update: str
) -> Dict[str, int]:
"""Persist the results in storage.
Args:
results: list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- metadata (jsonb): detected metadata
policy_update: either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
return self.idx_storage.content_metadata_add(
results, conflict_update=(policy_update == "update-dups")
)
class RevisionMetadataIndexer(RevisionIndexer):
"""Revision-level indexer
This indexer is in charge of:
- filtering revisions already indexed in revision_intrinsic_metadata table
with defined computation tool
- retrieving all entry_files in the root directory
- using metadata_detector on file_names that may contain metadata
- computing the metadata translation if necessary and possible (depends on the tool)
- sending sha1s to content indexing if possible
- storing the results for the revision
"""
ADDITIONAL_CONFIG = {
"tools": (
"dict",
{"name": "swh-metadata-detector", "version": "0.0.2", "configuration": {},},
),
}
def filter(self, sha1_gits):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.revision_intrinsic_metadata_missing(
(
{"id": sha1_git, "indexer_configuration_id": self.tool["id"],}
for sha1_git in sha1_gits
)
)
def index(self, rev):
"""Index rev by processing it and organizing result.
uses metadata_detector to iterate on filenames:
- if one filename is detected -> sends the file to the content indexer
- if multiple files are detected -> translation is needed at the revision level
Args:
rev (dict): revision artifact from storage
Returns:
dict: dictionary representing a revision_intrinsic_metadata, with
keys:
- id (str): rev's identifier (sha1_git)
- indexer_configuration_id (bytes): tool used
- metadata: dict of retrieved metadata
"""
result = {
"id": rev["id"],
"indexer_configuration_id": self.tool["id"],
"mappings": None,
"metadata": None,
}
try:
root_dir = rev["directory"]
dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
if [entry["type"] for entry in dir_ls] == ["dir"]:
# If the root is just a single directory, recurse into it
# eg. PyPI packages, GNU tarballs
subdir = dir_ls[0]["target"]
dir_ls = self.storage.directory_ls(subdir, recursive=False)
files = [entry for entry in dir_ls if entry["type"] == "file"]
detected_files = detect_metadata(files)
(mappings, metadata) = self.translate_revision_intrinsic_metadata(
detected_files,
log_suffix="revision=%s" % hashutil.hash_to_hex(rev["id"]),
)
result["mappings"] = mappings
result["metadata"] = metadata
except Exception as e:
self.log.exception("Problem when indexing rev: %r", e)
return result
def persist_index_computations(
self, results: List[Dict], policy_update: str
) -> Dict[str, int]:
"""Persist the results in storage.
Args:
results: list of revision_intrinsic_metadata, dicts with the
following keys:
- id (bytes): revision's identifier (sha1_git)
- metadata: the translated metadata, as a CodeMeta document
- mappings: list of mappings used to translate it
policy_update: either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
# TODO: add functions in storage to keep data in
# revision_intrinsic_metadata
return self.idx_storage.revision_intrinsic_metadata_add(
results, conflict_update=(policy_update == "update-dups")
)
def translate_revision_intrinsic_metadata(
self, detected_files: Dict[str, List[Any]], log_suffix: str
) -> Tuple[List[Any], List[Any]]:
"""
Determine the plan of action to translate metadata, from one or
multiple detected metadata files:
Args:
detected_files: dictionary mapping context names (e.g.,
"npm", "authors") to list of sha1
Returns:
(List[str], dict): list of mappings used and dict with
translated metadata according to the CodeMeta vocabulary
"""
used_mappings = [MAPPINGS[context].name for context in detected_files]
metadata = []
tool = {
"name": "swh-metadata-translator",
"version": "0.0.2",
"configuration": {},
}
# TODO: iterate on each context, on each file
# -> get raw_contents
# -> translate each content
config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]}
config["tools"] = [tool]
for context in detected_files.keys():
cfg = deepcopy(config)
cfg["tools"][0]["configuration"]["context"] = context
c_metadata_indexer = ContentMetadataIndexer(config=cfg)
# sha1s that are in content_metadata table
sha1s_in_storage = []
metadata_generator = self.idx_storage.content_metadata_get(
detected_files[context]
)
for c in metadata_generator:
# extracting metadata
sha1 = c["id"]
sha1s_in_storage.append(sha1)
local_metadata = c["metadata"]
# local metadata is aggregated
if local_metadata:
metadata.append(local_metadata)
sha1s_filtered = [
item for item in detected_files[context] if item not in sha1s_in_storage
]
if sha1s_filtered:
# content indexing
try:
c_metadata_indexer.run(
sha1s_filtered,
policy_update="ignore-dups",
log_suffix=log_suffix,
)
# on the fly possibility:
for result in c_metadata_indexer.results:
local_metadata = result["metadata"]
metadata.append(local_metadata)
except Exception:
self.log.exception("Exception while indexing metadata on contents")
metadata = merge_documents(metadata)
return (used_mappings, metadata)
class OriginMetadataIndexer(OriginIndexer):
ADDITIONAL_CONFIG = RevisionMetadataIndexer.ADDITIONAL_CONFIG
USE_TOOLS = False
def __init__(self, config=None, **kwargs) -> None:
super().__init__(config=config, **kwargs)
self.origin_head_indexer = OriginHeadIndexer(config=config)
self.revision_metadata_indexer = RevisionMetadataIndexer(config=config)
def index_list(self, origin_urls, **kwargs):
head_rev_ids = []
origins_with_head = []
origins = list(
call_with_batches(
- self.storage.origin_get,
- [{"url": url} for url in origin_urls],
- ORIGIN_GET_BATCH_SIZE,
+ self.storage.origin_get, origin_urls, ORIGIN_GET_BATCH_SIZE,
)
)
for origin in origins:
if origin is None:
continue
- head_result = self.origin_head_indexer.index(origin["url"])
+ head_result = self.origin_head_indexer.index(origin.url)
if head_result:
origins_with_head.append(origin)
head_rev_ids.append(head_result["revision_id"])
head_revs = list(
call_with_batches(
self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
)
)
assert len(head_revs) == len(head_rev_ids)
results = []
for (origin, rev) in zip(origins_with_head, head_revs):
if not rev:
- self.log.warning("Missing head revision of origin %r", origin["url"])
+ self.log.warning("Missing head revision of origin %r", origin.url)
continue
rev_metadata = self.revision_metadata_indexer.index(rev)
orig_metadata = {
"from_revision": rev_metadata["id"],
- "id": origin["url"],
+ "id": origin.url,
"metadata": rev_metadata["metadata"],
"mappings": rev_metadata["mappings"],
"indexer_configuration_id": rev_metadata["indexer_configuration_id"],
}
results.append((orig_metadata, rev_metadata))
return results
def persist_index_computations(
self, results: List[Dict], policy_update: str
) -> Dict[str, int]:
conflict_update = policy_update == "update-dups"
# Deduplicate revisions
rev_metadata: List[Any] = []
orig_metadata: List[Any] = []
revs_to_delete: List[Any] = []
origs_to_delete: List[Any] = []
summary: Dict = {}
for (orig_item, rev_item) in results:
assert rev_item["metadata"] == orig_item["metadata"]
if not rev_item["metadata"] or rev_item["metadata"].keys() <= {"@context"}:
# If we didn't find any metadata, don't store a DB record
# (and delete existing ones, if any)
if rev_item not in revs_to_delete:
revs_to_delete.append(rev_item)
if orig_item not in origs_to_delete:
origs_to_delete.append(orig_item)
else:
if rev_item not in rev_metadata:
rev_metadata.append(rev_item)
if orig_item not in orig_metadata:
orig_metadata.append(orig_item)
if rev_metadata:
summary_rev = self.idx_storage.revision_intrinsic_metadata_add(
rev_metadata, conflict_update=conflict_update
)
summary.update(summary_rev)
if orig_metadata:
summary_ori = self.idx_storage.origin_intrinsic_metadata_add(
orig_metadata, conflict_update=conflict_update
)
summary.update(summary_ori)
# revs_to_delete should always be empty unless we changed a mapping
# to detect fewer files or less content.
# However, origs_to_delete may be non-empty whenever an upstream
# deletes a metadata file.
if origs_to_delete:
summary_ori = self.idx_storage.origin_intrinsic_metadata_delete(
origs_to_delete
)
summary.update(summary_ori)
if revs_to_delete:
summary_rev = self.idx_storage.revision_intrinsic_metadata_delete(
revs_to_delete
)
summary.update(summary_rev)
return summary
diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
index fb25abd..1ba1528 100644
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -1,86 +1,74 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import timedelta
from unittest.mock import patch
import pytest
from swh.objstorage import get_objstorage
-from swh.scheduler.tests.conftest import * # noqa
from swh.storage import get_storage
from swh.indexer.storage import get_indexer_storage
from .utils import fill_storage, fill_obj_storage
TASK_NAMES = ["revision_intrinsic_metadata", "origin_intrinsic_metadata"]
-storage_config = {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]}
-
-
@pytest.fixture
def indexer_scheduler(swh_scheduler):
for taskname in TASK_NAMES:
swh_scheduler.create_task_type(
{
"type": taskname,
"description": "The {} indexer testing task".format(taskname),
"backend_name": "swh.indexer.tests.tasks.{}".format(taskname),
"default_interval": timedelta(days=1),
"min_interval": timedelta(hours=6),
"max_interval": timedelta(days=12),
"num_retries": 3,
}
)
return swh_scheduler
@pytest.fixture
def idx_storage():
"""An instance of in-memory indexer storage that gets injected into all
indexers classes.
"""
idx_storage = get_indexer_storage("memory", {})
with patch("swh.indexer.storage.in_memory.IndexerStorage") as idx_storage_mock:
idx_storage_mock.return_value = idx_storage
yield idx_storage
@pytest.fixture
def storage():
"""An instance of in-memory storage that gets injected into all indexers
classes.
"""
- storage = get_storage(**storage_config)
+ storage = get_storage(cls="memory")
fill_storage(storage)
with patch("swh.storage.in_memory.InMemoryStorage") as storage_mock:
storage_mock.return_value = storage
yield storage
@pytest.fixture
def obj_storage():
"""An instance of in-memory objstorage that gets injected into all indexers
classes.
"""
objstorage = get_objstorage("memory", {})
fill_obj_storage(objstorage)
with patch.dict(
"swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage}
):
yield objstorage
-
-
-@pytest.fixture(scope="session") # type: ignore # expected redefinition
-def celery_includes():
- return [
- "swh.indexer.tests.tasks",
- "swh.indexer.tasks",
- ]
diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py
index e2df26c..a67b2dc 100644
--- a/swh/indexer/tests/storage/conftest.py
+++ b/swh/indexer/tests/storage/conftest.py
@@ -1,73 +1,73 @@
# Copyright (C) 2015-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from os.path import join
import pytest
from . import SQL_DIR
-from swh.storage.tests.conftest import postgresql_fact
+from swh.storage.pytest_plugin import postgresql_fact
from swh.indexer.storage import get_indexer_storage
from swh.model.hashutil import hash_to_bytes
from .generate_data_test import MIMETYPE_OBJECTS, FOSSOLOGY_LICENSES, TOOLS
DUMP_FILES = join(SQL_DIR, "*.sql")
class DataObj(dict):
def __getattr__(self, key):
return self.__getitem__(key)
def __setattr__(self, key, value):
return self.__setitem__(key, value)
@pytest.fixture
def swh_indexer_storage_with_data(swh_indexer_storage):
data = DataObj()
tools = {
tool["tool_name"]: {
"id": tool["id"],
"name": tool["tool_name"],
"version": tool["tool_version"],
"configuration": tool["tool_configuration"],
}
for tool in swh_indexer_storage.indexer_configuration_add(TOOLS)
}
data.tools = tools
data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689")
data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7")
data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238")
data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321")
data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320")
data.origin_url_1 = "file:///dev/0/zero" # 44434341
data.origin_url_2 = "file:///dev/1/one" # 44434342
data.origin_url_3 = "file:///dev/2/two" # 54974445
data.mimetypes = [
{**mimetype_obj, "indexer_configuration_id": tools["file"]["id"]}
for mimetype_obj in MIMETYPE_OBJECTS
]
swh_indexer_storage.content_mimetype_add(data.mimetypes)
data.fossology_licenses = [
{**fossology_obj, "indexer_configuration_id": tools["nomos"]["id"]}
for fossology_obj in FOSSOLOGY_LICENSES
]
swh_indexer_storage._test_data = data
return (swh_indexer_storage, data)
swh_indexer_storage_postgresql = postgresql_fact(
"postgresql_proc", dump_files=DUMP_FILES
)
@pytest.fixture
def swh_indexer_storage(swh_indexer_storage_postgresql):
storage_config = {
"cls": "local",
"args": {"db": swh_indexer_storage_postgresql.dsn,},
}
return get_indexer_storage(**storage_config)
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index c3ef250..7abb4ed 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,1210 +1,1205 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import unittest
-import attr
-
from hypothesis import given, strategies, settings, HealthCheck
from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Directory, DirectoryEntry, Revision
from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.tests.utils import REVISION, DIRECTORY2
+
from .utils import (
BASE_TEST_CONFIG,
fill_obj_storage,
fill_storage,
YARN_PARSER_METADATA,
json_document_strategy,
xml_document_strategy,
)
TRANSLATOR_TOOL = {
"name": "swh-metadata-translator",
"version": "0.0.2",
"configuration": {"type": "local", "context": "NpmMapping"},
}
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
assert False, "should not be called; the rev indexer configures it."
REVISION_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
class Metadata(unittest.TestCase):
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setUp(self):
"""
shows the entire diff in the results
"""
self.maxDiff = None
self.npm_mapping = MAPPINGS["NpmMapping"]()
self.codemeta_mapping = MAPPINGS["CodemetaMapping"]()
self.maven_mapping = MAPPINGS["MavenMapping"]()
self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]()
self.gemspec_mapping = MAPPINGS["GemspecMapping"]()
def test_compute_metadata_none(self):
"""
testing that translating an empty content
should return None
"""
# given
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"author": [
{"type": "Person", "name": "Morane G", "email": "moranegg@example.com",}
],
}
# when
result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_index_content_metadata_npm(self):
"""
testing NPM with package.json
- one sha1 uses a file that can't be translated to metadata and
should return None in the translated metadata
"""
# given
sha1s = [
hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"),
]
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
config = BASE_TEST_CONFIG.copy()
config["tools"] = [TRANSLATOR_TOOL]
metadata_indexer = ContentMetadataTestIndexer(config=config)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# when
metadata_indexer.run(sha1s, policy_update="ignore-dups")
results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
expected_results = [
{
"metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"description": "Simple package.json test for indexer",
"name": "test_metadata",
"version": "0.0.1",
},
"id": hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
},
{
"metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"issueTracker": "https://github.com/npm/npm/issues",
"author": [
{
"type": "Person",
"name": "Isaac Z. Schlueter",
"email": "i@izs.me",
"url": "http://blog.izs.me",
}
],
"codeRepository": "git+https://github.com/npm/npm",
"description": "a package manager for JavaScript",
"license": "https://spdx.org/licenses/Artistic-2.0",
"version": "5.0.3",
"name": "npm",
"keywords": [
"install",
"modules",
"package manager",
"package.json",
],
"url": "https://docs.npmjs.com/",
},
"id": hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
},
]
for result in results:
del result["tool"]
# The assertion below returns False sometimes because of nested lists
self.assertEqual(expected_results, results)
def test_npm_bugs_normalization(self):
# valid dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"url": "https://github.com/owner/project/issues",
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
},
)
# "invalid" dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
},
)
# string
package_json = b"""{
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
},
)
def test_npm_repository_normalization(self):
# normal
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git",
"url" : "https://github.com/npm/cli.git"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
},
)
# missing url
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
},
)
# github shortcut
package_json = b"""{
"name": "foo",
"repository": "github:npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
self.assertEqual(result, expected_result)
# github shortshortcut
package_json = b"""{
"name": "foo",
"repository": "npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, expected_result)
# gitlab shortcut
package_json = b"""{
"name": "foo",
"repository": "gitlab:user/repo"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://gitlab.com/user/repo.git",
"type": "SoftwareSourceCode",
},
)
def test_detect_metadata_package_json(self):
# given
df = [
{
"sha1_git": b"abc",
"name": b"index.js",
"target": b"abc",
"length": 897,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"bcd",
},
{
"sha1_git": b"aab",
"name": b"package.json",
"target": b"aab",
"length": 712,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"cde",
},
]
# when
results = detect_metadata(df)
expected_results = {"NpmMapping": [b"cde"]}
# then
self.assertEqual(expected_results, results)
def test_compute_metadata_valid_codemeta(self):
raw_content = b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""" # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can "
"be used to standardize the exchange of software metadata "
"across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science "
"software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X",
},
{
"type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"id": "http://orcid.org/0000-0003-0077-4738",
},
],
"maintainer": {
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X",
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"id": "https://doi.org/10.13039/100000001",
"type": "Organization",
"name": "National Science Foundation",
},
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
"in Scientific Software",
"keywords": ["metadata", "software"],
"version": "2.0",
"dateCreated": "2017-06-05",
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD",
}
result = self.codemeta_mapping.translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_codemeta_alternate_context(self):
raw_content = b"""{
"@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta"
}""" # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
}
result = self.codemeta_mapping.translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_maven(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
"codeRepository": (
"http://repo1.maven.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_empty(self):
raw_content = b"""
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_compute_metadata_maven_almost_empty(self):
raw_content = b"""
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_compute_metadata_maven_invalid_xml(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error parsing XML from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_unknown_encoding(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error detecting XML encoding from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_invalid_encoding(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error unidecoding XML from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_minimal(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_empty_nodes(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"version": "1.2.3",
},
)
def test_compute_metadata_maven_invalid_licenses(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
foo
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_multiple(self):
"""Tests when there are multiple code repos and licenses."""
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
example
Example Maven Repo
default
http://example.org/maven2
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
MIT license
https://opensource.org/licenses/MIT
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"license": [
"https://www.apache.org/licenses/LICENSE-2.0.txt",
"https://opensource.org/licenses/MIT",
],
"codeRepository": [
"http://repo1.maven.org/maven2/com/mycompany/app/my-app",
"http://example.org/maven2/com/mycompany/app/my-app",
],
},
)
def test_compute_metadata_pkginfo(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: swh.core
Version: 0.0.49
Summary: Software Heritage core utilities
Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
Description: swh-core
========
\x20
core library for swh's modules:
- config parser
- hash computations
- serialization
- logging mechanism
\x20
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertCountEqual(
result["description"],
[
"Software Heritage core utilities", # note the comma here
"swh-core\n"
"========\n"
"\n"
"core library for swh's modules:\n"
"- config parser\n"
"- hash computations\n"
"- serialization\n"
"- logging mechanism\n"
"",
],
result,
)
del result["description"]
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"url": "https://forge.softwareheritage.org/diffusion/DCORE/",
"name": "swh.core",
"author": [
{
"type": "Person",
"name": "Software Heritage developers",
"email": "swh-devel@inria.fr",
}
],
"version": "0.0.49",
},
)
def test_compute_metadata_pkginfo_utf8(self):
raw_content = b"""\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
Hydrology N\xc2\xb083
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "snowpyt",
"description": "foo\nHydrology N°83",
},
)
def test_compute_metadata_pkginfo_keywords(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
"keywords": ["foo", "bar", "baz"],
},
)
def test_compute_metadata_pkginfo_license(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
"license": "MIT",
},
)
def test_gemspec_base(self):
raw_content = b"""
Gem::Specification.new do |s|
s.name = 'example'
s.version = '0.1.0'
s.licenses = ['MIT']
s.summary = "This is an example!"
s.description = "Much longer explanation of the example!"
s.authors = ["Ruby Coder"]
s.email = 'rubycoder@example.com'
s.files = ["lib/example.rb"]
s.homepage = 'https://rubygems.org/gems/example'
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(
result.pop("description"),
["This is an example!", "Much longer explanation of the example!"],
)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [{"type": "Person", "name": "Ruby Coder"}],
"name": "example",
"license": "https://spdx.org/licenses/MIT",
"codeRepository": "https://rubygems.org/gems/example",
"email": "rubycoder@example.com",
"version": "0.1.0",
},
)
def test_gemspec_two_author_fields(self):
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1"]
s.author = "Ruby Coder2"
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(
result.pop("author"),
[
{"type": "Person", "name": "Ruby Coder1"},
{"type": "Person", "name": "Ruby Coder2"},
],
)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_gemspec_invalid_author(self):
raw_content = b"""
Gem::Specification.new do |s|
s.author = ["Ruby Coder"]
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
raw_content = b"""
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [{"type": "Person", "name": "Ruby Coder1"}],
},
)
def test_gemspec_alternative_header(self):
raw_content = b"""
require './lib/version'
Gem::Specification.new { |s|
s.name = 'rb-system-with-aliases'
s.summary = 'execute system commands with aliases'
}
"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "rb-system-with-aliases",
"description": "execute system commands with aliases",
},
)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(NpmMapping.mapping)))
def test_npm_adversarial(self, doc):
raw = json.dumps(doc).encode()
self.npm_mapping.translate(raw)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=CODEMETA_TERMS))
def test_codemeta_adversarial(self, doc):
raw = json.dumps(doc).encode()
self.codemeta_mapping.translate(raw)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
xml_document_strategy(
keys=list(MavenMapping.mapping),
root="project",
xmlns="http://maven.apache.org/POM/4.0.0",
)
)
def test_maven_adversarial(self, doc):
self.maven_mapping.translate(doc)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
strategies.dictionaries(
# keys
strategies.one_of(
strategies.text(), *map(strategies.just, GemspecMapping.mapping)
),
# values
strategies.recursive(
strategies.characters(),
lambda children: strategies.lists(children, min_size=1),
),
)
)
def test_gemspec_adversarial(self, doc):
parts = [b"Gem::Specification.new do |s|\n"]
for (k, v) in doc.items():
parts.append(" s.{} = {}\n".format(k, repr(v)).encode())
parts.append(b"end\n")
self.gemspec_mapping.translate(b"".join(parts))
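# Editor's sketch (not in the original patch) of one input the adversarial
# test above might generate, assuming a draw of {"name": "X"}:
#
#   Gem::Specification.new do |s|
#     s.name = 'X'
#   end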
def test_revision_metadata_indexer(self):
metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
tool = metadata_indexer.idx_storage.indexer_configuration_get(
- {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()}
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
+ rev = REVISION
+ assert rev.directory == DIRECTORY2.id
metadata_indexer.idx_storage.content_metadata_add(
[
{
"indexer_configuration_id": tool["id"],
- "id": b"cde",
+ "id": DIRECTORY2.entries[0].target,
"metadata": YARN_PARSER_METADATA,
}
]
)
- sha1_gits = [
- hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- ]
- metadata_indexer.run(sha1_gits, "update-dups")
+ metadata_indexer.run([rev.id], "update-dups")
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits)
+ metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
)
expected_results = [
{
- "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
+ "id": rev.id,
"tool": TRANSLATOR_TOOL,
"metadata": YARN_PARSER_METADATA,
"mappings": ["npm"],
}
]
for result in results:
del result["tool"]["id"]
# then
- self.assertEqual(expected_results, results)
+ self.assertEqual(results, expected_results)
def test_revision_metadata_indexer_single_root_dir(self):
metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory that becomes the only directory at the
# root of the revision
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
- rev = metadata_indexer.storage._revisions[rev_id]
- subdir_id = rev.directory
- rev = attr.evolve(rev, directory=b"123456")
- metadata_indexer.storage.directory_add(
- [
- {
- "id": b"123456",
- "entries": [
- {
- "name": b"foobar-1.0.0",
- "type": "dir",
- "target": subdir_id,
- "perms": 16384,
- }
- ],
- }
- ]
+ rev = REVISION
+ assert rev.directory == DIRECTORY2.id
+
+ directory = Directory(
+ entries=(
+ DirectoryEntry(
+ name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384,
+ ),
+ ),
)
+ assert directory.id is not None
+ metadata_indexer.storage.directory_add([directory])
+
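+ # Rebuild the revision on top of the new root directory; dropping "id"
+ # before from_dict() lets the model recompute the hash for the modified
+ # revision (editor's note: this assumes swh.model recomputes missing ids).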
+ new_rev_dict = {**rev.to_dict(), "directory": directory.id}
+ new_rev_dict.pop("id")
+ new_rev = Revision.from_dict(new_rev_dict)
+ metadata_indexer.storage.revision_add([new_rev])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
- {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()}
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
metadata_indexer.idx_storage.content_metadata_add(
[
{
"indexer_configuration_id": tool["id"],
- "id": b"cde",
+ "id": DIRECTORY2.entries[0].target,
"metadata": YARN_PARSER_METADATA,
}
]
)
- sha1_gits = [
- hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- ]
- metadata_indexer.run(sha1_gits, "update-dups")
+ metadata_indexer.run([new_rev.id], "update-dups")
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits)
+ metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
)
expected_results = [
{
- "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
+ "id": new_rev.id,
"tool": TRANSLATOR_TOOL,
"metadata": YARN_PARSER_METADATA,
"mappings": ["npm"],
}
]
for result in results:
del result["tool"]["id"]
# then
- self.assertEqual(expected_results, results)
+ self.assertEqual(results, expected_results)
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
index a5ed93c..c137dd0 100644
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -1,199 +1,170 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
from datetime import datetime, timezone
from swh.model.model import OriginVisit, OriginVisitStatus
from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.tests.utils import BASE_TEST_CONFIG, fill_storage
from swh.storage.utils import now
+from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType
+
ORIGIN_HEAD_CONFIG = {
**BASE_TEST_CONFIG,
"tools": {"name": "origin-metadata", "version": "0.0.1", "configuration": {},},
"tasks": {"revision_intrinsic_metadata": None, "origin_intrinsic_metadata": None,},
}
class OriginHeadTestIndexer(OriginHeadIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
return ORIGIN_HEAD_CONFIG
def persist_index_computations(self, results, policy_update):
self.results = results
class OriginHead(unittest.TestCase):
def setUp(self):
self.indexer = OriginHeadTestIndexer()
self.indexer.catch_exceptions = False
fill_storage(self.indexer.storage)
def test_git(self):
- self.indexer.run(["https://github.com/SoftwareHeritage/swh-storage"])
+ origin_url = "https://github.com/SoftwareHeritage/swh-storage"
+ self.indexer.run([origin_url])
+ rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{"
- b"\xd7}\xac\xefrm",
- "origin_url": "https://github.com/SoftwareHeritage/swh-storage",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}],
)
def test_git_partial_snapshot(self):
"""Checks partial snapshots are ignored."""
origin_url = "https://github.com/SoftwareHeritage/swh-core"
- self.indexer.storage.origin_add_one(
- {"url": origin_url,}
- )
+ self.indexer.storage.origin_add([Origin(url=origin_url)])
visit = self.indexer.storage.origin_visit_add(
[
OriginVisit(
origin=origin_url,
date=datetime(2019, 2, 27, tzinfo=timezone.utc),
type="git",
- status="ongoing",
- snapshot=None,
)
]
)[0]
self.indexer.storage.snapshot_add(
[
- {
- "id": b"foo",
- "branches": {
+ Snapshot(
+ branches={
b"foo": None,
- b"HEAD": {"target_type": "alias", "target": b"foo",},
+ b"HEAD": SnapshotBranch(
+ target_type=TargetType.ALIAS, target=b"foo",
+ ),
},
- }
+ ),
]
)
visit_status = OriginVisitStatus(
origin=origin_url,
visit=visit.visit,
date=now(),
status="partial",
snapshot=b"foo",
)
self.indexer.storage.origin_visit_status_add([visit_status])
self.indexer.run([origin_url])
self.assertEqual(self.indexer.results, [])
def test_vcs_missing_snapshot(self):
- self.indexer.storage.origin_add(
- [{"url": "https://github.com/SoftwareHeritage/swh-indexer",}]
- )
- self.indexer.run(["https://github.com/SoftwareHeritage/swh-indexer"])
+ origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
+ self.indexer.storage.origin_add([Origin(url=origin_url)])
+ self.indexer.run([origin_url])
self.assertEqual(self.indexer.results, [])
def test_pypi_missing_branch(self):
origin_url = "https://pypi.org/project/abcdef/"
- self.indexer.storage.origin_add_one(
- {"url": origin_url,}
- )
+ self.indexer.storage.origin_add([Origin(url=origin_url,)])
visit = self.indexer.storage.origin_visit_add(
[
OriginVisit(
origin=origin_url,
date=datetime(2019, 2, 27, tzinfo=timezone.utc),
type="pypi",
- status="ongoing",
- snapshot=None,
)
]
)[0]
self.indexer.storage.snapshot_add(
[
- {
- "id": b"foo",
- "branches": {
+ Snapshot(
+ branches={
b"foo": None,
- b"HEAD": {"target_type": "alias", "target": b"foo",},
+ b"HEAD": SnapshotBranch(
+ target_type=TargetType.ALIAS, target=b"foo",
+ ),
},
- }
+ )
]
)
visit_status = OriginVisitStatus(
origin=origin_url,
visit=visit.visit,
date=now(),
status="full",
snapshot=b"foo",
)
self.indexer.storage.origin_visit_status_add([visit_status])
self.indexer.run(["https://pypi.org/project/abcdef/"])
self.assertEqual(self.indexer.results, [])
def test_ftp(self):
- self.indexer.run(["rsync://ftp.gnu.org/gnu/3dldf"])
+ origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
+ self.indexer.run([origin_url])
+ rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee"
- b"\xcc\x1a\xb4`\x8c\x8by",
- "origin_url": "rsync://ftp.gnu.org/gnu/3dldf",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}],
)
def test_ftp_missing_snapshot(self):
- self.indexer.storage.origin_add([{"url": "rsync://ftp.gnu.org/gnu/foobar",}])
- self.indexer.run(["rsync://ftp.gnu.org/gnu/foobar"])
+ origin_url = "rsync://ftp.gnu.org/gnu/foobar"
+ self.indexer.storage.origin_add([Origin(url=origin_url)])
+ self.indexer.run([origin_url])
self.assertEqual(self.indexer.results, [])
def test_deposit(self):
- self.indexer.run(["https://forge.softwareheritage.org/source/jesuisgpl/"])
+ origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
+ self.indexer.storage.origin_add([Origin(url=origin_url)])
+ self.indexer.run([origin_url])
+ rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{"
- b"\xa6\xe9\x99\xb1\x9e]q\xeb",
- "origin_url": "https://forge.softwareheritage.org/source/"
- "jesuisgpl/",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}],
)
def test_deposit_missing_snapshot(self):
- self.indexer.storage.origin_add(
- [{"url": "https://forge.softwareheritage.org/source/foobar",}]
- )
- self.indexer.run(["https://forge.softwareheritage.org/source/foobar"])
+ origin_url = "https://forge.softwareheritage.org/source/foobar"
+ self.indexer.storage.origin_add([Origin(url=origin_url,)])
+ self.indexer.run([origin_url])
self.assertEqual(self.indexer.results, [])
def test_pypi(self):
- self.indexer.run(["https://pypi.org/project/limnoria/"])
+ origin_url = "https://pypi.org/project/limnoria/"
+ self.indexer.run([origin_url])
+
+ rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k"
- b"A\x10\x9d\xc5\xfa2\xf8t",
- "origin_url": "https://pypi.org/project/limnoria/",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url}],
)
def test_svn(self):
- self.indexer.run(["http://0-512-md.googlecode.com/svn/"])
+ origin_url = "http://0-512-md.googlecode.com/svn/"
+ self.indexer.run([origin_url])
+ rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8"
- b"\xc9\xad#.\x1bw=\x18",
- "origin_url": "http://0-512-md.googlecode.com/svn/",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}],
)
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 79e8de3..2533981 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,224 +1,212 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from unittest.mock import patch
-from swh.model.hashutil import hash_to_bytes
-
from swh.indexer.metadata import OriginMetadataIndexer
-from .utils import YARN_PARSER_METADATA
+from swh.model.model import Origin
+
+from .utils import YARN_PARSER_METADATA, REVISION
from .test_metadata import REVISION_METADATA_CONFIG
def test_origin_metadata_indexer(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
- indexer.run(["https://github.com/librariesio/yarn-parser"])
-
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ indexer.run([origin])
+ rev_id = REVISION.id
rev_metadata = {
"id": rev_id,
"metadata": YARN_PARSER_METADATA,
"mappings": ["npm"],
}
origin_metadata = {
"id": origin,
"from_revision": rev_id,
"metadata": YARN_PARSER_METADATA,
"mappings": ["npm"],
}
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
for result in results:
del result["tool"]
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
for result in results:
del result["tool"]
assert results == [origin_metadata]
def test_origin_metadata_indexer_duplicate_origin(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.storage = storage
indexer.idx_storage = idx_storage
indexer.run(["https://github.com/librariesio/yarn-parser"])
-
indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert len(results) == 1
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert len(results) == 1
def test_origin_metadata_indexer_missing_head(idx_storage, storage, obj_storage):
-
- storage.origin_add([{"url": "https://example.com"}])
+ storage.origin_add([Origin(url="https://example.com")])
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.run(["https://example.com"])
origin = "https://example.com"
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_partial_missing_head(
idx_storage, storage, obj_storage
):
- storage.origin_add([{"url": "https://example.com"}])
-
- indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
- indexer.run(["https://example.com", "https://github.com/librariesio/yarn-parser"])
-
origin1 = "https://example.com"
origin2 = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ storage.origin_add([Origin(url=origin1)])
+ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ indexer.run([origin1, origin2])
- rev_metadata = {
- "id": rev_id,
- "metadata": YARN_PARSER_METADATA,
- "mappings": ["npm"],
- }
- origin_metadata = {
- "id": origin2,
- "from_revision": rev_id,
- "metadata": YARN_PARSER_METADATA,
- "mappings": ["npm"],
- }
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
for result in results:
del result["tool"]
- assert results == [rev_metadata]
+ assert results == [
+ {"id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"],}
+ ]
results = list(
indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
)
for result in results:
del result["tool"]
- assert results == [origin_metadata]
+ assert results == [
+ {
+ "id": origin2,
+ "from_revision": rev_id,
+ "metadata": YARN_PARSER_METADATA,
+ "mappings": ["npm"],
+ }
+ ]
def test_origin_metadata_indexer_duplicate_revision(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.storage = storage
indexer.idx_storage = idx_storage
- indexer.run(
- [
- "https://github.com/librariesio/yarn-parser",
- "https://github.com/librariesio/yarn-parser.git",
- ]
- )
-
origin1 = "https://github.com/librariesio/yarn-parser"
origin2 = "https://github.com/librariesio/yarn-parser.git"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ indexer.run([origin1, origin2])
+
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert len(results) == 1
results = list(
indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
)
assert len(results) == 2
def test_origin_metadata_indexer_no_metadata_file(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ origin = "https://github.com/librariesio/yarn-parser"
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
- indexer.run(["https://github.com/librariesio/yarn-parser"])
+ indexer.run([origin])
- origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_no_metadata(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ origin = "https://github.com/librariesio/yarn-parser"
with patch(
"swh.indexer.metadata.RevisionMetadataIndexer"
".translate_revision_intrinsic_metadata",
return_value=(["npm"], {"@context": "foo"}),
):
- indexer.run(["https://github.com/librariesio/yarn-parser"])
+ indexer.run([origin])
- origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_error(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ origin = "https://github.com/librariesio/yarn-parser"
with patch(
"swh.indexer.metadata.RevisionMetadataIndexer"
".translate_revision_intrinsic_metadata",
return_value=None,
):
- indexer.run(["https://github.com/librariesio/yarn-parser"])
+ indexer.run([origin])
- origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_delete_metadata(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
- indexer.run(["https://github.com/librariesio/yarn-parser"])
-
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ indexer.run([origin])
+
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results != []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results != []
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
- indexer.run(["https://github.com/librariesio/yarn-parser"])
+ indexer.run([origin])
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_unknown_origin(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
result = indexer.index_list(["https://unknown.org/foo"])
assert not result
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
index 3a39558..b3f0612 100644
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -1,740 +1,774 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import functools
-import random
from typing import Dict, Any
import unittest
from hypothesis import strategies
from swh.model import hashutil
from swh.model.hashutil import hash_to_bytes, hash_to_hex
-from swh.model.model import OriginVisit, OriginVisitStatus
+from swh.model.model import (
+ Content,
+ Directory,
+ DirectoryEntry,
+ Origin,
+ OriginVisit,
+ OriginVisitStatus,
+ Person,
+ Revision,
+ RevisionType,
+ Snapshot,
+ SnapshotBranch,
+ TargetType,
+ Timestamp,
+ TimestampWithTimezone,
+)
from swh.storage.utils import now
from swh.indexer.storage import INDEXER_CFG_KEY
BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = {
- "storage": {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]},
+ "storage": {"cls": "memory"},
"objstorage": {"cls": "memory", "args": {},},
INDEXER_CFG_KEY: {"cls": "memory", "args": {},},
}
+
+ORIGINS = [
+ Origin(url="https://github.com/SoftwareHeritage/swh-storage"),
+ Origin(url="rsync://ftp.gnu.org/gnu/3dldf"),
+ Origin(url="https://forge.softwareheritage.org/source/jesuisgpl/"),
+ Origin(url="https://pypi.org/project/limnoria/"),
+ Origin(url="http://0-512-md.googlecode.com/svn/"),
+ Origin(url="https://github.com/librariesio/yarn-parser"),
+ Origin(url="https://github.com/librariesio/yarn-parser.git"),
+]
+
+
ORIGIN_VISITS = [
- {"type": "git", "url": "https://github.com/SoftwareHeritage/swh-storage"},
- {"type": "ftp", "url": "rsync://ftp.gnu.org/gnu/3dldf"},
- {"type": "deposit", "url": "https://forge.softwareheritage.org/source/jesuisgpl/"},
- {"type": "pypi", "url": "https://pypi.org/project/limnoria/"},
- {"type": "svn", "url": "http://0-512-md.googlecode.com/svn/"},
- {"type": "git", "url": "https://github.com/librariesio/yarn-parser"},
- {"type": "git", "url": "https://github.com/librariesio/yarn-parser.git"},
+ {"type": "git", "origin": ORIGINS[0].url},
+ {"type": "ftp", "origin": ORIGINS[1].url},
+ {"type": "deposit", "origin": ORIGINS[2].url},
+ {"type": "pypi", "origin": ORIGINS[3].url},
+ {"type": "svn", "origin": ORIGINS[4].url},
+ {"type": "git", "origin": ORIGINS[5].url},
+ {"type": "git", "origin": ORIGINS[6].url},
]
+
+DIRECTORY = Directory(
+ id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
+ entries=(
+ DirectoryEntry(
+ name=b"index.js",
+ type="file",
+ target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"),
+ perms=0o100644,
+ ),
+ DirectoryEntry(
+ name=b"package.json",
+ type="file",
+ target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
+ perms=0o100644,
+ ),
+ DirectoryEntry(
+ name=b".github",
+ type="dir",
+ target=Directory(entries=()).id,
+ perms=0o040000,
+ ),
+ ),
+)
+
+DIRECTORY2 = Directory(
+ id=b"\xf8zz\xa1\x12`<1$\xfav\xf9\x01\xfd5\x85F`\xf2\xb6",
+ entries=(
+ DirectoryEntry(
+ name=b"package.json",
+ type="file",
+ target=hash_to_bytes("f5305243b3ce7ef8dc864ebc73794da304025beb"),
+ perms=0o100644,
+ ),
+ ),
+)
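+# (editor's note) DIRECTORY2's single entry targets the yarn-parser
+# package.json blob registered below in OBJ_STORAGE_DATA under
+# "f5305243b3ce7ef8dc864ebc73794da304025beb".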
+
+REVISION = Revision(
+ id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"),
+ message=b"Improve search functionality",
+ author=Person(
+ name=b"Andrew Nesbitt",
+ fullname=b"Andrew Nesbitt ",
+ email=b"andrewnez@gmail.com",
+ ),
+ committer=Person(
+ name=b"Andrew Nesbitt",
+ fullname=b"Andrew Nesbitt ",
+ email=b"andrewnez@gmail.com",
+ ),
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1380883849, microseconds=0,),
+ offset=120,
+ negative_utc=False,
+ ),
+ type=RevisionType.GIT,
+ synthetic=False,
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1487596456, microseconds=0,),
+ offset=0,
+ negative_utc=False,
+ ),
+ directory=DIRECTORY2.id,
+ parents=(),
+)
+
+REVISIONS = [REVISION]
+
SNAPSHOTS = [
- {
- "origin": "https://github.com/SoftwareHeritage/swh-storage",
- "branches": {
- b"refs/heads/add-revision-origin-cache": {
- "target": b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
- b"s\xe7/\xe9l\x1e",
- "target_type": "revision",
- },
- b"refs/head/master": {
- "target": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}" b"\xac\xefrm",
- "target_type": "revision",
- },
- b"HEAD": {"target": b"refs/head/master", "target_type": "alias"},
- b"refs/tags/v0.0.103": {
- "target": b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b"\x0f\xdd",
- "target_type": "release",
- },
- },
- },
- {
- "origin": "rsync://ftp.gnu.org/gnu/3dldf",
- "branches": {
- b"3DLDF-1.1.4.tar.gz": {
- "target": b"dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc" b'"G\x99\x11',
- "target_type": "revision",
- },
- b"3DLDF-2.0.2.tar.gz": {
- "target": b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e="
- b"\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V",
- "target_type": "revision",
- },
- b"3DLDF-2.0.3-examples.tar.gz": {
- "target": b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97"
- b"\xfe\xadZ\x80\x80\xc1\x83\xff",
- "target_type": "revision",
- },
- b"3DLDF-2.0.3.tar.gz": {
- "target": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee"
- b"\xcc\x1a\xb4`\x8c\x8by",
- "target_type": "revision",
- },
- b"3DLDF-2.0.tar.gz": {
- "target": b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G" b"\xd3\xd1m",
- "target_type": "revision",
- },
- },
- },
- {
- "origin": "https://forge.softwareheritage.org/source/jesuisgpl/",
- "branches": {
- b"master": {
- "target": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{"
- b"\xa6\xe9\x99\xb1\x9e]q\xeb",
- "target_type": "revision",
- }
- },
- "id": b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r ",
- },
- {
- "origin": "https://pypi.org/project/limnoria/",
- "branches": {
- b"HEAD": {"target": b"releases/2018.09.09", "target_type": "alias"},
- b"releases/2018.09.01": {
- "target": b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d"
- b"\xbb\xdfF\xfdw\xcf",
- "target_type": "revision",
- },
- b"releases/2018.09.09": {
- "target": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k"
- b"A\x10\x9d\xc5\xfa2\xf8t",
- "target_type": "revision",
- },
- },
- "id": b"{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay" b"\x12\x9e\xd6\xb3",
- },
- {
- "origin": "http://0-512-md.googlecode.com/svn/",
- "branches": {
- b"master": {
- "target": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8"
- b"\xc9\xad#.\x1bw=\x18",
- "target_type": "revision",
- }
+ Snapshot(
+ id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"),
+ branches={
+ b"refs/heads/add-revision-origin-cache": SnapshotBranch(
+ target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e',
+ target_type=TargetType.REVISION,
+ ),
+ b"refs/head/master": SnapshotBranch(
+ target=b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm",
+ target_type=TargetType.REVISION,
+ ),
+ b"HEAD": SnapshotBranch(
+ target=b"refs/head/master", target_type=TargetType.ALIAS
+ ),
+ b"refs/tags/v0.0.103": SnapshotBranch(
+ target=b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+\x0f\xdd',
+ target_type=TargetType.RELEASE,
+ ),
},
- "id": b"\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7"
- b"\x05\xea\xb8\x1f\xc4H\xf4s",
- },
- {
- "origin": "https://github.com/librariesio/yarn-parser",
- "branches": {
- b"HEAD": {
- "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- "target_type": "revision",
- }
+ ),
+ Snapshot(
+ id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"),
+ branches={
+ b"3DLDF-1.1.4.tar.gz": SnapshotBranch(
+ target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11',
+ target_type=TargetType.REVISION,
+ ),
+ b"3DLDF-2.0.2.tar.gz": SnapshotBranch(
+ target=b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", # noqa
+ target_type=TargetType.REVISION,
+ ),
+ b"3DLDF-2.0.3-examples.tar.gz": SnapshotBranch(
+ target=b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97\xfe\xadZ\x80\x80\xc1\x83\xff", # noqa
+ target_type=TargetType.REVISION,
+ ),
+ b"3DLDF-2.0.3.tar.gz": SnapshotBranch(
+ target=b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by", # noqa
+ target_type=TargetType.REVISION,
+ ),
+ b"3DLDF-2.0.tar.gz": SnapshotBranch(
+ target=b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G\xd3\xd1m",
+ target_type=TargetType.REVISION,
+ ),
},
- },
- {
- "origin": "https://github.com/librariesio/yarn-parser.git",
- "branches": {
- b"HEAD": {
- "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- "target_type": "revision",
- }
+ ),
+ Snapshot(
+ id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"),
+ branches={
+ b"master": SnapshotBranch(
+ target=b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb", # noqa
+ target_type=TargetType.REVISION,
+ )
},
- },
-]
-
-
-REVISIONS = [
- {
- "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- "message": b"Improve search functionality",
- "author": {
- "name": b"Andrew Nesbitt",
- "fullname": b"Andrew Nesbitt ",
- "email": b"andrewnez@gmail.com",
+ ),
+ Snapshot(
+ id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"),
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=b"releases/2018.09.09", target_type=TargetType.ALIAS
+ ),
+ b"releases/2018.09.01": SnapshotBranch(
+ target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf",
+ target_type=TargetType.REVISION,
+ ),
+ b"releases/2018.09.09": SnapshotBranch(
+ target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa
+ target_type=TargetType.REVISION,
+ ),
},
- "committer": {
- "name": b"Andrew Nesbitt",
- "fullname": b"Andrew Nesbitt ",
- "email": b"andrewnez@gmail.com",
+ ),
+ Snapshot(
+ id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"),
+ branches={
+ b"master": SnapshotBranch(
+ target=b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18",
+ target_type=TargetType.REVISION,
+ )
},
- "committer_date": {
- "negative_utc": False,
- "offset": 120,
- "timestamp": {"microseconds": 0, "seconds": 1380883849,},
+ ),
+ Snapshot(
+ id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=REVISION.id, target_type=TargetType.REVISION,
+ )
},
- "type": "git",
- "synthetic": False,
- "date": {
- "negative_utc": False,
- "timestamp": {"seconds": 1487596456, "microseconds": 0,},
- "offset": 0,
+ ),
+ Snapshot(
+ id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=REVISION.id, target_type=TargetType.REVISION,
+ )
},
- "directory": b"10",
- "parents": (),
- }
+ ),
]
-DIRECTORY_ID = b"10"
-
-DIRECTORY_ENTRIES = [
- {"name": b"index.js", "type": "file", "target": b"abc", "perms": 33188,},
- {"name": b"package.json", "type": "file", "target": b"cde", "perms": 33188,},
- {"name": b".github", "type": "dir", "target": b"11", "perms": 16384,},
-]
SHA1_TO_LICENSES = {
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"],
"02fb2c89e14f7fab46701478c83779c7beb7b069": ["Apache2.0"],
"103bc087db1d26afc3a0283f38663d081e9b01e6": ["MIT"],
"688a5ef812c53907562fe379d4b3851e69c7cb15": ["AGPL"],
"da39a3ee5e6b4b0d3255bfef95601890afd80709": [],
}
SHA1_TO_CTAGS = {
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5": [
{"name": "foo", "kind": "str", "line": 10, "lang": "bar",}
],
"d4c647f0fc257591cc9ba1722484229780d1c607": [
{"name": "let", "kind": "int", "line": 100, "lang": "haskell",}
],
"688a5ef812c53907562fe379d4b3851e69c7cb15": [
{"name": "symbol", "kind": "float", "line": 99, "lang": "python",}
],
}
OBJ_STORAGE_DATA = {
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5": b"this is some text",
"688a5ef812c53907562fe379d4b3851e69c7cb15": b"another text",
"8986af901dd2043044ce8f0d8fc039153641cf17": b"yet another text",
"02fb2c89e14f7fab46701478c83779c7beb7b069": b"""
import unittest
import logging
from swh.indexer.mimetype import MimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
"103bc087db1d26afc3a0283f38663d081e9b01e6": b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
"93666f74f1cf635c8c8ac118879da6ec5623c410": b"""
(should 'pygments (recognize 'lisp 'easily))
""",
"26a9f72a7c87cc9205725cfd879f514ff4f3d8d5": b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
"d4c647f0fc257591cc9ba1722484229780d1c607": b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
"a7ab314d8a11d2c93e3dcf528ca294e7b431c449": b"""
""",
"da39a3ee5e6b4b0d3255bfef95601890afd80709": b"",
- # 626364
- hash_to_hex(b"bcd"): b"unimportant content for bcd",
- # 636465
- hash_to_hex(
- b"cde"
- ): b"""
+ # was 626364 / b'bcd'
+ "e3e40fee6ff8a52f06c3b428bfe7c0ed2ef56e92": b"unimportant content for bcd",
+ # was 636465 / b'cde'; now the yarn-parser package.json
+ "f5305243b3ce7ef8dc864ebc73794da304025beb": b"""
{
"name": "yarn-parser",
"version": "1.0.0",
"description": "Tiny web service for parsing yarn.lock files",
"main": "index.js",
"scripts": {
"start": "node index.js",
"test": "mocha"
},
"engines": {
"node": "9.8.0"
},
"repository": {
"type": "git",
"url": "git+https://github.com/librariesio/yarn-parser.git"
},
"keywords": [
"yarn",
"parse",
"lock",
"dependencies"
],
"author": "Andrew Nesbitt",
"license": "AGPL-3.0",
"bugs": {
"url": "https://github.com/librariesio/yarn-parser/issues"
},
"homepage": "https://github.com/librariesio/yarn-parser#readme",
"dependencies": {
"@yarnpkg/lockfile": "^1.0.0",
"body-parser": "^1.15.2",
"express": "^4.14.0"
},
"devDependencies": {
"chai": "^4.1.2",
"mocha": "^5.2.0",
"request": "^2.87.0",
"test": "^0.6.0"
}
}
""",
}
+
YARN_PARSER_METADATA = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"url": "https://github.com/librariesio/yarn-parser#readme",
"codeRepository": "git+git+https://github.com/librariesio/yarn-parser.git",
"author": [{"type": "Person", "name": "Andrew Nesbitt"}],
"license": "https://spdx.org/licenses/AGPL-3.0",
"version": "1.0.0",
"description": "Tiny web service for parsing yarn.lock files",
"issueTracker": "https://github.com/librariesio/yarn-parser/issues",
"name": "yarn-parser",
"keywords": ["yarn", "parse", "lock", "dependencies"],
"type": "SoftwareSourceCode",
}
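# (editor's note) the doubled "git+git+" prefix above is intentional: the
# npm mapping prepends "git+" to the repository URL from package.json,
# which already carries that scheme in this fixture.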
json_dict_keys = strategies.one_of(
strategies.characters(),
strategies.just("type"),
strategies.just("url"),
strategies.just("name"),
strategies.just("email"),
strategies.just("@id"),
strategies.just("@context"),
strategies.just("repository"),
strategies.just("license"),
strategies.just("repositories"),
strategies.just("licenses"),
)
"""Hypothesis strategy that generates strings, with an emphasis on those
that are often used as dictionary keys in metadata files."""
generic_json_document = strategies.recursive(
strategies.none()
| strategies.booleans()
| strategies.floats()
| strategies.characters(),
lambda children: (
strategies.lists(children, min_size=1)
| strategies.dictionaries(json_dict_keys, children, min_size=1)
),
)
"""Hypothesis strategy that generates possible values for values of JSON
metadata files."""
def json_document_strategy(keys=None):
"""Generates an hypothesis strategy that generates metadata files
for a JSON-based format that uses the given keys."""
if keys is None:
keys = strategies.characters()
else:
keys = strategies.one_of(map(strategies.just, keys))
return strategies.dictionaries(keys, generic_json_document, min_size=1)
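# Editor's sketch (not part of the original patch): a strategy built here
# can be sampled for quick inspection with Hypothesis' .example() API,
# which must not be called from inside tests:
#
#   json_document_strategy(keys=["name", "version"]).example()
#   # -> e.g. {'version': '0'}  (draws vary; shown value is illustrative)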
def _tree_to_xml(root, xmlns, data):
def encode(s):
"Skips unpaired surrogates generated by json_document_strategy"
return s.encode("utf8", "replace")
def to_xml(data, indent=b" "):
if data is None:
return b""
elif isinstance(data, (bool, str, int, float)):
return indent + encode(str(data))
elif isinstance(data, list):
return b"\n".join(to_xml(v, indent=indent) for v in data)
elif isinstance(data, dict):
lines = []
for (key, value) in data.items():
lines.append(indent + encode("<{}>".format(key)))
lines.append(to_xml(value, indent=indent + b" "))
lines.append(indent + encode("{}>".format(key)))
return b"\n".join(lines)
else:
raise TypeError(data)
return b"\n".join(
[
'<{} xmlns="{}">'.format(root, xmlns).encode(),
to_xml(data),
"{}>".format(root).encode(),
]
)
class TreeToXmlTest(unittest.TestCase):
def test_leaves(self):
self.assertEqual(
_tree_to_xml("root", "http://example.com", None),
b'<root xmlns="http://example.com">\n\n</root>',
)
self.assertEqual(
_tree_to_xml("root", "http://example.com", True),
b'<root xmlns="http://example.com">\n  True\n</root>',
)
self.assertEqual(
_tree_to_xml("root", "http://example.com", "abc"),
b'<root xmlns="http://example.com">\n  abc\n</root>',
)
self.assertEqual(
_tree_to_xml("root", "http://example.com", 42),
b'<root xmlns="http://example.com">\n  42\n</root>',
)
self.assertEqual(
_tree_to_xml("root", "http://example.com", 3.14),
b'<root xmlns="http://example.com">\n  3.14\n</root>',
)
def test_dict(self):
self.assertIn(
_tree_to_xml("root", "http://example.com", {"foo": "bar", "baz": "qux"}),
[
b'<root xmlns="http://example.com">\n'
b"  <foo>\n    bar\n  </foo>\n"
b"  <baz>\n    qux\n  </baz>\n"
b"</root>",
b'<root xmlns="http://example.com">\n'
b"  <baz>\n    qux\n  </baz>\n"
b"  <foo>\n    bar\n  </foo>\n"
b"</root>",
],
)
def test_list(self):
self.assertEqual(
_tree_to_xml(
"root", "http://example.com", [{"foo": "bar"}, {"foo": "baz"},]
),
b'<root xmlns="http://example.com">\n'
b"  <foo>\n    bar\n  </foo>\n"
b"  <foo>\n    baz\n  </foo>\n"
b"</root>",
)
def xml_document_strategy(keys, root, xmlns):
"""Generates an hypothesis strategy that generates metadata files
for an XML format that uses the given keys."""
return strategies.builds(
functools.partial(_tree_to_xml, root, xmlns), json_document_strategy(keys)
)
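# Editor's sketch: a draw from
#   xml_document_strategy(["name"], "project", "http://maven.apache.org/POM/4.0.0")
# is a byte string shaped like
#
#   <project xmlns="http://maven.apache.org/POM/4.0.0">
#     <name>
#       ...
#     </name>
#   </project>
#
# which is what test_maven_adversarial feeds to MavenMapping.translate.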
def filter_dict(d, keys):
"return a copy of the dict with keys deleted"
if not isinstance(keys, (list, tuple)):
keys = (keys,)
return dict((k, v) for (k, v) in d.items() if k not in keys)
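# Editor's example: filter_dict({"a": 1, "b": 2}, "a") == {"b": 2}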
def fill_obj_storage(obj_storage):
"""Add some content in an object storage."""
for (obj_id, content) in OBJ_STORAGE_DATA.items():
obj_storage.add(content, obj_id=hash_to_bytes(obj_id))
def fill_storage(storage):
- visit_types = {}
- for visit in ORIGIN_VISITS:
- storage.origin_add_one({"url": visit["url"]})
- visit_types[visit["url"]] = visit["type"]
- for snap in SNAPSHOTS:
- origin_url = snap["origin"]
+ storage.origin_add(ORIGINS)
+ storage.directory_add([DIRECTORY, DIRECTORY2])
+ storage.revision_add(REVISIONS)
+ storage.snapshot_add(SNAPSHOTS)
+
+ for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
+ assert snapshot.id is not None
+
visit = storage.origin_visit_add(
- [
- OriginVisit(
- origin=origin_url,
- date=now(),
- type=visit_types[origin_url],
- status="ongoing",
- snapshot=None,
- )
- ]
+ [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])]
)[0]
- snap_id = snap.get("id") or bytes([random.randint(0, 255) for _ in range(32)])
- storage.snapshot_add([{"id": snap_id, "branches": snap["branches"]}])
visit_status = OriginVisitStatus(
- origin=origin_url,
+ origin=visit.origin,
visit=visit.visit,
date=now(),
status="full",
- snapshot=snap_id,
+ snapshot=snapshot.id,
)
storage.origin_visit_status_add([visit_status])
- storage.revision_add(REVISIONS)
contents = []
for (obj_id, content) in OBJ_STORAGE_DATA.items():
content_hashes = hashutil.MultiHash.from_data(content).digest()
contents.append(
- {
- "data": content,
- "length": len(content),
- "status": "visible",
- "sha1": hash_to_bytes(obj_id),
- "sha1_git": hash_to_bytes(obj_id),
- "sha256": content_hashes["sha256"],
- "blake2s256": content_hashes["blake2s256"],
- }
+ Content(
+ data=content,
+ length=len(content),
+ status="visible",
+ sha1=hash_to_bytes(obj_id),
+ sha1_git=hash_to_bytes(obj_id),
+ sha256=content_hashes["sha256"],
+ blake2s256=content_hashes["blake2s256"],
+ )
)
storage.content_add(contents)
- storage.directory_add([{"id": DIRECTORY_ID, "entries": DIRECTORY_ENTRIES,}])
class CommonContentIndexerTest(metaclass=abc.ABCMeta):
legacy_get_format = False
"""True if and only if the tested indexer uses the legacy format.
see: https://forge.softwareheritage.org/T1433
"""
def get_indexer_results(self, ids):
"""Override this for indexers that don't have a mock storage."""
return self.indexer.idx_storage.state
def assert_legacy_results_ok(self, sha1s, expected_results=None):
# XXX old format, remove this when all endpoints are
# updated to the new one
# see: https://forge.softwareheritage.org/T1433
sha1s = [
sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s
]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(
len(expected_results),
len(actual_results),
(expected_results, actual_results),
)
for indexed_data in actual_results:
_id = indexed_data["id"]
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data["id"] = _id
self.assertEqual(indexed_data, expected_data)
def assert_results_ok(self, sha1s, expected_results=None):
if self.legacy_get_format:
self.assert_legacy_results_ok(sha1s, expected_results)
return
sha1s = [
sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s
]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(
len(expected_results),
len(actual_results),
(expected_results, actual_results),
)
for indexed_data in actual_results:
(_id, indexed_data) = list(indexed_data.items())[0]
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data = [expected_data]
self.assertEqual(indexed_data, expected_data)
def test_index(self):
"""Known sha1 have their data indexed
"""
sha1s = [self.id0, self.id1, self.id2]
# when
self.indexer.run(sha1s, policy_update="update-dups")
self.assert_results_ok(sha1s)
# 2nd pass
self.indexer.run(sha1s, policy_update="ignore-dups")
self.assert_results_ok(sha1s)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
sha1s = [
self.id1,
"799a5ef812c53907562fe379d4b3851e69c7cb15", # unknown
"800a5ef812c53907562fe379d4b3851e69c7cb15",
] # unknown
# when
self.indexer.run(sha1s, policy_update="update-dups")
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
self.assert_results_ok(sha1s, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
def setUp(self):
self.contents = sorted(OBJ_STORAGE_DATA)
def assert_results_ok(self, start, end, actual_results, expected_results=None):
if expected_results is None:
expected_results = self.expected_results
actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data["id"]
assert isinstance(_id, bytes)
indexed_data = indexed_data.copy()
indexed_data["id"] = hash_to_hex(indexed_data["id"])
self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
self.assertTrue(start <= _id <= end)
_tool_id = indexed_data["indexer_configuration_id"]
self.assertEqual(_tool_id, self.indexer.tool["id"])
def test__index_contents(self):
"""Indexing contents without existing data results in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(start, end, indexed={}))
self.assert_results_ok(start, end, actual_results)
def test__index_contents_with_indexed_data(self):
"""Indexing contents with existing data results in less indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
start, end, indexed=set(map(hash_to_bytes, data_indexed))
)
# craft the expected results
expected_results = self.expected_results.copy()
for already_indexed_key in data_indexed:
expected_results.pop(already_indexed_key)
self.assert_results_ok(start, end, actual_results, expected_results)
def test_generate_content_get(self):
"""Optimal indexing should result in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
# then
self.assertEqual(actual_results, {"status": "uneventful"})
def test_generate_content_get_input_as_bytes(self):
"""Optimal indexing should result in indexed data
Inputs are given as bytes here.
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end, skip_existing=False)
# no already indexed data so same result as prior test
# then
self.assertEqual(actual_results, {"status": "uneventful"})
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
_start, _end = [
"0000000000000000000000000000000000000000",
"0000000000000000000000000000000000000001",
]
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end, incremental=False)
# then
self.assertEqual(actual_results, {"status": "uneventful"})
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..8a1495e
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,40 @@
+[tox]
+envlist=black,flake8,mypy,py3
+
+[testenv]
+extras =
+ testing
+deps =
+ pytest-cov
+ swh-scheduler[testing] >= 0.5.0
+ swh-storage[testing] >= 0.10.0
+ dev: pdbpp
+commands =
+ pytest --doctest-modules \
+ !slow: --hypothesis-profile=fast \
+ slow: --hypothesis-profile=slow \
+ {envsitepackagesdir}/swh/indexer \
+ --cov={envsitepackagesdir}/swh/indexer \
+ --cov-branch {posargs}
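+# (editor's note) "slow:" / "!slow:" are tox factor conditions: e.g.
+# "tox -e py3-slow" selects the slow Hypothesis profile, while a plain
+# "py3" run uses the fast one.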
+
+[testenv:black]
+skip_install = true
+deps =
+ black
+commands =
+ {envpython} -m black --check swh
+
+[testenv:flake8]
+skip_install = true
+deps =
+ flake8
+commands =
+ {envpython} -m flake8
+
+[testenv:mypy]
+extras =
+ testing
+deps =
+ mypy
+commands =
+ mypy swh
diff --git a/version.txt b/version.txt
deleted file mode 100644
index a538b5a..0000000
--- a/version.txt
+++ /dev/null
@@ -1 +0,0 @@
-v0.1.0-0-ga8307fc
\ No newline at end of file