diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c279bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +*.pyc +*.sw? +*~ +/.coverage +/.coverage.* +.eggs/ +__pycache__ +*.egg-info/ +build/ +dist/ +version.txt +/sql/createdb-stamp +/sql/filldb-stamp +.tox/ +.hypothesis/ +.mypy_cache/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d1f84e3 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,40 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.4.0 + hooks: + - id: trailing-whitespace + - id: flake8 + - id: check-json + - id: check-yaml + +- repo: https://github.com/codespell-project/codespell + rev: v1.16.0 + hooks: + - id: codespell + exclude: ^(swh/indexer/data/codemeta/crosswalk.csv)$ +- repo: local + hooks: + - id: mypy + name: mypy + entry: mypy + args: [swh] + pass_filenames: false + language: system + types: [python] + +- repo: https://github.com/python/black + rev: 19.10b0 + hooks: + - id: black + +# unfortunately, we are far from being able to enable this... +# - repo: https://github.com/PyCQA/pydocstyle.git +# rev: 4.0.0 +# hooks: +# - id: pydocstyle +# name: pydocstyle +# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. +# entry: pydocstyle --convention=google +# language: python +# types: [python] + diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..27d038e --- /dev/null +++ b/AUTHORS @@ -0,0 +1,3 @@ +Copyright (C) 2015-2017 The Software Heritage developers + +See http://www.softwareheritage.org/ for more information. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..0ad22b5 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,78 @@ +# Software Heritage Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as Software +Heritage contributors and maintainers pledge to making participation in our +project and our community a harassment-free experience for everyone, regardless +of age, body size, disability, ethnicity, sex characteristics, gender identity +and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at `conduct@softwareheritage.org`. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an
+incident. Further details of specific enforcement policies may be posted
+separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..a1a7b45
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,2 @@
+Siddharth Ravikumar
+Thibault Allançon
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+ + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. 
+ + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. 
+ + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. 
+ + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. 
+ + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. 
(Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. 
+ + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. 
If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. 
Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/MANIFEST.in b/MANIFEST.in
index c6e3a9a..d5bc305 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,8 +1,9 @@
 include README.md
 include Makefile
 include requirements*.txt
 include version.txt
+include conftest.py
 recursive-include sql *
 recursive-include swh/indexer/sql *.sql
 recursive-include swh/indexer/data *
 recursive-include swh py.typed
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
index 0000000..c163514
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1 @@
+TESTFLAGS=--hypothesis-profile=fast
diff --git a/PKG-INFO b/PKG-INFO
index a2920a6..06fbd34 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,71 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 0.1.0
+Version: 0.1.1
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Description: swh-indexer
         ============

         Tools to compute multiple indexes on SWH's raw contents:
         - content:
           - mimetype
           - ctags
           - language
           - fossology-license
           - metadata
         - revision:
           - metadata

         An indexer is in charge of:
         - looking up objects
         - extracting information from those objects
         - storing that information in the swh-indexer db

         There are multiple indexers working on different object types:
           - content indexer: works with content sha1 hashes
           - revision indexer: works with revision sha1 hashes
           - origin indexer: works with origin identifiers

         Indexation procedure:
         - receive batch of ids
         - retrieve the associated data depending on object type
         - compute some index for that object
         - store the result in swh's storage

         Current content indexers:

         - mimetype (queue swh_indexer_content_mimetype): detect the encoding
           and mimetype

         - language (queue swh_indexer_content_language): detect the
           programming language

         - ctags (queue swh_indexer_content_ctags): compute tags information

         - fossology-license (queue swh_indexer_fossology_license): compute the
           license

         - metadata: translate files into a translated_metadata dict

         Current revision indexers:

         - metadata: detects files containing metadata and retrieves translated_metadata
           in the content_metadata table in storage, or runs the content indexer to
           translate files.
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/codemeta.json b/codemeta.json new file mode 100644 index 0000000..8eaf5cc --- /dev/null +++ b/codemeta.json @@ -0,0 +1,39 @@ +{ + "@context": "https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld", + "@type": "SoftwareSourceCode", + "identifier": "5682a72dc61f86ae69f2841c2184d6159c0b6d5d", + "description": "Software Heritage Indexer for revisions and contents", + "name": "swh-indexer", + "isPartOf": { + "@type": "SoftwareSourceCode", + "name": "swh-environment", + "identifier": "83e766feafde91242883be1bf369ed3e6865824f" + }, + "codeRepository": "https://forge.softwareheritage.org/diffusion/78/", + "issueTracker": "https://forge.softwareheritage.org/maniphest/", + "license": "https://spdx.org/licenses/GPL-3.0.html", + "version": "0.0.35", + "author": [ + { + "@type": "Organization", + "name": "Software Heritage", + "url": "https://www.softwareheritage.org", + "email": "swh-devel@inria.fr" + } + ], + "developmentStatus": "active", + "keywords": [ + "indexer", + "software", + "mimetype", + "ctags", + "language", + "fossology-license", + "metadata", + "metadata-detector", + "metadata-translator" + ], + "dateCreated":"2017-06-12", + "datePublished":"2017-06-12", + "programmingLanguage": "Python" +} diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..de31662 --- /dev/null +++ b/conftest.py @@ -0,0 +1,19 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from hypothesis import settings + +# define tests profile. 
Full documentation is at:
+# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
+settings.register_profile("fast", max_examples=5, deadline=5000)
+settings.register_profile("slow", max_examples=20, deadline=5000)
+
+# Ignore the following modules because the wsgi module fails as no
+# configuration file is found (--doctest-modules forces the module
+# loading)
+collect_ignore = ["swh/indexer/storage/api/wsgi.py"]
+
+# we use the swh_scheduler fixture
+pytest_plugins = ["swh.scheduler.pytest_plugin"]
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..58a761e
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,3 @@
+_build/
+apidoc/
+*-stamp
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..59d8b80
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,3 @@
+include ../../swh-docs/Makefile.sphinx
+-include Makefile.local
+
diff --git a/docs/Makefile.local b/docs/Makefile.local
new file mode 100644
index 0000000..cd07101
--- /dev/null
+++ b/docs/Makefile.local
@@ -0,0 +1,11 @@
+sphinx/html: images
+sphinx/clean: clean-images
+assets: images
+
+images:
+	make -C images/
+clean-images:
+	make -C images/ clean
+
+.PHONY: images clean-images
+
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..f4f2481
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,49 @@
+swh-indexer
+============
+
+Tools to compute multiple indexes on SWH's raw contents:
+- content:
+  - mimetype
+  - ctags
+  - language
+  - fossology-license
+  - metadata
+- revision:
+  - metadata
+
+An indexer is in charge of:
+- looking up objects
+- extracting information from those objects
+- storing that information in the swh-indexer db
+
+There are multiple indexers working on different object types:
+  - content indexer: works with content sha1 hashes
+  - revision indexer: works with revision sha1 hashes
+  - origin indexer: works with origin identifiers
+
+Indexation procedure:
+- receive batch of ids
+- retrieve the associated data depending on object type
+- compute some index for that object
+- store the result in swh's storage
+
+Current content indexers:
+
+- mimetype (queue swh_indexer_content_mimetype): detect the encoding
+  and mimetype
+
+- language (queue swh_indexer_content_language): detect the
+  programming language
+
+- ctags (queue swh_indexer_content_ctags): compute tags information
+
+- fossology-license (queue swh_indexer_fossology_license): compute the
+  license
+
+- metadata: translate files into a translated_metadata dict
+
+Current revision indexers:
+
+- metadata: detects files containing metadata and retrieves translated_metadata
+  in the content_metadata table in storage, or runs the content indexer to
+  translate files.
diff --git a/docs/_static/.placeholder b/docs/_static/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/docs/_templates/.placeholder b/docs/_templates/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..190deb7
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1 @@
+from swh.docs.sphinx.conf import *  # NoQA
diff --git a/docs/dev-info.rst b/docs/dev-info.rst
new file mode 100644
index 0000000..493b102
--- /dev/null
+++ b/docs/dev-info.rst
@@ -0,0 +1,206 @@
+Hacking on swh-indexer
+======================
+
+This tutorial will guide you through hacking on swh-indexer.
+If you do not have a local copy of the Software Heritage archive, go to the
+`getting started tutorial
+<https://docs.softwareheritage.org/devel/getting-started.html>`_
+
+Configuration files
+-------------------
+You will need the following YAML configuration files to run the swh-indexer
+commands:
+
+- Orchestrator at
+  ``~/.config/swh/indexer/orchestrator.yml``
+
+.. code-block:: yaml
+
+  indexers:
+    mimetype:
+      check_presence: false
+      batch_size: 100
+
+- Orchestrator-text at
+  ``~/.config/swh/indexer/orchestrator-text.yml``
+
+.. code-block:: yaml
+
+  indexers:
+    # language:
+    #   batch_size: 10
+    #   check_presence: false
+    fossology_license:
+      batch_size: 10
+      check_presence: false
+    # ctags:
+    #   batch_size: 2
+    #   check_presence: false
+
+- Mimetype indexer at
+  ``~/.config/swh/indexer/mimetype.yml``
+
+.. code-block:: yaml
+
+  # storage to read sha1's metadata (path)
+  # storage:
+  #   cls: local
+  #   args:
+  #     db: "service=swh-dev"
+  #     objstorage:
+  #       cls: pathslicing
+  #       args:
+  #         root: /home/storage/swh-storage/
+  #         slicing: 0:1/1:5
+
+  storage:
+    cls: remote
+    args:
+      url: http://localhost:5002/
+
+  indexer_storage:
+    cls: remote
+    args:
+      url: http://localhost:5007/
+
+  # storage to read sha1's content
+  # adapt this to your need
+  # locally: this needs to match your storage's setup
+  objstorage:
+    cls: pathslicing
+    args:
+      slicing: 0:1/1:5
+      root: /home/storage/swh-storage/
+
+  destination_task: swh.indexer.tasks.SWHOrchestratorTextContentsTask
+  rescheduling_task: swh.indexer.tasks.SWHContentMimetypeTask
+
+
+- Fossology indexer at
+  ``~/.config/swh/indexer/fossology_license.yml``
+
+.. code-block:: yaml
+
+  # storage to read sha1's metadata (path)
+  # storage:
+  #   cls: local
+  #   args:
+  #     db: "service=swh-dev"
+  #     objstorage:
+  #       cls: pathslicing
+  #       args:
+  #         root: /home/storage/swh-storage/
+  #         slicing: 0:1/1:5
+
+  storage:
+    cls: remote
+    url: http://localhost:5002/
+
+  indexer_storage:
+    cls: remote
+    args:
+      url: http://localhost:5007/
+
+  # storage to read sha1's content
+  # adapt this to your need
+  # locally: this needs to match your storage's setup
+  objstorage:
+    cls: pathslicing
+    args:
+      slicing: 0:1/1:5
+      root: /home/storage/swh-storage/
+
+  workdir: /tmp/swh/worker.indexer/license/
+
+  tools:
+    name: 'nomos'
+    version: '3.1.0rc2-31-ga2cbb8c'
+    configuration:
+      command_line: 'nomossa <filepath>'
+
+
+- Worker at
+  ``~/.config/swh/worker.yml``
+
+.. code-block:: yaml
+
+  task_broker: amqp://guest@localhost//
+  task_modules:
+    - swh.loader.svn.tasks
+    - swh.loader.tar.tasks
+    - swh.loader.git.tasks
+    - swh.storage.archiver.tasks
+    - swh.indexer.tasks
+    - swh.indexer.orchestrator
+  task_queues:
+    - swh_loader_svn
+    - swh_loader_tar
+    - swh_reader_git_to_azure_archive
+    - swh_storage_archive_worker_to_backend
+    - swh_indexer_orchestrator_content_all
+    - swh_indexer_orchestrator_content_text
+    - swh_indexer_content_mimetype
+    - swh_indexer_content_language
+    - swh_indexer_content_ctags
+    - swh_indexer_content_fossology_license
+    - swh_loader_svn_mount_and_load
+    - swh_loader_git_express
+    - swh_loader_git_archive
+    - swh_loader_svn_archive
+  task_soft_time_limit: 0
+
+
+Database
+--------
+
+swh-indexer uses a database to store the indexed content. The default
+db is expected to be called swh-indexer-dev.
+
+Create or add ``swh-dev`` and ``swh-indexer-dev`` to
+the ``~/.pg_service.conf`` and ``~/.pgpass`` files, which are PostgreSQL's
+configuration files.
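+
+For example, a minimal ``~/.pg_service.conf`` entry for the indexer database
+could look like the following sketch; the host, port and user values are
+placeholders to adapt to your local setup::
+
+  [swh-indexer-dev]
+  dbname=swh-indexer-dev
+  host=localhost
+  port=5432
+  user=postgres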
+
+Add data to local DB
+--------------------
+From within the ``swh-environment``, run the following command::
+
+  make rebuild-testdata
+
+and fetch some real data to work with, using::
+
+  python3 -m swh.loader.git.updater --origin-url <url>
+
+Then you can list all content files using this script::
+
+  #!/usr/bin/env bash
+
+  psql service=swh-dev -c "copy (select sha1 from content) to stdout" | sed -e 's/^\\x//g'
+
+Run the indexers
+-----------------
+Use the list of contents to feed the indexers with the
+following command::
+
+  ./list-sha1.sh | python3 -m swh.indexer.producer --batch 100 --task-name orchestrator_all
+
+Activate the workers
+--------------------
+To send messages to different queues using rabbitmq
+(which should already be installed through the dependencies installation),
+run the following command in a dedicated terminal::
+
+  python3 -m celery worker --app=swh.scheduler.celery_backend.config.app \
+                           --pool=prefork \
+                           --concurrency=1 \
+                           -Ofair \
+                           --loglevel=info \
+                           --without-gossip \
+                           --without-mingle \
+                           --without-heartbeat 2>&1
+
+With this command, rabbitmq will consume messages using the worker
+configuration file.
+
+Note: for the fossology_license indexer, you need the fossology-nomossa
+package, which is in our `public debian repository
+`_.
diff --git a/docs/images/.gitignore b/docs/images/.gitignore
new file mode 100644
index 0000000..d890b03
--- /dev/null
+++ b/docs/images/.gitignore
@@ -0,0 +1 @@
+tasks-metadata-indexers.svg
diff --git a/docs/images/Makefile b/docs/images/Makefile
new file mode 100644
index 0000000..3481956
--- /dev/null
+++ b/docs/images/Makefile
@@ -0,0 +1,11 @@
+
+UML_DIAGS_SRC = $(wildcard *.uml)
+UML_DIAGS = $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC))
+
+all: $(UML_DIAGS)
+
+%.svg: %.uml
+	DISPLAY="" plantuml -tsvg $<
+
+clean:
+	-rm -f $(DEP_GRAPHS) $(UML_DIAGS)
diff --git a/docs/images/tasks-metadata-indexers.uml b/docs/images/tasks-metadata-indexers.uml
new file mode 100644
index 0000000..954e079
--- /dev/null
+++ b/docs/images/tasks-metadata-indexers.uml
@@ -0,0 +1,84 @@
+@startuml
+  participant LOADERS as "Loaders"
+  participant JOURNAL as "Journal"
+  participant SCHEDULER as "Scheduler"
+  participant IDX_ORIG_HEAD as "Origin-Head Indexer"
+  participant IDX_REV_META as "Revision Metadata Indexer"
+  participant IDX_CONT_META as "Content Metadata Indexer"
+  participant IDX_ORIG_META as "Origin Metadata Indexer"
+  participant IDX_STORAGE as "Indexer Storage"
+  participant STORAGE as "Graph Storage"
+  participant OBJ_STORAGE as "Object Storage"
+
+  activate OBJ_STORAGE
+  activate IDX_STORAGE
+  activate STORAGE
+  activate JOURNAL
+  activate SCHEDULER
+
+  activate LOADERS
+
+  LOADERS->>JOURNAL: Origin 42 was added/revisited
+  deactivate LOADERS
+
+  JOURNAL->>SCHEDULER: run indexers on origin 42
+
+  SCHEDULER->>IDX_ORIG_HEAD: Find HEAD revision of 42
+  activate IDX_ORIG_HEAD
+
+  IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin=42)
+
+  STORAGE->>IDX_ORIG_HEAD: branches
+
+  IDX_ORIG_HEAD->>SCHEDULER: run Revision Metadata Indexer\non revision 42abcdef\n(head of origin 42)
+  deactivate IDX_ORIG_HEAD
+
+  SCHEDULER->>IDX_REV_META: Index revision 42abcdef\n(head of origin 42)
+  activate IDX_REV_META
+
+  IDX_REV_META->>STORAGE: revision_get(sha1=42abcdef)
+  STORAGE->>IDX_REV_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...}
+
+  IDX_REV_META->>STORAGE: directory_ls(sha1=456789ab)
+  STORAGE->>IDX_REV_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...]
+
+  IDX_REV_META->>IDX_REV_META: package.json is a metadata file
+
+  IDX_REV_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe)
+  IDX_STORAGE->>IDX_REV_META: none / {author: "Jane Doe", ...}
+
+  alt If the storage answered "none"
+    IDX_REV_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file
+    activate IDX_CONT_META
+
+    IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe
+
+    OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...'
+
+    IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author
+
+    IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...})
+    IDX_STORAGE->>IDX_CONT_META: ok
+
+    IDX_CONT_META->>IDX_REV_META: extracted: {author: "Jane Doe", ...}
+    deactivate IDX_CONT_META
+  end
+
+  IDX_REV_META->>IDX_STORAGE: revision_metadata_add(sha1=42abcdef, {author: "Jane Doe", ...})
+  IDX_STORAGE->>IDX_REV_META: ok
+
+  IDX_REV_META->>SCHEDULER: run Origin Metadata Indexer\non origin 42; the head is 42abcdef
+  deactivate IDX_REV_META
+
+  SCHEDULER->>IDX_ORIG_META: Index origin 42; the head is 42abcdef
+  activate IDX_ORIG_META
+
+  IDX_ORIG_META->>IDX_STORAGE: revision_metadata_get(sha1=42abcdef)
+  IDX_STORAGE->>IDX_ORIG_META: {author: "Jane Doe", ...}
+
+  IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id=42, {author: "Jane Doe", ...})
+  IDX_STORAGE->>IDX_ORIG_META: ok
+  deactivate IDX_ORIG_META
+
+
+@enduml
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..b80d6f4
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,25 @@
+.. _swh-indexer:
+
+Software Heritage - Indexer
+===========================
+
+Tools and workers used to mine the content of the archive and extract derived
+information from archive source code artifacts.
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents:
+
+   README.md
+   dev-info.rst
+   metadata-workflow.rst
+
+
+Reference Documentation
+-----------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   /apidoc/swh.indexer
diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst
new file mode 100644
index 0000000..471ce8c
--- /dev/null
+++ b/docs/metadata-workflow.rst
@@ -0,0 +1,208 @@
+Metadata workflow
+=================
+
+Intrinsic metadata
+------------------
+
+Indexing :term:`intrinsic metadata` requires extracting information from the
+lowest levels of the :ref:`Merkle DAG <swh-merkle-dag>` (directories, files,
+and content blobs) and associating it with the highest ones (origins).
+In order to deduplicate the work between origins, we split this work between
+multiple indexers, which coordinate with each other and save their results
+at each step in the indexer storage.
+
+Indexer architecture
+--------------------
+
+.. thumbnail:: images/tasks-metadata-indexers.svg
+
+
+Origin-Head Indexer
+___________________
+
+First, the Origin-Head indexer gets called externally, with an origin as
+argument (or multiple origins, which are handled sequentially).
+For now, its tasks are scheduled manually via recurring Scheduler tasks; but
+in the near future, the :term:`journal` will be used to do that.
+
+It first looks up the last :term:`snapshot` and determines what the main
+branch of the origin is (the "Head branch") and what revision it points to
+(the "Head").
+Intrinsic metadata for that origin will be extracted from that revision.
+
+It schedules a Revision Metadata Indexer task for that revision, with a
+hint that the revision is the Head of that particular origin.
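+
+The branch resolution itself is conceptually simple. Here is a minimal
+sketch of the idea, using plain dictionaries instead of the actual storage
+API and model objects:
+
+.. code-block:: python
+
+   def find_head_revision(branches):
+       """Return the revision id the HEAD branch points to, following
+       at most one level of alias indirection, or None."""
+       head = branches.get(b"HEAD")
+       if head and head["target_type"] == "alias":
+           # HEAD is usually an alias to a named branch
+           head = branches.get(head["target"])
+       if head and head["target_type"] == "revision":
+           return head["target"]
+       return None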
+
+
+Revision and Content Metadata Indexers
+______________________________________
+
+These two indexers do the hard part of the work. The Revision Metadata
+Indexer fetches the root directory associated with a revision, then extracts
+the metadata from that directory.
+
+To do so, it lists files in that directory, and looks for known names, such
+as `codemeta.json`, `package.json`, or `pom.xml`. If there are any, it
+runs the Content Metadata Indexer on them, which in turn fetches their
+contents and runs them through extraction dictionaries/mappings.
+See below for details.
+
+Their results are saved in a database (the indexer storage), associated with
+the content and revision hashes.
+
+If it received a hint that this revision is the head of an origin, the
+Revision Metadata Indexer then schedules the Origin Metadata Indexer
+to run on that origin.
+
+
+Origin Metadata Indexer
+_______________________
+
+The job of this indexer is very simple: it takes an origin identifier and
+a revision hash, and copies the metadata of the latter to a new table, to
+associate it with the former.
+
+The reason for this is to be able to perform searches on metadata, and
+efficiently find out which origins matched the pattern.
+Running that search on the `revision_intrinsic_metadata` table would require
+a reverse lookup from revisions to origins, which is costly.
+
+
+Translation from language-specific metadata to CodeMeta
+-------------------------------------------------------
+
+Intrinsic metadata are extracted from files provided with a project's source
+code, and translated using `CodeMeta`_'s `crosswalk table`_.
+
+All input formats supported so far are straightforward dictionaries (e.g.
+JSON) or can be accessed as such (e.g. XML); and the first part of the
+translation is to map their keys to a term in the CodeMeta vocabulary.
+This is done by parsing the crosswalk table's `CSV file`_ and using it as a
+map between these two vocabularies; this does not require any
+format-specific code in the indexers.
+
+The second part is to normalize values. As language-specific metadata files
+each have their way(s) of formatting these values, we need to turn them into
+the data type required by CodeMeta.
+This normalization makes up most of the code of
+:py:mod:`swh.indexer.metadata_dictionary`.
+
+.. _CodeMeta: https://codemeta.github.io/
+.. _crosswalk table: https://codemeta.github.io/crosswalk/
+.. _CSV file: https://github.com/codemeta/codemeta/blob/master/crosswalk.csv
+
+
+Supported intrinsic metadata
+----------------------------
+
+The following sources of intrinsic metadata are supported:
+
+* CodeMeta's `codemeta.json`_,
+* Maven's `pom.xml`_,
+* NPM's `package.json`_,
+* Python's `PKG-INFO`_,
+* Ruby's `.gemspec`_
+
+.. _codemeta.json: https://codemeta.github.io/terms/
+.. _pom.xml: https://maven.apache.org/pom.html
+.. _package.json: https://docs.npmjs.com/files/package.json
+.. _PKG-INFO: https://www.python.org/dev/peps/pep-0314/
+.. _.gemspec: https://guides.rubygems.org/specification-reference/
+
+
+Supported CodeMeta terms
+------------------------
+
+The following terms may be found in the output of the metadata translation
+(other than the `codemeta` mapping, which is the identity function, and
+therefore supports all terms):
+
+.. program-output:: python3 -m swh.indexer.cli mapping list-terms --exclude-mapping codemeta
+   :nostderr:
+
+
+Adding support for additional ecosystem-specific metadata
+---------------------------------------------------------
+
+This section will guide you through adding code to the metadata indexer to
+detect and translate new metadata formats.
+
+First, you should start by picking one of the `CodeMeta crosswalks`_.
+Then create a new file in `swh-indexer/swh/indexer/metadata_dictionary/` that
+will contain your code, and create a new class that inherits from helper
+classes, with some documentation about your indexer:
+
+.. code-block:: python
+
+   from .base import DictMapping, SingleFileMapping
+   from swh.indexer.codemeta import CROSSWALK_TABLE
+
+   class MyMapping(DictMapping, SingleFileMapping):
+       """Dedicated class for ..."""
+       name = 'my-mapping'
+       filename = b'the-filename'
+       mapping = CROSSWALK_TABLE['Name of the CodeMeta crosswalk']
+
+.. _CodeMeta crosswalks: https://github.com/codemeta/codemeta/tree/master/crosswalks
+
+Then, add a `string_fields` attribute, which is the list of all keys whose
+values are simple text values. For instance, to
+`translate Python PKG-INFO`_, it's:
+
+.. code-block:: python
+
+   string_fields = ['name', 'version', 'description', 'summary',
+                    'author', 'author-email']
+
+
+These values will be automatically added to the above list of
+supported terms.
+
+.. _translate Python PKG-INFO: https://forge.softwareheritage.org/source/swh-indexer/browse/master/swh/indexer/metadata_dictionary/python.py
+
+The last step to get your code working: add a `translate` method that takes
+a single byte string as argument, turns it into a Python dictionary whose
+keys are the ones of the input document, and passes that dictionary to
+`_translate_dict`.
+
+For instance, if the input document is in JSON, it can be as simple as:
+
+.. code-block:: python
+
+   def translate(self, raw_content):
+       raw_content = raw_content.decode()  # bytes to str
+       content_dict = json.loads(raw_content)  # str to dict
+       return self._translate_dict(content_dict)  # convert to CodeMeta
+
+`_translate_dict` will do the heavy work of reading the crosswalk table for
+each of `string_fields`, reading the corresponding value in the
+`content_dict`, and building a CodeMeta dictionary with the corresponding
+names from the crosswalk table.
+
+One last step before you can run your code: add it to the list in
+`swh-indexer/swh/indexer/metadata_dictionary/__init__.py`, so the rest of
+the code is aware of it.
+
+Now, you can run it:
+
+.. code-block:: shell
+
+   python3 -m swh.indexer.metadata_dictionary MyMapping path/to/input/file
+
+and it will (hopefully) return a CodeMeta object.
+
+If it works, well done!
+
+You can now improve your translation code further, by adding methods that
+do more advanced conversion. For example, if there is a field named
+`license` containing an SPDX identifier, you must convert it to a URI,
+like this:
+
+.. code-block:: python
+
+   def normalize_license(self, s):
+       if isinstance(s, str):
+           return {"@id": "https://spdx.org/licenses/" + s}
+
+This method will automatically get called by `_translate_dict` when it
+finds a `license` field in `content_dict`.
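+
+To check such a normalization quickly, you can exercise the mapping directly
+from a Python shell. A hypothetical session with the `MyMapping` class
+sketched above (the class name and input are illustrative) could look like:
+
+.. code-block:: python
+
+   import json
+
+   from swh.indexer.metadata_dictionary import MAPPINGS
+
+   mapping = MAPPINGS["MyMapping"]()  # the hypothetical class from above
+   raw = json.dumps({"name": "foo", "license": "MIT"}).encode()
+   print(mapping.translate(raw))
+   # Expected shape (the exact fields depend on your crosswalk):
+   # {'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+   #  'type': 'SoftwareSourceCode',
+   #  'name': 'foo',
+   #  'license': 'https://spdx.org/licenses/MIT'}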
diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..0df07a7 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,30 @@ +[mypy] +namespace_packages = True +warn_unused_ignores = True + + +# 3rd party libraries without stubs (yet) + +[mypy-celery.*] +ignore_missing_imports = True + +[mypy-confluent_kafka.*] +ignore_missing_imports = True + +[mypy-magic.*] +ignore_missing_imports = True + +[mypy-pkg_resources.*] +ignore_missing_imports = True + +[mypy-psycopg2.*] +ignore_missing_imports = True + +[mypy-pyld.*] +ignore_missing_imports = True + +[mypy-pytest.*] +ignore_missing_imports = True + +[mypy-xmltodict.*] +ignore_missing_imports = True diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..4b8d2d3 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +addopts = -p no:pytest_swh_scheduler +norecursedirs = docs diff --git a/requirements-swh.txt b/requirements-swh.txt index 32c8593..0363717 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,6 +1,6 @@ swh.core[db,http] >= 0.0.87 swh.model >= 0.0.15 swh.objstorage >= 0.0.43 swh.scheduler >= 0.0.47 -swh.storage >= 0.6.0 +swh.storage >= 0.8.0 swh.journal >= 0.1.0 diff --git a/requirements-test.txt b/requirements-test.txt index 68bb694..ac0c1f0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ confluent-kafka pytest -pytest-postgresql hypothesis>=3.11.0 -swh.storage>= 0.0.178 +swh.scheduler[testing] >= 0.5.0 +swh.storage[testing] >= 0.10.0 diff --git a/setup.py b/setup.py index 1f6fd99..b0c777c 100755 --- a/setup.py +++ b/setup.py @@ -1,73 +1,73 @@ #!/usr/bin/env python3 -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from setuptools import setup, find_packages from os import path from io import open here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.indexer", description="Software Heritage Content Indexer", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/78/", packages=find_packages(), scripts=[], install_requires=parse_requirements() + parse_requirements("swh"), - setup_requires=["vcversioner"], + setup_requires=["setuptools-scm"], + use_scm_version=True, extras_require={"testing": parse_requirements("test")}, - vcversioner={}, include_package_data=True, entry_points=""" [console_scripts] swh-indexer=swh.indexer.cli:main [swh.cli.subcommands] indexer=swh.indexer.cli:cli """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": 
"https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-indexer", "Documentation": "https://docs.softwareheritage.org/devel/swh-indexer/", }, ) diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index a2920a6..06fbd34 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.1.0 +Version: 0.1.1 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 3f25a51..1dc3047 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,108 +1,133 @@ +.gitignore +.pre-commit-config.yaml +AUTHORS +CODE_OF_CONDUCT.md +CONTRIBUTORS +LICENSE MANIFEST.in Makefile +Makefile.local README.md +codemeta.json +conftest.py +mypy.ini pyproject.toml +pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py -version.txt +tox.ini +docs/.gitignore +docs/Makefile +docs/Makefile.local +docs/README.md +docs/conf.py +docs/dev-info.rst +docs/index.rst +docs/metadata-workflow.rst +docs/_static/.placeholder +docs/_templates/.placeholder +docs/images/.gitignore +docs/images/Makefile +docs/images/tasks-metadata-indexers.uml sql/bin/db-upgrade sql/bin/dot_add_content +sql/doc/json sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/indexer_configuration.tool_configuration.schema.json sql/doc/json/revision_metadata.translated_metadata.json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json sql/upgrades/115.sql sql/upgrades/116.sql sql/upgrades/117.sql sql/upgrades/118.sql sql/upgrades/119.sql sql/upgrades/120.sql sql/upgrades/121.sql sql/upgrades/122.sql sql/upgrades/123.sql sql/upgrades/124.sql sql/upgrades/125.sql sql/upgrades/126.sql sql/upgrades/127.sql sql/upgrades/128.sql sql/upgrades/129.sql sql/upgrades/130.sql sql/upgrades/131.sql sql/upgrades/132.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/entry_points.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/cli.py swh/indexer/codemeta.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/journal_client.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/mimetype.py swh/indexer/origin_head.py swh/indexer/py.typed swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/data/codemeta/CITATION swh/indexer/data/codemeta/LICENSE swh/indexer/data/codemeta/codemeta.jsonld swh/indexer/data/codemeta/crosswalk.csv swh/indexer/metadata_dictionary/__init__.py swh/indexer/metadata_dictionary/base.py swh/indexer/metadata_dictionary/codemeta.py swh/indexer/metadata_dictionary/maven.py swh/indexer/metadata_dictionary/npm.py swh/indexer/metadata_dictionary/python.py swh/indexer/metadata_dictionary/ruby.py swh/indexer/sql/10-swh-init.sql swh/indexer/sql/20-swh-enums.sql swh/indexer/sql/30-swh-schema.sql swh/indexer/sql/40-swh-func.sql swh/indexer/sql/50-swh-data.sql swh/indexer/sql/60-swh-indexes.sql swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/exc.py swh/indexer/storage/in_memory.py swh/indexer/storage/interface.py swh/indexer/storage/metrics.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/tests/__init__.py swh/indexer/tests/conftest.py swh/indexer/tests/tasks.py 
swh/indexer/tests/test_cli.py swh/indexer/tests/test_codemeta.py swh/indexer/tests/test_ctags.py swh/indexer/tests/test_fossology_license.py swh/indexer/tests/test_journal_client.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_origin_head.py swh/indexer/tests/test_origin_metadata.py swh/indexer/tests/utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/conftest.py swh/indexer/tests/storage/generate_data_test.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_in_memory.py swh/indexer/tests/storage/test_metrics.py swh/indexer/tests/storage/test_server.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index 0d7adeb..69ab181 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,18 +1,18 @@ vcversioner click python-magic>=0.4.13 pyld xmltodict swh.core[db,http]>=0.0.87 swh.model>=0.0.15 swh.objstorage>=0.0.43 swh.scheduler>=0.0.47 -swh.storage>=0.6.0 +swh.storage>=0.8.0 swh.journal>=0.1.0 [testing] confluent-kafka pytest -pytest-postgresql hypothesis>=3.11.0 -swh.storage>=0.0.178 +swh.scheduler[testing]>=0.5.0 +swh.storage[testing]>=0.10.0 diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 0fdb0db..0f28355 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,383 +1,381 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from typing import Any, Callable, Dict, Iterator, List, Tuple from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil REVISION_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 def call_with_batches( f: Callable[[List[Dict[str, Any]]], Dict["str", Any]], args: List[Dict[str, str]], batch_size: int, ) -> Iterator[str]: """Calls a function with batches of args, and concatenates the results. """ groups = grouper(args, batch_size) for group in groups: yield from f(list(group)) class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing( ({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) ) def index(self, id, data, log_suffix="unknown revision"): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. 
If the translation wasn't successful, the metadata keys will be returned as None """ result = { "id": id, "indexer_configuration_id": self.tool["id"], "metadata": None, } try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) result["metadata"] = MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) if result["metadata"] is None: return None return result def persist_index_computations( self, results: List[Dict], policy_update: str ) -> Dict[str, int]: """Persist the results in storage. Args: results: list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - metadata (jsonb): detected metadata policy_update: either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ return self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == "update-dups") ) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_intrinsic_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ ADDITIONAL_CONFIG = { "tools": ( "dict", {"name": "swh-metadata-detector", "version": "0.0.2", "configuration": {},}, ), } def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_intrinsic_metadata_missing( ( {"id": sha1_git, "indexer_configuration_id": self.tool["id"],} for sha1_git in sha1_gits ) ) def index(self, rev): """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple files detected -> translation needed at revision level Args: rev (dict): revision artifact from storage Returns: dict: dictionary representing a revision_intrinsic_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ result = { "id": rev["id"], "indexer_configuration_id": self.tool["id"], "mappings": None, "metadata": None, } try: root_dir = rev["directory"] dir_ls = list(self.storage.directory_ls(root_dir, recursive=False)) if [entry["type"] for entry in dir_ls] == ["dir"]: # If the root is just a single directory, recurse into it # eg. PyPI packages, GNU tarballs subdir = dir_ls[0]["target"] dir_ls = self.storage.directory_ls(subdir, recursive=False) files = [entry for entry in dir_ls if entry["type"] == "file"] detected_files = detect_metadata(files) (mappings, metadata) = self.translate_revision_intrinsic_metadata( detected_files, log_suffix="revision=%s" % hashutil.hash_to_hex(rev["id"]), ) result["mappings"] = mappings result["metadata"] = metadata except Exception as e: self.log.exception("Problem when indexing rev: %r", e) return result def persist_index_computations( self, results: List[Dict], policy_update: str ) -> Dict[str, int]: """Persist the results in storage.
Args: results: list of revision_intrinsic_metadata, dict with the following keys: - id (bytes): revision's identifier (sha1_git) - mappings ([str]): list of mappings used - metadata (jsonb): detected metadata policy_update: either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in # revision_intrinsic_metadata return self.idx_storage.revision_intrinsic_metadata_add( results, conflict_update=(policy_update == "update-dups") ) def translate_revision_intrinsic_metadata( self, detected_files: Dict[str, List[Any]], log_suffix: str ) -> Tuple[List[Any], List[Any]]: """ Determine the plan of action for translating metadata, given one or multiple detected files: Args: detected_files: dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ used_mappings = [MAPPINGS[context].name for context in detected_files] metadata = [] tool = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {}, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]} config["tools"] = [tool] for context in detected_files.keys(): cfg = deepcopy(config) cfg["tools"][0]["configuration"]["context"] = context c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context] ) for c in metadata_generator: # extracting metadata sha1 = c["id"] sha1s_in_storage.append(sha1) local_metadata = c["metadata"] # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [ item for item in detected_files[context] if item not in sha1s_in_storage ] if sha1s_filtered: # content indexing try: c_metadata_indexer.run( sha1s_filtered, policy_update="ignore-dups", log_suffix=log_suffix, ) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result["metadata"] metadata.append(local_metadata) except Exception: self.log.exception("Exception while indexing metadata on contents") metadata = merge_documents(metadata) return (used_mappings, metadata) class OriginMetadataIndexer(OriginIndexer): ADDITIONAL_CONFIG = RevisionMetadataIndexer.ADDITIONAL_CONFIG USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) def index_list(self, origin_urls, **kwargs): head_rev_ids = [] origins_with_head = [] origins = list( call_with_batches( - self.storage.origin_get, - [{"url": url} for url in origin_urls], - ORIGIN_GET_BATCH_SIZE, + self.storage.origin_get, origin_urls, ORIGIN_GET_BATCH_SIZE, ) ) for origin in origins: if origin is None: continue - head_result = self.origin_head_indexer.index(origin["url"]) + head_result = self.origin_head_indexer.index(origin.url) if head_result: origins_with_head.append(origin) head_rev_ids.append(head_result["revision_id"]) head_revs = list( call_with_batches( self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE ) ) assert len(head_revs) == len(head_rev_ids) results = [] for (origin, rev) in zip(origins_with_head, head_revs): if not rev: - self.log.warning("Missing head revision of
origin %r", origin["url"]) + self.log.warning("Missing head revision of origin %r", origin.url) continue rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { "from_revision": rev_metadata["id"], - "id": origin["url"], + "id": origin.url, "metadata": rev_metadata["metadata"], "mappings": rev_metadata["mappings"], "indexer_configuration_id": rev_metadata["indexer_configuration_id"], } results.append((orig_metadata, rev_metadata)) return results def persist_index_computations( self, results: List[Dict], policy_update: str ) -> Dict[str, int]: conflict_update = policy_update == "update-dups" # Deduplicate revisions rev_metadata: List[Any] = [] orig_metadata: List[Any] = [] revs_to_delete: List[Any] = [] origs_to_delete: List[Any] = [] summary: Dict = {} for (orig_item, rev_item) in results: assert rev_item["metadata"] == orig_item["metadata"] if not rev_item["metadata"] or rev_item["metadata"].keys() <= {"@context"}: # If we didn't find any metadata, don't store a DB record # (and delete existing ones, if any) if rev_item not in revs_to_delete: revs_to_delete.append(rev_item) if orig_item not in origs_to_delete: origs_to_delete.append(orig_item) else: if rev_item not in rev_metadata: rev_metadata.append(rev_item) if orig_item not in orig_metadata: orig_metadata.append(orig_item) if rev_metadata: summary_rev = self.idx_storage.revision_intrinsic_metadata_add( rev_metadata, conflict_update=conflict_update ) summary.update(summary_rev) if orig_metadata: summary_ori = self.idx_storage.origin_intrinsic_metadata_add( orig_metadata, conflict_update=conflict_update ) summary.update(summary_ori) # revs_to_delete should always be empty unless we changed a mapping # to detect less files or less content. # However, origs_to_delete may be empty whenever an upstream deletes # a metadata file. 
if origs_to_delete: summary_ori = self.idx_storage.origin_intrinsic_metadata_delete( origs_to_delete ) summary.update(summary_ori) if revs_to_delete: summary_rev = self.idx_storage.revision_intrinsic_metadata_delete( revs_to_delete ) summary.update(summary_rev) return summary diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py index fb25abd..1ba1528 100644 --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -1,86 +1,74 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta from unittest.mock import patch import pytest from swh.objstorage import get_objstorage -from swh.scheduler.tests.conftest import * # noqa from swh.storage import get_storage from swh.indexer.storage import get_indexer_storage from .utils import fill_storage, fill_obj_storage TASK_NAMES = ["revision_intrinsic_metadata", "origin_intrinsic_metadata"] -storage_config = {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]} - - @pytest.fixture def indexer_scheduler(swh_scheduler): for taskname in TASK_NAMES: swh_scheduler.create_task_type( { "type": taskname, "description": "The {} indexer testing task".format(taskname), "backend_name": "swh.indexer.tests.tasks.{}".format(taskname), "default_interval": timedelta(days=1), "min_interval": timedelta(hours=6), "max_interval": timedelta(days=12), "num_retries": 3, } ) return swh_scheduler @pytest.fixture def idx_storage(): """An instance of in-memory indexer storage that gets injected into all indexers classes. """ idx_storage = get_indexer_storage("memory", {}) with patch("swh.indexer.storage.in_memory.IndexerStorage") as idx_storage_mock: idx_storage_mock.return_value = idx_storage yield idx_storage @pytest.fixture def storage(): """An instance of in-memory storage that gets injected into all indexers classes. """ - storage = get_storage(**storage_config) + storage = get_storage(cls="memory") fill_storage(storage) with patch("swh.storage.in_memory.InMemoryStorage") as storage_mock: storage_mock.return_value = storage yield storage @pytest.fixture def obj_storage(): """An instance of in-memory objstorage that gets injected into all indexers classes. """ objstorage = get_objstorage("memory", {}) fill_obj_storage(objstorage) with patch.dict( "swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage} ): yield objstorage - - -@pytest.fixture(scope="session") # type: ignore # expected redefinition -def celery_includes(): - return [ - "swh.indexer.tests.tasks", - "swh.indexer.tasks", - ] diff --git a/swh/indexer/tests/storage/conftest.py b/swh/indexer/tests/storage/conftest.py index e2df26c..a67b2dc 100644 --- a/swh/indexer/tests/storage/conftest.py +++ b/swh/indexer/tests/storage/conftest.py @@ -1,73 +1,73 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os.path import join import pytest from . 
import SQL_DIR -from swh.storage.tests.conftest import postgresql_fact +from swh.storage.pytest_plugin import postgresql_fact from swh.indexer.storage import get_indexer_storage from swh.model.hashutil import hash_to_bytes from .generate_data_test import MIMETYPE_OBJECTS, FOSSOLOGY_LICENSES, TOOLS DUMP_FILES = join(SQL_DIR, "*.sql") class DataObj(dict): def __getattr__(self, key): return self.__getitem__(key) def __setattr__(self, key, value): return self.__setitem__(key, value) @pytest.fixture def swh_indexer_storage_with_data(swh_indexer_storage): data = DataObj() tools = { tool["tool_name"]: { "id": tool["id"], "name": tool["tool_name"], "version": tool["tool_version"], "configuration": tool["tool_configuration"], } for tool in swh_indexer_storage.indexer_configuration_add(TOOLS) } data.tools = tools data.sha1_1 = hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689") data.sha1_2 = hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7") data.revision_id_1 = hash_to_bytes("7026b7c1a2af56521e951c01ed20f255fa054238") data.revision_id_2 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904321") data.revision_id_3 = hash_to_bytes("7026b7c1a2af56521e9587659012345678904320") data.origin_url_1 = "file:///dev/0/zero" # 44434341 data.origin_url_2 = "file:///dev/1/one" # 44434342 data.origin_url_3 = "file:///dev/2/two" # 54974445 data.mimetypes = [ {**mimetype_obj, "indexer_configuration_id": tools["file"]["id"]} for mimetype_obj in MIMETYPE_OBJECTS ] swh_indexer_storage.content_mimetype_add(data.mimetypes) data.fossology_licenses = [ {**fossology_obj, "indexer_configuration_id": tools["nomos"]["id"]} for fossology_obj in FOSSOLOGY_LICENSES ] swh_indexer_storage._test_data = data return (swh_indexer_storage, data) swh_indexer_storage_postgresql = postgresql_fact( "postgresql_proc", dump_files=DUMP_FILES ) @pytest.fixture def swh_indexer_storage(swh_indexer_storage_postgresql): storage_config = { "cls": "local", "args": {"db": swh_indexer_storage_postgresql.dsn,}, } return get_indexer_storage(**storage_config) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index c3ef250..7abb4ed 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,1210 +1,1205 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import unittest -import attr - from hypothesis import given, strategies, settings, HealthCheck from swh.model.hashutil import hash_to_bytes +from swh.model.model import Directory, DirectoryEntry, Revision from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_dictionary.maven import MavenMapping from swh.indexer.metadata_dictionary.npm import NpmMapping from swh.indexer.metadata_dictionary.ruby import GemspecMapping from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer +from swh.indexer.tests.utils import REVISION, DIRECTORY2 + from .utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage, YARN_PARSER_METADATA, json_document_strategy, xml_document_strategy, ) TRANSLATOR_TOOL = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {"type": "local", "context": "NpmMapping"}, 
} class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, "should not be called; the rev indexer configures it." REVISION_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.npm_mapping = MAPPINGS["NpmMapping"]() self.codemeta_mapping = MAPPINGS["CodemetaMapping"]() self.maven_mapping = MAPPINGS["MavenMapping"]() self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]() self.gemspec_mapping = MAPPINGS["GemspecMapping"]() def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "codeRepository": "git+https://github.com/moranegg/metadata_test", "author": [ {"type": "Person", "name": "Morane G", "email": "moranegg@example.com",} ], } # when result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config["tools"] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update="ignore-dups") results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) expected_results = [ { "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "codeRepository": "git+https://github.com/moranegg/metadata_test", "description": "Simple package.json test for indexer", "name": "test_metadata", "version": "0.0.1", }, "id": hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), }, { "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "issueTracker": "https://github.com/npm/npm/issues", "author": [ { "type": "Person", "name": "Isaac Z. 
Schlueter", "email": "i@izs.me", "url": "http://blog.izs.me", } ], "codeRepository": "git+https://github.com/npm/npm", "description": "a package manager for JavaScript", "license": "https://spdx.org/licenses/Artistic-2.0", "version": "5.0.3", "name": "npm", "keywords": [ "install", "modules", "package manager", "package.json", ], "url": "https://docs.npmjs.com/", }, "id": hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), }, ] for result in results: del result["tool"] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) def test_npm_bugs_normalization(self): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", }, ) def test_npm_repository_normalization(self): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", }, ) # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", }, ) # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = self.npm_mapping.translate(package_json) expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } self.assertEqual(result, expected_result) # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = self.npm_mapping.translate(package_json) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://gitlab.com/user/repo.git", "type": "SoftwareSourceCode", }, ) def test_detect_metadata_package_json(self): # given df = [ { "sha1_git": b"abc", "name": b"index.js", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", "name": b"package.json", "target": b"aab", "length": 712, "status": 
"visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"cde", }, ] # when results = detect_metadata(df) expected_results = {"NpmMapping": [b"cde"]} # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738", }, ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X", }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation", }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": ["metadata", "software"], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def 
test_compute_metadata_codemeta_alternate_context(self): raw_content = b"""{ "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", "@type": "SoftwareSourceCode", "identifier": "CodeMeta" }""" # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", } result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", "codeRepository": ( "http://repo1.maven.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_almost_empty(self): raw_content = b""" """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_compute_metadata_maven_invalid_xml(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error parsing XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_unknown_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error detecting XML encoding from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_invalid_encoding(self): expected_warning = ( "WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:" "Error unidecoding XML from foo" ) raw_content = b""" """ with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm: result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ 
result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_empty_nodes(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) raw_content = b""" 1.2.3 """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "version": "1.2.3", }, ) def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 foo """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), }, ) def test_compute_metadata_maven_multiple(self): """Tests when there are multiple code repos and licenses.""" raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = self.maven_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "identifier": "com.mycompany.app", "version": "1.2.3", "license": [ 
"https://www.apache.org/licenses/LICENSE-2.0.txt", "https://opensource.org/licenses/MIT", ], "codeRepository": [ "http://repo1.maven.org/maven2/com/mycompany/app/my-app", "http://example.org/maven2/com/mycompany/app/my-app", ], }, ) def test_compute_metadata_pkginfo(self): raw_content = b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== \x20 core library for swh's modules: - config parser - hash computations - serialization - logging mechanism \x20 Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertCountEqual( result["description"], [ "Software Heritage core utilities", # note the comma here "swh-core\n" "========\n" "\n" "core library for swh's modules:\n" "- config parser\n" "- hash computations\n" "- serialization\n" "- logging mechanism\n" "", ], result, ) del result["description"] self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "url": "https://forge.softwareheritage.org/diffusion/DCORE/", "name": "swh.core", "author": [ { "type": "Person", "name": "Software Heritage developers", "email": "swh-devel@inria.fr", } ], "version": "0.0.49", }, ) def test_compute_metadata_pkginfo_utf8(self): raw_content = b"""\ Metadata-Version: 1.1 Name: snowpyt Description-Content-Type: UNKNOWN Description: foo Hydrology N\xc2\xb083 """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "snowpyt", "description": "foo\nHydrology N°83", }, ) def test_compute_metadata_pkginfo_keywords(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo Keywords: foo bar baz """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "foo", "keywords": ["foo", "bar", "baz"], }, ) def test_compute_metadata_pkginfo_license(self): raw_content = b"""\ Metadata-Version: 2.1 Name: foo License: MIT """ # noqa result = self.pkginfo_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "foo", "license": "MIT", }, ) def test_gemspec_base(self): raw_content = b""" Gem::Specification.new do |s| s.name = 'example' s.version = '0.1.0' s.licenses = ['MIT'] s.summary = "This is an example!" s.description = "Much longer explanation of the example!" 
s.authors = ["Ruby Coder"] s.email = 'rubycoder@example.com' s.files = ["lib/example.rb"] s.homepage = 'https://rubygems.org/gems/example' s.metadata = { "source_code_uri" => "https://github.com/example/example" } end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual( result.pop("description"), ["This is an example!", "Much longer explanation of the example!"], ) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"type": "Person", "name": "Ruby Coder"}], "name": "example", "license": "https://spdx.org/licenses/MIT", "codeRepository": "https://rubygems.org/gems/example", "email": "rubycoder@example.com", "version": "0.1.0", }, ) def test_gemspec_two_author_fields(self): raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1"] s.author = "Ruby Coder2" end""" result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual( result.pop("author"), [ {"type": "Person", "name": "Ruby Coder1"}, {"type": "Person", "name": "Ruby Coder2"}, ], ) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) def test_gemspec_invalid_author(self): raw_content = b""" Gem::Specification.new do |s| s.author = ["Ruby Coder"] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) raw_content = b""" Gem::Specification.new do |s| s.author = "Ruby Coder1", end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", }, ) raw_content = b""" Gem::Specification.new do |s| s.authors = ["Ruby Coder1", ["Ruby Coder2"]] end""" result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "author": [{"type": "Person", "name": "Ruby Coder1"}], }, ) def test_gemspec_alternative_header(self): raw_content = b""" require './lib/version' Gem::Specification.new { |s| s.name = 'rb-system-with-aliases' s.summary = 'execute system commands with aliases' } """ result = self.gemspec_mapping.translate(raw_content) self.assertEqual( result, { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "rb-system-with-aliases", "description": "execute system commands with aliases", }, ) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=list(NpmMapping.mapping))) def test_npm_adversarial(self, doc): raw = json.dumps(doc).encode() self.npm_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=CODEMETA_TERMS)) def test_codemeta_adversarial(self, doc): raw = json.dumps(doc).encode() self.codemeta_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) @given( xml_document_strategy( keys=list(MavenMapping.mapping), root="project", xmlns="http://maven.apache.org/POM/4.0.0", ) ) def test_maven_adversarial(self, doc): self.maven_mapping.translate(doc) @settings(suppress_health_check=[HealthCheck.too_slow]) @given( strategies.dictionaries( # keys strategies.one_of( strategies.text(), *map(strategies.just, GemspecMapping.mapping) ), # values strategies.recursive( strategies.characters(), lambda children: strategies.lists(children, min_size=1), ), ) ) 
def test_gemspec_adversarial(self, doc): parts = [b"Gem::Specification.new do |s|\n"] for (k, v) in doc.items(): parts.append(" s.{} = {}\n".format(k, repr(v)).encode()) parts.append(b"end\n") self.gemspec_mapping.translate(b"".join(parts)) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( - {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()} + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None + rev = REVISION + assert rev.directory == DIRECTORY2.id metadata_indexer.idx_storage.content_metadata_add( [ { "indexer_configuration_id": tool["id"], - "id": b"cde", + "id": DIRECTORY2.entries[0].target, "metadata": YARN_PARSER_METADATA, } ] ) - sha1_gits = [ - hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - ] - metadata_indexer.run(sha1_gits, "update-dups") + metadata_indexer.run([rev.id], "update-dups") results = list( - metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits) + metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id]) ) expected_results = [ { - "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), + "id": rev.id, "tool": TRANSLATOR_TOOL, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } ] for result in results: del result["tool"]["id"] # then - self.assertEqual(expected_results, results) + self.assertEqual(results, expected_results) def test_revision_metadata_indexer_single_root_dir(self): metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the revision - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") - rev = metadata_indexer.storage._revisions[rev_id] - subdir_id = rev.directory - rev = attr.evolve(rev, directory=b"123456") - metadata_indexer.storage.directory_add( - [ - { - "id": b"123456", - "entries": [ - { - "name": b"foobar-1.0.0", - "type": "dir", - "target": subdir_id, - "perms": 16384, - } - ], - } - ] + rev = REVISION + assert rev.directory == DIRECTORY2.id + + directory = Directory( + entries=( + DirectoryEntry( + name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384, + ), + ), ) + assert directory.id is not None + metadata_indexer.storage.directory_add([directory]) + + new_rev_dict = {**rev.to_dict(), "directory": directory.id} + new_rev_dict.pop("id") + new_rev = Revision.from_dict(new_rev_dict) + metadata_indexer.storage.revision_add([new_rev]) tool = metadata_indexer.idx_storage.indexer_configuration_get( - {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()} + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None metadata_indexer.idx_storage.content_metadata_add( [ { "indexer_configuration_id": tool["id"], - "id": b"cde", + "id": DIRECTORY2.entries[0].target, "metadata": YARN_PARSER_METADATA, } ] ) - sha1_gits = [ - hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - ] - metadata_indexer.run(sha1_gits, "update-dups") + metadata_indexer.run([new_rev.id], "update-dups") results = list( - metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits) + metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id]) ) expected_results = [ { - "id": 
hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), + "id": new_rev.id, "tool": TRANSLATOR_TOOL, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } ] for result in results: del result["tool"]["id"] # then - self.assertEqual(expected_results, results) + self.assertEqual(results, expected_results) diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index a5ed93c..c137dd0 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -1,199 +1,170 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from datetime import datetime, timezone from swh.model.model import OriginVisit, OriginVisitStatus from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.tests.utils import BASE_TEST_CONFIG, fill_storage from swh.storage.utils import now +from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType + ORIGIN_HEAD_CONFIG = { **BASE_TEST_CONFIG, "tools": {"name": "origin-metadata", "version": "0.0.1", "configuration": {},}, "tasks": {"revision_intrinsic_metadata": None, "origin_intrinsic_metadata": None,}, } class OriginHeadTestIndexer(OriginHeadIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): return ORIGIN_HEAD_CONFIG def persist_index_computations(self, results, policy_update): self.results = results class OriginHead(unittest.TestCase): def setUp(self): self.indexer = OriginHeadTestIndexer() self.indexer.catch_exceptions = False fill_storage(self.indexer.storage) def test_git(self): - self.indexer.run(["https://github.com/SoftwareHeritage/swh-storage"]) + origin_url = "https://github.com/SoftwareHeritage/swh-storage" + self.indexer.run([origin_url]) + rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{" - b"\xd7}\xac\xefrm", - "origin_url": "https://github.com/SoftwareHeritage/swh-storage", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_git_partial_snapshot(self): """Checks partial snapshots are ignored.""" origin_url = "https://github.com/SoftwareHeritage/swh-core" - self.indexer.storage.origin_add_one( - {"url": origin_url,} - ) + self.indexer.storage.origin_add([Origin(url=origin_url)]) visit = self.indexer.storage.origin_visit_add( [ OriginVisit( origin=origin_url, date=datetime(2019, 2, 27, tzinfo=timezone.utc), type="git", - status="ongoing", - snapshot=None, ) ] )[0] self.indexer.storage.snapshot_add( [ - { - "id": b"foo", - "branches": { + Snapshot( + branches={ b"foo": None, - b"HEAD": {"target_type": "alias", "target": b"foo",}, + b"HEAD": SnapshotBranch( + target_type=TargetType.ALIAS, target=b"foo", + ), }, - } + ), ] ) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=now(), status="partial", snapshot=b"foo", ) self.indexer.storage.origin_visit_status_add([visit_status]) self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_vcs_missing_snapshot(self): - self.indexer.storage.origin_add( - [{"url": "https://github.com/SoftwareHeritage/swh-indexer",}] - ) - self.indexer.run(["https://github.com/SoftwareHeritage/swh-indexer"]) + origin_url = 
"https://github.com/SoftwareHeritage/swh-indexer" + self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_pypi_missing_branch(self): origin_url = "https://pypi.org/project/abcdef/" - self.indexer.storage.origin_add_one( - {"url": origin_url,} - ) + self.indexer.storage.origin_add([Origin(url=origin_url,)]) visit = self.indexer.storage.origin_visit_add( [ OriginVisit( origin=origin_url, date=datetime(2019, 2, 27, tzinfo=timezone.utc), type="pypi", - status="ongoing", - snapshot=None, ) ] )[0] self.indexer.storage.snapshot_add( [ - { - "id": b"foo", - "branches": { + Snapshot( + branches={ b"foo": None, - b"HEAD": {"target_type": "alias", "target": b"foo",}, + b"HEAD": SnapshotBranch( + target_type=TargetType.ALIAS, target=b"foo", + ), }, - } + ) ] ) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=now(), status="full", snapshot=b"foo", ) self.indexer.storage.origin_visit_status_add([visit_status]) self.indexer.run(["https://pypi.org/project/abcdef/"]) self.assertEqual(self.indexer.results, []) def test_ftp(self): - self.indexer.run(["rsync://ftp.gnu.org/gnu/3dldf"]) + origin_url = "rsync://ftp.gnu.org/gnu/3dldf" + self.indexer.run([origin_url]) + rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee" - b"\xcc\x1a\xb4`\x8c\x8by", - "origin_url": "rsync://ftp.gnu.org/gnu/3dldf", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_ftp_missing_snapshot(self): - self.indexer.storage.origin_add([{"url": "rsync://ftp.gnu.org/gnu/foobar",}]) - self.indexer.run(["rsync://ftp.gnu.org/gnu/foobar"]) + origin_url = "rsync://ftp.gnu.org/gnu/foobar" + self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_deposit(self): - self.indexer.run(["https://forge.softwareheritage.org/source/jesuisgpl/"]) + origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/" + self.indexer.storage.origin_add([Origin(url=origin_url)]) + self.indexer.run([origin_url]) + rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{" - b"\xa6\xe9\x99\xb1\x9e]q\xeb", - "origin_url": "https://forge.softwareheritage.org/source/" - "jesuisgpl/", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) def test_deposit_missing_snapshot(self): - self.indexer.storage.origin_add( - [{"url": "https://forge.softwareheritage.org/source/foobar",}] - ) - self.indexer.run(["https://forge.softwareheritage.org/source/foobar"]) + origin_url = "https://forge.softwareheritage.org/source/foobar" + self.indexer.storage.origin_add([Origin(url=origin_url,)]) + self.indexer.run([origin_url]) self.assertEqual(self.indexer.results, []) def test_pypi(self): - self.indexer.run(["https://pypi.org/project/limnoria/"]) + origin_url = "https://pypi.org/project/limnoria/" + self.indexer.run([origin_url]) + + rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k" - b"A\x10\x9d\xc5\xfa2\xf8t", - "origin_url": "https://pypi.org/project/limnoria/", - } - ], + self.indexer.results, [{"revision_id": 
rev_id, "origin_url": origin_url}], ) def test_svn(self): - self.indexer.run(["http://0-512-md.googlecode.com/svn/"]) + origin_url = "http://0-512-md.googlecode.com/svn/" + self.indexer.run([origin_url]) + rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18" self.assertEqual( - self.indexer.results, - [ - { - "revision_id": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8" - b"\xc9\xad#.\x1bw=\x18", - "origin_url": "http://0-512-md.googlecode.com/svn/", - } - ], + self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}], ) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 79e8de3..2533981 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,224 +1,212 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch -from swh.model.hashutil import hash_to_bytes - from swh.indexer.metadata import OriginMetadataIndexer -from .utils import YARN_PARSER_METADATA +from swh.model.model import Origin + +from .utils import YARN_PARSER_METADATA, REVISION from .test_metadata import REVISION_METADATA_CONFIG def test_origin_metadata_indexer(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["https://github.com/librariesio/yarn-parser"]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin]) + rev_id = REVISION.id rev_metadata = { "id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } origin_metadata = { "id": origin, "from_revision": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result["tool"] assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) for result in results: del result["tool"] assert results == [origin_metadata] def test_origin_metadata_indexer_duplicate_origin(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["https://github.com/librariesio/yarn-parser"]) - indexer.run(["https://github.com/librariesio/yarn-parser"] * 2) origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert len(results) == 1 def test_origin_metadata_indexer_missing_head(idx_storage, storage, obj_storage): - - storage.origin_add([{"url": "https://example.com"}]) + storage.origin_add([Origin(url="https://example.com")]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["https://example.com"]) origin = "https://example.com" results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( idx_storage, storage, obj_storage ): - storage.origin_add([{"url": "https://example.com"}]) - - indexer = 
OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["https://example.com", "https://github.com/librariesio/yarn-parser"]) - origin1 = "https://example.com" origin2 = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + storage.origin_add([Origin(url=origin1)]) + indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + indexer.run([origin1, origin2]) - rev_metadata = { - "id": rev_id, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], - } - origin_metadata = { - "id": origin2, - "from_revision": rev_id, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], - } + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result["tool"] - assert results == [rev_metadata] + assert results == [ + {"id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"],} + ] results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) for result in results: del result["tool"] - assert results == [origin_metadata] + assert results == [ + { + "id": origin2, + "from_revision": rev_id, + "metadata": YARN_PARSER_METADATA, + "mappings": ["npm"], + } + ] def test_origin_metadata_indexer_duplicate_revision(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage - indexer.run( - [ - "https://github.com/librariesio/yarn-parser", - "https://github.com/librariesio/yarn-parser.git", - ] - ) - origin1 = "https://github.com/librariesio/yarn-parser" origin2 = "https://github.com/librariesio/yarn-parser.git" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin1, origin2]) + + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) assert len(results) == 2 def test_origin_metadata_indexer_no_metadata_file(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_no_metadata(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.RevisionMetadataIndexer" ".translate_revision_intrinsic_metadata", return_value=(["npm"], {"@context": "foo"}), ): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] 
def test_origin_metadata_indexer_error(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) + origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.RevisionMetadataIndexer" ".translate_revision_intrinsic_metadata", return_value=None, ): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_delete_metadata(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["https://github.com/librariesio/yarn-parser"]) - origin = "https://github.com/librariesio/yarn-parser" - rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f") + indexer.run([origin]) + + rev_id = REVISION.id results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results != [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results != [] with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): - indexer.run(["https://github.com/librariesio/yarn-parser"]) + indexer.run([origin]) results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_unknown_origin(idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) result = indexer.index_list(["https://unknown.org/foo"]) assert not result diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py index 3a39558..b3f0612 100644 --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -1,740 +1,774 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import functools -import random from typing import Dict, Any import unittest from hypothesis import strategies from swh.model import hashutil from swh.model.hashutil import hash_to_bytes, hash_to_hex -from swh.model.model import OriginVisit, OriginVisitStatus +from swh.model.model import ( + Content, + Directory, + DirectoryEntry, + Origin, + OriginVisit, + OriginVisitStatus, + Person, + Revision, + RevisionType, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) from swh.storage.utils import now from swh.indexer.storage import INDEXER_CFG_KEY BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = { - "storage": {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]}, + "storage": {"cls": "memory"}, "objstorage": {"cls": "memory", "args": {},}, INDEXER_CFG_KEY: {"cls": "memory", "args": {},}, } + +ORIGINS = [ + Origin(url="https://github.com/SoftwareHeritage/swh-storage"), + Origin(url="rsync://ftp.gnu.org/gnu/3dldf"), + Origin(url="https://forge.softwareheritage.org/source/jesuisgpl/"), + Origin(url="https://pypi.org/project/limnoria/"), + Origin(url="http://0-512-md.googlecode.com/svn/"), + 
Origin(url="https://github.com/librariesio/yarn-parser"), + Origin(url="https://github.com/librariesio/yarn-parser.git"), +] + + ORIGIN_VISITS = [ - {"type": "git", "url": "https://github.com/SoftwareHeritage/swh-storage"}, - {"type": "ftp", "url": "rsync://ftp.gnu.org/gnu/3dldf"}, - {"type": "deposit", "url": "https://forge.softwareheritage.org/source/jesuisgpl/"}, - {"type": "pypi", "url": "https://pypi.org/project/limnoria/"}, - {"type": "svn", "url": "http://0-512-md.googlecode.com/svn/"}, - {"type": "git", "url": "https://github.com/librariesio/yarn-parser"}, - {"type": "git", "url": "https://github.com/librariesio/yarn-parser.git"}, + {"type": "git", "origin": ORIGINS[0].url}, + {"type": "ftp", "origin": ORIGINS[1].url}, + {"type": "deposit", "origin": ORIGINS[2].url}, + {"type": "pypi", "origin": ORIGINS[3].url}, + {"type": "svn", "origin": ORIGINS[4].url}, + {"type": "git", "origin": ORIGINS[5].url}, + {"type": "git", "origin": ORIGINS[6].url}, ] + +DIRECTORY = Directory( + id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), + entries=( + DirectoryEntry( + name=b"index.js", + type="file", + target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"), + perms=0o100644, + ), + DirectoryEntry( + name=b"package.json", + type="file", + target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), + perms=0o100644, + ), + DirectoryEntry( + name=b".github", + type="dir", + target=Directory(entries=()).id, + perms=0o040000, + ), + ), +) + +DIRECTORY2 = Directory( + id=b"\xf8zz\xa1\x12`<1$\xfav\xf9\x01\xfd5\x85F`\xf2\xb6", + entries=( + DirectoryEntry( + name=b"package.json", + type="file", + target=hash_to_bytes("f5305243b3ce7ef8dc864ebc73794da304025beb"), + perms=0o100644, + ), + ), +) + +REVISION = Revision( + id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"), + message=b"Improve search functionality", + author=Person( + name=b"Andrew Nesbitt", + fullname=b"Andrew Nesbitt ", + email=b"andrewnez@gmail.com", + ), + committer=Person( + name=b"Andrew Nesbitt", + fullname=b"Andrew Nesbitt ", + email=b"andrewnez@gmail.com", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1380883849, microseconds=0,), + offset=120, + negative_utc=False, + ), + type=RevisionType.GIT, + synthetic=False, + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1487596456, microseconds=0,), + offset=0, + negative_utc=False, + ), + directory=DIRECTORY2.id, + parents=(), +) + +REVISIONS = [REVISION] + SNAPSHOTS = [ - { - "origin": "https://github.com/SoftwareHeritage/swh-storage", - "branches": { - b"refs/heads/add-revision-origin-cache": { - "target": b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' - b"s\xe7/\xe9l\x1e", - "target_type": "revision", - }, - b"refs/head/master": { - "target": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}" b"\xac\xefrm", - "target_type": "revision", - }, - b"HEAD": {"target": b"refs/head/master", "target_type": "alias"}, - b"refs/tags/v0.0.103": { - "target": b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b"\x0f\xdd", - "target_type": "release", - }, - }, - }, - { - "origin": "rsync://ftp.gnu.org/gnu/3dldf", - "branches": { - b"3DLDF-1.1.4.tar.gz": { - "target": b"dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc" b'"G\x99\x11', - "target_type": "revision", - }, - b"3DLDF-2.0.2.tar.gz": { - "target": b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=" - b"\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", - "target_type": "revision", - }, - b"3DLDF-2.0.3-examples.tar.gz": { - "target": b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97" - b"\xfe\xadZ\x80\x80\xc1\x83\xff", - 
"target_type": "revision", - }, - b"3DLDF-2.0.3.tar.gz": { - "target": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee" - b"\xcc\x1a\xb4`\x8c\x8by", - "target_type": "revision", - }, - b"3DLDF-2.0.tar.gz": { - "target": b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G" b"\xd3\xd1m", - "target_type": "revision", - }, - }, - }, - { - "origin": "https://forge.softwareheritage.org/source/jesuisgpl/", - "branches": { - b"master": { - "target": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{" - b"\xa6\xe9\x99\xb1\x9e]q\xeb", - "target_type": "revision", - } - }, - "id": b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r ", - }, - { - "origin": "https://pypi.org/project/limnoria/", - "branches": { - b"HEAD": {"target": b"releases/2018.09.09", "target_type": "alias"}, - b"releases/2018.09.01": { - "target": b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d" - b"\xbb\xdfF\xfdw\xcf", - "target_type": "revision", - }, - b"releases/2018.09.09": { - "target": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k" - b"A\x10\x9d\xc5\xfa2\xf8t", - "target_type": "revision", - }, - }, - "id": b"{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay" b"\x12\x9e\xd6\xb3", - }, - { - "origin": "http://0-512-md.googlecode.com/svn/", - "branches": { - b"master": { - "target": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8" - b"\xc9\xad#.\x1bw=\x18", - "target_type": "revision", - } + Snapshot( + id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"), + branches={ + b"refs/heads/add-revision-origin-cache": SnapshotBranch( + target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e', + target_type=TargetType.REVISION, + ), + b"refs/head/master": SnapshotBranch( + target=b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm", + target_type=TargetType.REVISION, + ), + b"HEAD": SnapshotBranch( + target=b"refs/head/master", target_type=TargetType.ALIAS + ), + b"refs/tags/v0.0.103": SnapshotBranch( + target=b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+\x0f\xdd', + target_type=TargetType.RELEASE, + ), }, - "id": b"\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7" - b"\x05\xea\xb8\x1f\xc4H\xf4s", - }, - { - "origin": "https://github.com/librariesio/yarn-parser", - "branches": { - b"HEAD": { - "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "target_type": "revision", - } + ), + Snapshot( + id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"), + branches={ + b"3DLDF-1.1.4.tar.gz": SnapshotBranch( + target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11', + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.2.tar.gz": SnapshotBranch( + target=b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.3-examples.tar.gz": SnapshotBranch( + target=b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97\xfe\xadZ\x80\x80\xc1\x83\xff", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.3.tar.gz": SnapshotBranch( + target=b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by", # noqa + target_type=TargetType.REVISION, + ), + b"3DLDF-2.0.tar.gz": SnapshotBranch( + target=b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G\xd3\xd1m", + target_type=TargetType.REVISION, + ), }, - }, - { - "origin": "https://github.com/librariesio/yarn-parser.git", - "branches": { - b"HEAD": { - "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "target_type": "revision", - } + ), + Snapshot( + id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"), + branches={ + b"master": SnapshotBranch( + 
target=b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb", # noqa + target_type=TargetType.REVISION, + ) }, - }, -] - - -REVISIONS = [ - { - "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"), - "message": b"Improve search functionality", - "author": { - "name": b"Andrew Nesbitt", - "fullname": b"Andrew Nesbitt ", - "email": b"andrewnez@gmail.com", + ), + Snapshot( + id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"), + branches={ + b"HEAD": SnapshotBranch( + target=b"releases/2018.09.09", target_type=TargetType.ALIAS + ), + b"releases/2018.09.01": SnapshotBranch( + target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf", + target_type=TargetType.REVISION, + ), + b"releases/2018.09.09": SnapshotBranch( + target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa + target_type=TargetType.REVISION, + ), }, - "committer": { - "name": b"Andrew Nesbitt", - "fullname": b"Andrew Nesbitt ", - "email": b"andrewnez@gmail.com", + ), + Snapshot( + id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"), + branches={ + b"master": SnapshotBranch( + target=b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18", + target_type=TargetType.REVISION, + ) }, - "committer_date": { - "negative_utc": False, - "offset": 120, - "timestamp": {"microseconds": 0, "seconds": 1380883849,}, + ), + Snapshot( + id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), + branches={ + b"HEAD": SnapshotBranch( + target=REVISION.id, target_type=TargetType.REVISION, + ) }, - "type": "git", - "synthetic": False, - "date": { - "negative_utc": False, - "timestamp": {"seconds": 1487596456, "microseconds": 0,}, - "offset": 0, + ), + Snapshot( + id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), + branches={ + b"HEAD": SnapshotBranch( + target=REVISION.id, target_type=TargetType.REVISION, + ) }, - "directory": b"10", - "parents": (), - } + ), ] -DIRECTORY_ID = b"10" - -DIRECTORY_ENTRIES = [ - {"name": b"index.js", "type": "file", "target": b"abc", "perms": 33188,}, - {"name": b"package.json", "type": "file", "target": b"cde", "perms": 33188,}, - {"name": b".github", "type": "dir", "target": b"11", "perms": 16384,}, -] SHA1_TO_LICENSES = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"], "02fb2c89e14f7fab46701478c83779c7beb7b069": ["Apache2.0"], "103bc087db1d26afc3a0283f38663d081e9b01e6": ["MIT"], "688a5ef812c53907562fe379d4b3851e69c7cb15": ["AGPL"], "da39a3ee5e6b4b0d3255bfef95601890afd80709": [], } SHA1_TO_CTAGS = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": [ {"name": "foo", "kind": "str", "line": 10, "lang": "bar",} ], "d4c647f0fc257591cc9ba1722484229780d1c607": [ {"name": "let", "kind": "int", "line": 100, "lang": "haskell",} ], "688a5ef812c53907562fe379d4b3851e69c7cb15": [ {"name": "symbol", "kind": "float", "line": 99, "lang": "python",} ], } OBJ_STORAGE_DATA = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": b"this is some text", "688a5ef812c53907562fe379d4b3851e69c7cb15": b"another text", "8986af901dd2043044ce8f0d8fc039153641cf17": b"yet another text", "02fb2c89e14f7fab46701478c83779c7beb7b069": b""" import unittest import logging from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, "103bc087db1d26afc3a0283f38663d081e9b01e6": b""" #ifndef __AVL__ #define __AVL__ typedef struct 
_avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, "93666f74f1cf635c8c8ac118879da6ec5623c410": b""" (should 'pygments (recognize 'lisp 'easily)) """, "26a9f72a7c87cc9205725cfd879f514ff4f3d8d5": b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, "d4c647f0fc257591cc9ba1722484229780d1c607": b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, "a7ab314d8a11d2c93e3dcf528ca294e7b431c449": b""" """, "da39a3ee5e6b4b0d3255bfef95601890afd80709": b"", - # 626364 - hash_to_hex(b"bcd"): b"unimportant content for bcd", - # 636465 - hash_to_hex( - b"cde" - ): b""" + # was 626364 / b'bcd' + "e3e40fee6ff8a52f06c3b428bfe7c0ed2ef56e92": b"unimportant content for bcd", + # was 636465 / b'cde' now yarn-parser package.json + "f5305243b3ce7ef8dc864ebc73794da304025beb": b""" { "name": "yarn-parser", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "main": "index.js", "scripts": { "start": "node index.js", "test": "mocha" }, "engines": { "node": "9.8.0" }, "repository": { "type": "git", "url": "git+https://github.com/librariesio/yarn-parser.git" }, "keywords": [ "yarn", "parse", "lock", "dependencies" ], "author": "Andrew Nesbitt", "license": "AGPL-3.0", "bugs": { "url": "https://github.com/librariesio/yarn-parser/issues" }, "homepage": "https://github.com/librariesio/yarn-parser#readme", "dependencies": { "@yarnpkg/lockfile": "^1.0.0", "body-parser": "^1.15.2", "express": "^4.14.0" }, "devDependencies": { "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", "test": "^0.6.0" } } """, } + YARN_PARSER_METADATA = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "url": "https://github.com/librariesio/yarn-parser#readme", "codeRepository": "git+git+https://github.com/librariesio/yarn-parser.git", "author": [{"type": "Person", "name": "Andrew Nesbitt"}], "license": "https://spdx.org/licenses/AGPL-3.0", "version": "1.0.0", "description": "Tiny web service for parsing yarn.lock files", "issueTracker": "https://github.com/librariesio/yarn-parser/issues", "name": "yarn-parser", "keywords": ["yarn", "parse", "lock", "dependencies"], "type": "SoftwareSourceCode", } json_dict_keys = strategies.one_of( strategies.characters(), strategies.just("type"), strategies.just("url"), strategies.just("name"), strategies.just("email"), strategies.just("@id"), strategies.just("@context"), strategies.just("repository"), strategies.just("license"), strategies.just("repositories"), strategies.just("licenses"), ) """Hypothesis strategy that generates strings, with an emphasis on those that are often used as dictionary keys in metadata files.""" generic_json_document = strategies.recursive( strategies.none() | strategies.booleans() | strategies.floats() | strategies.characters(), lambda children: ( 
strategies.lists(children, min_size=1) | strategies.dictionaries(json_dict_keys, children, min_size=1) ), ) """Hypothesis strategy that generates possible values for values of JSON metadata files.""" def json_document_strategy(keys=None): """Generates an hypothesis strategy that generates metadata files for a JSON-based format that uses the given keys.""" if keys is None: keys = strategies.characters() else: keys = strategies.one_of(map(strategies.just, keys)) return strategies.dictionaries(keys, generic_json_document, min_size=1) def _tree_to_xml(root, xmlns, data): def encode(s): "Skips unpaired surrogates generated by json_document_strategy" return s.encode("utf8", "replace") def to_xml(data, indent=b" "): if data is None: return b"" elif isinstance(data, (bool, str, int, float)): return indent + encode(str(data)) elif isinstance(data, list): return b"\n".join(to_xml(v, indent=indent) for v in data) elif isinstance(data, dict): lines = [] for (key, value) in data.items(): lines.append(indent + encode("<{}>".format(key))) lines.append(to_xml(value, indent=indent + b" ")) lines.append(indent + encode("</{}>".format(key))) return b"\n".join(lines) else: raise TypeError(data) return b"\n".join( [ '<{} xmlns="{}">'.format(root, xmlns).encode(), to_xml(data), "</{}>".format(root).encode(), ] ) class TreeToXmlTest(unittest.TestCase): def test_leaves(self): self.assertEqual( _tree_to_xml("root", "http://example.com", None), b'<root xmlns="http://example.com">\n\n</root>', ) self.assertEqual( _tree_to_xml("root", "http://example.com", True), b'<root xmlns="http://example.com">\n True\n</root>', ) self.assertEqual( _tree_to_xml("root", "http://example.com", "abc"), b'<root xmlns="http://example.com">\n abc\n</root>', ) self.assertEqual( _tree_to_xml("root", "http://example.com", 42), b'<root xmlns="http://example.com">\n 42\n</root>', ) self.assertEqual( _tree_to_xml("root", "http://example.com", 3.14), b'<root xmlns="http://example.com">\n 3.14\n</root>', ) def test_dict(self): self.assertIn( _tree_to_xml("root", "http://example.com", {"foo": "bar", "baz": "qux"}), [ b'<root xmlns="http://example.com">\n' b" <foo>\n  bar\n </foo>\n" b" <baz>\n  qux\n </baz>\n" b"</root>", b'<root xmlns="http://example.com">\n' b" <baz>\n  qux\n </baz>\n" b" <foo>\n  bar\n </foo>\n" b"</root>", ], ) def test_list(self): self.assertEqual( _tree_to_xml( "root", "http://example.com", [{"foo": "bar"}, {"foo": "baz"},] ), b'<root xmlns="http://example.com">\n' b" <foo>\n  bar\n </foo>\n" b" <foo>\n  baz\n </foo>\n" b"</root>", ) def xml_document_strategy(keys, root, xmlns): """Generates an hypothesis strategy that generates metadata files for an XML format that uses the given keys.""" return strategies.builds( functools.partial(_tree_to_xml, root, xmlns), json_document_strategy(keys) ) def filter_dict(d, keys): "return a copy of the dict with keys deleted" if not isinstance(keys, (list, tuple)): keys = (keys,) return dict((k, v) for (k, v) in d.items() if k not in keys) def fill_obj_storage(obj_storage): """Add some content in an object storage.""" for (obj_id, content) in OBJ_STORAGE_DATA.items(): obj_storage.add(content, obj_id=hash_to_bytes(obj_id)) def fill_storage(storage): - visit_types = {} - for visit in ORIGIN_VISITS: - storage.origin_add_one({"url": visit["url"]}) - visit_types[visit["url"]] = visit["type"] - for snap in SNAPSHOTS: - origin_url = snap["origin"] + storage.origin_add(ORIGINS) + storage.directory_add([DIRECTORY, DIRECTORY2]) + storage.revision_add(REVISIONS) + storage.snapshot_add(SNAPSHOTS) + + for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS): + assert snapshot.id is not None + visit = storage.origin_visit_add( - [ - OriginVisit( - origin=origin_url, - date=now(), - type=visit_types[origin_url], - status="ongoing", - snapshot=None, - ) - ] + [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])] )[0] - snap_id = snap.get("id") or bytes([random.randint(0, 255) for _ in 
range(32)]) - storage.snapshot_add([{"id": snap_id, "branches": snap["branches"]}]) visit_status = OriginVisitStatus( - origin=origin_url, + origin=visit.origin, visit=visit.visit, date=now(), status="full", - snapshot=snap_id, + snapshot=snapshot.id, ) storage.origin_visit_status_add([visit_status]) - storage.revision_add(REVISIONS) contents = [] for (obj_id, content) in OBJ_STORAGE_DATA.items(): content_hashes = hashutil.MultiHash.from_data(content).digest() contents.append( - { - "data": content, - "length": len(content), - "status": "visible", - "sha1": hash_to_bytes(obj_id), - "sha1_git": hash_to_bytes(obj_id), - "sha256": content_hashes["sha256"], - "blake2s256": content_hashes["blake2s256"], - } + Content( + data=content, + length=len(content), + status="visible", + sha1=hash_to_bytes(obj_id), + sha1_git=hash_to_bytes(obj_id), + sha256=content_hashes["sha256"], + blake2s256=content_hashes["blake2s256"], + ) ) storage.content_add(contents) - storage.directory_add([{"id": DIRECTORY_ID, "entries": DIRECTORY_ENTRIES,}]) class CommonContentIndexerTest(metaclass=abc.ABCMeta): legacy_get_format = False """True if and only if the tested indexer uses the legacy format. see: https://forge.softwareheritage.org/T1433 """ def get_indexer_results(self, ids): """Override this for indexers that don't have a mock storage.""" return self.indexer.idx_storage.state def assert_legacy_results_ok(self, sha1s, expected_results=None): # XXX old format, remove this when all endpoints are # updated to the new one # see: https://forge.softwareheritage.org/T1433 sha1s = [ sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s ] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual( len(expected_results), len(actual_results), (expected_results, actual_results), ) for indexed_data in actual_results: _id = indexed_data["id"] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data["id"] = _id self.assertEqual(indexed_data, expected_data) def assert_results_ok(self, sha1s, expected_results=None): if self.legacy_get_format: self.assert_legacy_results_ok(sha1s, expected_results) return sha1s = [ sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s ] actual_results = list(self.get_indexer_results(sha1s)) if expected_results is None: expected_results = self.expected_results self.assertEqual( len(expected_results), len(actual_results), (expected_results, actual_results), ) for indexed_data in actual_results: (_id, indexed_data) = list(indexed_data.items())[0] expected_data = expected_results[hashutil.hash_to_hex(_id)].copy() expected_data = [expected_data] self.assertEqual(indexed_data, expected_data) def test_index(self): """Known sha1 have their data indexed """ sha1s = [self.id0, self.id1, self.id2] # when self.indexer.run(sha1s, policy_update="update-dups") self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update="ignore-dups") self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" sha1s = [ self.id1, "799a5ef812c53907562fe379d4b3851e69c7cb15", # unknown "800a5ef812c53907562fe379d4b3851e69c7cb15", ] # unknown # when self.indexer.run(sha1s, policy_update="update-dups") # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: """Allows to factorize tests on range indexer. 
""" def setUp(self): self.contents = sorted(OBJ_STORAGE_DATA) def assert_results_ok(self, start, end, actual_results, expected_results=None): if expected_results is None: expected_results = self.expected_results actual_results = list(actual_results) for indexed_data in actual_results: _id = indexed_data["id"] assert isinstance(_id, bytes) indexed_data = indexed_data.copy() indexed_data["id"] = hash_to_hex(indexed_data["id"]) self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)]) self.assertTrue(start <= _id <= end) _tool_id = indexed_data["indexer_configuration_id"] self.assertEqual(_tool_id, self.indexer.tool["id"]) def test__index_contents(self): """Indexing contents without existing data results in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = list(self.indexer._index_contents(start, end, indexed={})) self.assert_results_ok(start, end, actual_results) def test__index_contents_with_indexed_data(self): """Indexing contents with existing data results in less indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) data_indexed = [self.id0, self.id2] # given actual_results = self.indexer._index_contents( start, end, indexed=set(map(hash_to_bytes, data_indexed)) ) # craft the expected results expected_results = self.expected_results.copy() for already_indexed_key in data_indexed: expected_results.pop(already_indexed_key) self.assert_results_ok(start, end, actual_results, expected_results) def test_generate_content_get(self): """Optimal indexing should result in indexed data """ _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end) # then self.assertEqual(actual_results, {"status": "uneventful"}) def test_generate_content_get_input_as_bytes(self): """Optimal indexing should result in indexed data Input are in bytes here. 
""" _start, _end = [self.contents[0], self.contents[2]] # output hex ids start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end, skip_existing=False) # no already indexed data so same result as prior test # then self.assertEqual(actual_results, {"status": "uneventful"}) def test_generate_content_get_no_result(self): """No result indexed returns False""" _start, _end = [ "0000000000000000000000000000000000000000", "0000000000000000000000000000000000000001", ] start, end = map(hashutil.hash_to_bytes, (_start, _end)) # given actual_results = self.indexer.run(start, end, incremental=False) # then self.assertEqual(actual_results, {"status": "uneventful"}) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..8a1495e --- /dev/null +++ b/tox.ini @@ -0,0 +1,40 @@ +[tox] +envlist=black,flake8,mypy,py3 + +[testenv] +extras = + testing +deps = + pytest-cov + swh-scheduler[testing] >= 0.5.0 + swh-storage[testing] >= 0.10.0 + dev: pdbpp +commands = + pytest --doctest-modules \ + !slow: --hypothesis-profile=fast \ + slow: --hypothesis-profile=slow \ + {envsitepackagesdir}/swh/indexer \ + --cov={envsitepackagesdir}/swh/indexer \ + --cov-branch {posargs} + +[testenv:black] +skip_install = true +deps = + black +commands = + {envpython} -m black --check swh + +[testenv:flake8] +skip_install = true +deps = + flake8 +commands = + {envpython} -m flake8 + +[testenv:mypy] +extras = + testing +deps = + mypy +commands = + mypy swh diff --git a/version.txt b/version.txt deleted file mode 100644 index a538b5a..0000000 --- a/version.txt +++ /dev/null @@ -1 +0,0 @@ -v0.1.0-0-ga8307fc \ No newline at end of file