diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..39d60022
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+*.pyc
+*.sw?
+*~
+/.coverage
+/.coverage.*
+.eggs/
+__pycache__
+build/
+dist/
+*.egg-info
+version.txt
+.vscode/
+.hypothesis/
+/.tox/
+.mypy_cache/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..7ee9db8a
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,48 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v2.4.0
+  hooks:
+  - id: trailing-whitespace
+  - id: flake8
+  - id: check-json
+  - id: check-yaml
+
+- repo: https://github.com/codespell-project/codespell
+  rev: v1.16.0
+  hooks:
+  - id: codespell
+    exclude: TODO
+    args: [-L iff]
+
+- repo: local
+  hooks:
+  - id: mypy
+    name: mypy
+    entry: mypy
+    args: [swh]
+    pass_filenames: false
+    language: system
+    types: [python]
+
+  - id: check-bumped-dbversion
+    name: check-bumped-dbversion
+    files: 'sql/upgrades/.*\.sql'
+    entry: grep
+    args: ['insert into dbversion']
+    language: system
+
+- repo: https://github.com/python/black
+  rev: 19.10b0
+  hooks:
+  - id: black
+
+# Disabled for now: the code base is still far from passing pydocstyle.
+#- repo: https://github.com/PyCQA/pydocstyle.git
+#  rev: 4.0.0
+#  hooks:
+#  - id: pydocstyle
+#    name: pydocstyle
+#    description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions.
+#    entry: pydocstyle --convention=google
+#    language: python
+#    types: [python]
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 00000000..2d0a34af
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,3 @@
+Copyright (C) 2015 The Software Heritage developers
+
+See https://www.softwareheritage.org/ for more information.
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..0ad22b51
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,78 @@
+# Software Heritage Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as Software
+Heritage contributors and maintainers pledge to making participation in our
+project and our community a harassment-free experience for everyone, regardless
+of age, body size, disability, ethnicity, sex characteristics, gender identity
+and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity and
+orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at `conduct@softwareheritage.org`. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an
+incident. Further details of specific enforcement policies may be posted
+separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 00000000..b89f3e04
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,3 @@
+Daniele Serafini
+Ishan Bhanuka
+Quentin Campos
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..94a9ed02
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/PKG-INFO b/PKG-INFO
index d0aedf47..1976d1a4 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,216 +1,216 @@
 Metadata-Version: 2.1
 Name: swh.storage
-Version: 0.10.0
+Version: 0.10.1
 Summary: Software Heritage storage manager
 Home-page: https://forge.softwareheritage.org/diffusion/DSTO/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-storage
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-storage/
 Description: swh-storage
         ===========
         
         Abstraction layer over the archive, allowing to access all stored source code
         artifacts as well as their metadata.
         
         See the
         [documentation](https://docs.softwareheritage.org/devel/swh-storage/index.html)
         for more details.
         
         ## Quick start
         
         ### Dependencies
         
         Python tests for this module include tests that cannot be run without a local
         Postgresql database, so you need the Postgresql server executable on your
         machine (no need to have a running Postgresql server). They also expect a
         cassandra server.
         
         #### Debian-like host
         
         ```
         $ sudo apt install libpq-dev postgresql-11 cassandra
         ```
         
         #### Non Debian-like host
         
         The tests expects `/usr/sbin/cassandra` to exist.
         
         Optionally, you can avoid running the cassandra tests.
         
         ```
         (swh) :~/swh-storage$ tox -- -m 'not cassandra'
         ```
         
         ### Installation
         
         It is strongly recommended to use a virtualenv. In the following, we
         consider you work in a virtualenv named `swh`. See the
         [developer setup guide](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup)
         for a more details on how to setup a working environment.
         
         
         You can install the package directly from
         [pypi](https://pypi.org/p/swh.storage):
         
         ```
         (swh) :~$ pip install swh.storage
         [...]
         ```
         
         Or from sources:
         
         ```
         (swh) :~$ git clone https://forge.softwareheritage.org/source/swh-storage.git
         [...]
         (swh) :~$ cd swh-storage
         (swh) :~/swh-storage$ pip install .
         [...]
         ```
         
         Then you can check it's properly installed:
         ```
         (swh) :~$ swh storage --help
         Usage: swh storage [OPTIONS] COMMAND [ARGS]...
         
           Software Heritage Storage tools.
         
         Options:
           -h, --help  Show this message and exit.
         
         Commands:
           rpc-serve  Software Heritage Storage RPC server.
         ```
         
         
         ## Tests
         
         The best way of running Python tests for this module is to use
         [tox](https://tox.readthedocs.io/).
         
         ```
         (swh) :~$ pip install tox
         ```
         
         ### tox
         
         From the sources directory, simply use tox:
         
         ```
         (swh) :~/swh-storage$ tox
         [...]
         ========= 315 passed, 6 skipped, 15 warnings in 40.86 seconds ==========
         _______________________________ summary ________________________________
           flake8: commands succeeded
           py3: commands succeeded
           congratulations :)
         ```
         
         ## Development
         
         The storage server can be locally started. It requires a configuration file and
         a running Postgresql database.
         
         ### Sample configuration
         
         A typical configuration `storage.yml` file is:
         
         ```
         storage:
           cls: local
           args:
             db: "dbname=softwareheritage-dev user=<user> password=<pwd>"
             objstorage:
               cls: pathslicing
               args:
                 root: /tmp/swh-storage/
                 slicing: 0:2/2:4/4:6
         ```
         
         which means, this uses:
         
         - a local storage instance whose db connection is to
           `softwareheritage-dev` local instance,
         
         - the objstorage uses a local objstorage instance whose:
         
           - `root` path is /tmp/swh-storage,
         
           - slicing scheme is `0:2/2:4/4:6`. This means that the identifier of
             the content (sha1) which will be stored on disk at first level
             with the first 2 hex characters, the second level with the next 2
             hex characters and the third level with the next 2 hex
             characters. And finally the complete hash file holding the raw
             content. For example: 00062f8bd330715c4f819373653d97b3cd34394c
             will be stored at 00/06/2f/00062f8bd330715c4f819373653d97b3cd34394c
         
         Note that the `root` path should exist on disk before starting the server.
         
         
         ### Starting the storage server
         
         If the python package has been properly installed (e.g. in a virtual env), you
         should be able to use the command:
         
         ```
         (swh) :~/swh-storage$ swh storage rpc-serve storage.yml
         ```
         
         This runs a local swh-storage api at 5002 port.
         
         ```
         (swh) :~/swh-storage$ curl http://127.0.0.1:5002
         <html>
         <head><title>Software Heritage storage server</title></head>
         <body>
         <p>You have reached the
         <a href="https://www.softwareheritage.org/">Software Heritage</a>
         storage server.<br />
         See its
         <a href="https://docs.softwareheritage.org/devel/swh-storage/">documentation
         and API</a> for more information</p>
         ```
         
         ### And then what?
         
         In your upper layer
         ([loader-git](https://forge.softwareheritage.org/source/swh-loader-git/),
         [loader-svn](https://forge.softwareheritage.org/source/swh-loader-svn/),
         etc...), you can define a remote storage with this snippet of yaml
         configuration.
         
         ```
         storage:
           cls: remote
           args:
             url: http://localhost:5002/
         ```
         
         You could directly define a local storage with the following snippet:
         
         ```
         storage:
           cls: local
           args:
             db: service=swh-dev
             objstorage:
               cls: pathslicing
               args:
                 root: /home/storage/swh-storage/
                 slicing: 0:2/2:4/4:6
         ```
         
 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 Provides-Extra: schemata
 Provides-Extra: journal
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 00000000..58a761ea
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,3 @@
+_build/
+apidoc/
+*-stamp
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 00000000..b97c7532
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,2 @@
+include ../../swh-docs/Makefile.sphinx
+-include Makefile.local
diff --git a/docs/Makefile.local b/docs/Makefile.local
new file mode 100644
index 00000000..ed0b7b2a
--- /dev/null
+++ b/docs/Makefile.local
@@ -0,0 +1,26 @@
+sphinx/html: sql-autodoc images
+sphinx/clean: clean-sql-autodoc clean-images
+assets: sql-autodoc images
+
+sql-autodoc:
+	make -C ../sql/ doc
+
+images:
+	make -C images/
+clean-images:
+	make -C images/ clean
+
+clean: clean-sql-autodoc clean-images
+clean-sql-autodoc:
+	make -C ../sql/ clean
+
+distclean: clean distclean-sql-autodoc
+distclean-sql-autodoc:
+	make -C ../sql/ distclean
+
+.PHONY: sql-autodoc clean-sql-autodoc images clean-images
+
+
+# Local Variables:
+# mode: makefile
+# End:
diff --git a/docs/_static/.placeholder b/docs/_static/.placeholder
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/_templates/.placeholder b/docs/_templates/.placeholder
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/archive-copies.rst b/docs/archive-copies.rst
new file mode 100644
index 00000000..09f2ea40
--- /dev/null
+++ b/docs/archive-copies.rst
@@ -0,0 +1,48 @@
+:orphan:
+
+.. _archive-copies:
+
+Archive copies
+==============
+
+.. _swh-storage-copies-layout:
+.. figure:: images/swh-archive-copies.svg
+   :width: 1024px
+   :align: center
+
+   Layout of Software Heritage archive copies (click to zoom).
+
+The Software Heritage archive exists in several copies, to minimize the risk of
+losing archived source code artifacts. The layout of existing copies, their
+relationships, as well as their geographical and administrative domains are
+shown in the layout diagram above.
+
+We recall that the archive is conceptually organized as a graph, and
+specifically a Merkle DAG, see :ref:`data model <data-model>` for more
+information.
+
+Ingested source code artifacts land directly on the **primary copy**, which is
+updated live and also used as reference for deduplication purposes. There,
+different parts of the Merkle DAG are stored using different backend
+technologies. The leaves of the graph, i.e., *content objects* (or "blobs"),
+are stored in a key-value object storage, using their SHA1 identifiers as keys
+(see :ref:`persistent identifiers <persistent-identifiers>`). SHA1 collision
+avoidance is enforced by the :mod:`swh.storage` module. The *rest of the graph*
+is stored in a Postgres database (see :ref:`SQL storage <sql-storage>`).
+
+At the time of writing, the primary object storage contains about 5 billion
+blobs with a median size of 3 KB---yes, that is *a lot of very small
+files*---for a total compressed size of about 200 TB. The Postgres database
+takes about 8 TB, half of which is required by indexes. In terms of graph metrics,
+the Merkle DAG has about 10 B nodes and 100 B edges.
+
+The **secondary copy** is hosted on Microsoft Azure cloud, using its native
+blob storage for the object storage and a large virtual machine to run a
+Postgres instance there. The database is kept up-to-date w.r.t. the primary
+copy using Postgres WAL replication. The object storage is kept up-to-date
+using :mod:`swh.archiver`.
+
+Archive copies (as opposed to archive mirrors) are operated by the Software
+Heritage Team at Inria. The primary archived copy is geographically located at
+Rocquencourt, France; the secondary copy is hosted in the Europe West region of
+the Azure cloud.
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 00000000..190deb7e
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1 @@
+from swh.docs.sphinx.conf import *  # NoQA
diff --git a/docs/extrinsic-metadata-specification.rst b/docs/extrinsic-metadata-specification.rst
new file mode 100644
index 00000000..d82bb55a
--- /dev/null
+++ b/docs/extrinsic-metadata-specification.rst
@@ -0,0 +1,251 @@
+:orphan:
+
+.. _extrinsic-metadata-specification:
+
+Extrinsic metadata specification
+================================
+
+:term:`Extrinsic metadata` is information about software that is not part
+of the source code itself but still closely related to the software.
+Typical sources for extrinsic metadata are: the hosting place of a
+repository, which can offer metadata via its web view or API; external
+registries like collaborative curation initiatives; and out-of-band
+information available at source code archival time.
+
+Since they are not part of the source code, a dedicated mechanism to fetch
+and store them is needed.
+
+This specification assumes the reader is familiar with Software Heritage's
+:ref:`architecture` and :ref:`data-model`.
+
+
+Metadata sources
+----------------
+
+Authorities
+^^^^^^^^^^^
+
+Metadata authorities are entities that provide metadata about an
+:term:`origin`. Metadata authorities include: code hosting places,
+:term:`deposit` submitters, and registries (eg. Wikidata).
+
+An authority is uniquely defined by these properties:
+
+  * its type, representing the kind of authority, which is one of these values:
+    * `deposit`, for metadata pushed to Software Heritage at the same time
+      as a software artifact
+    * `forge`, for metadata pulled from the same source as the one hosting
+      the software artifacts (which includes package managers)
+    * `registry`, for metadata pulled from a third-party
+  * its URL, which unambiguously identifies an instance of the authority type.
+
+Examples:
+
+=============== =================================
+type            url
+=============== =================================
+deposit         https://hal.archives-ouvertes.fr/
+deposit         https://hal.inria.fr/
+deposit         https://software.intel.com/
+forge           https://gitlab.com/
+forge           https://gitlab.inria.fr/
+forge           https://0xacab.org/
+forge           https://github.com/
+registry        https://www.wikidata.org/
+registry        https://swmath.org/
+registry        https://ascl.net/
+=============== =================================
+
+Metadata fetchers
+^^^^^^^^^^^^^^^^^
+
+Metadata fetchers are software components used to fetch metadata from
+a metadata authority, and ingest them into the Software Heritage archive.
+
+A metadata fetcher is uniquely defined by these properties:
+
+* its type
+* its version
+
+Examples:
+
+* :term:`loaders <loader>`, which may either discover metadata as a
+  side-effect of loading source code, or be dedicated to fetching metadata.
+
+* :term:`listers <lister>`, which may discover metadata as a side-effect
+  of discovering origins.
+
+* :term:`deposit` submitters, which push metadata to SWH from a
+  third-party; usually at the same time as a :term:`software artifact`
+
+* crawlers, which fetch metadata from an authority in a way that is
+  none of the above (eg. by querying a specific API of the origin's forge).
+
+
+Storage API
+-----------
+
+Authorities and metadata fetchers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :term:`storage` API offers these endpoints to manipulate metadata
+authorities and metadata fetchers:
+
+* ``metadata_authority_add(type, url, metadata)``
+  which adds a new metadata authority to the storage.
+
+* ``metadata_authority_get(type, url)``
+  which looks up a known authority (there is at most one) and if it is
+  known, returns a dictionary with keys ``type``, ``url``, and ``metadata``.
+
+* ``metadata_fetcher_add(name, version, metadata)``
+  which adds a new metadata fetcher to the storage.
+
+* ``metadata_fetcher_get(name, version)``
+  which looks up a known fetcher (there is at most one) and if it is
+  known, returns a dictionary with keys ``name``, ``version``, and
+  ``metadata``.
+
+These `metadata` fields contain JSON-encodable dictionaries
+with information about the authority/fetcher, in a format specific to each
+authority/fetcher.
+With authority, the `metadata` field is reserved for information describing
+and qualifying the authority.
+With fetchers, the `metadata` field is reserved for configuration metadata
+and other technical usage.
+
+Origin metadata
+^^^^^^^^^^^^^^^
+
+Extrinsic metadata are stored in SWH's :term:`storage database`.
+The storage API offers three endpoints to manipulate origin metadata:
+
+* Adding metadata::
+
+      origin_metadata_add(origin_url, discovery_date,
+                          authority, fetcher,
+                          format, metadata)
+
+  which adds a new `metadata` byte string obtained from a given authority
+  and associated to the origin.
+  `discovery_date` is a Python datetime.
+  `authority` must be a dict containing keys `type` and `url`, and
+  `fetcher` a dict containing keys `name` and `version`.
+  The authority and fetcher must be known to the storage before using this
+  endpoint.
+  `format` is a text field indicating the format of the content of the
+  `metadata` byte string.
+
+* Getting latest metadata::
+
+      origin_metadata_get_latest(origin_url, authority)
+
+  where `authority` must be a dict containing keys `type` and `url`,
+  which returns a dictionary corresponding to the latest metadata entry
+  added from this origin, in the format::
+
+      {
+        'origin_url': ...,
+        'authority': {'type': ..., 'url': ...},
+        'fetcher': {'name': ..., 'version': ...},
+        'discovery_date': ...,
+        'format': '...',
+        'metadata': b'...'
+      }
+
+
+* Getting all metadata::
+
+      origin_metadata_get(origin_url,
+                          authority,
+                          page_token, limit)
+
+  where `authority` must be a dict containing keys `type` and `url`
+  which returns a dictionary with keys:
+
+  * `next_page_token`, which is an opaque token to be used as
+    `page_token` for retrieving the next page. If absent, there are
+    no more pages to gather.
+  * `results`: list of dictionaries, one for each metadata item
+    deposited, corresponding to the given origin and obtained from the
+    specified authority.
+
+  Each of these dictionaries is in the following format::
+
+      {
+        'authority': {'type': ..., 'url': ...},
+        'fetcher': {'name': ..., 'version': ...},
+        'discovery_date': ...,
+        'format': '...',
+        'metadata': b'...'
+      }
+
+The parameters ``page_token`` and ``limit`` are used for pagination based on
+an arbitrary order. An initial query to ``origin_metadata_get`` must set
+``page_token`` to ``None``, and further queries must use the value from the
+previous query's ``next_page_token`` to get the next page of results.
+
+``metadata`` is a bytes array (possibly encoded using Base64).
+Its format is specific to each authority; and is treated as an opaque value
+by the storage.
+Unifying these various formats into a common language is outside the scope
+of this specification.
+
+Artifact metadata
+^^^^^^^^^^^^^^^^^
+
+In addition to origin metadata, the storage database stores metadata on
+all software artifacts supported by the data model.
+
+This works similarly to origin metadata, with one major difference:
+extrinsic metadata can be given on a specific artifact within a specified
+context (for example: a directory in a specific revision from a specific
+visit on a specific origin) which will be stored along the metadata itself.
+
+For example, two origins may develop the same file independently;
+the information about authorship, licensing or even description may vary
+about the same artifact in a different context.
+This is why it is important to qualify the metadata with the complete
+context for which it is intended, if any.
+
+For each artifact type ``<X>``, there are two endpoints
+to manipulate metadata associated with artifacts of that type:
+
+* Adding metadata::
+
+      <X>_metadata_add(id, context, discovery_date,
+                       authority, fetcher,
+                       format, metadata)
+
+
+* Getting all metadata::
+
+      <X>_metadata_get(id,
+                       authority,
+                       after,
+                       page_token, limit)
+
+
+defined similarly to ``origin_metadata_add`` and ``origin_metadata_get``,
+but where ``id`` is a core SWHID (with type matching ``<X>``),
+and with an extra ``context`` (argument when adding metadata, and dictionary
+key when getting them) that is a dictionary with keys
+depending on the artifact type ``<X>``:
+
+* for ``snapshot``: ``origin`` (a URL) and ``visit`` (an integer)
+* for ``release``: those above, plus ``snapshot``
+  (the core SWHID of a snapshot)
+* for ``revision``: all those above, plus ``release``
+  (the core SWHID of a release)
+* for ``directory``: all those above, plus ``revision``
+  (the core SWHID of a revision)
+  and ``path`` (a byte string), representing the path to this directory
+  from the root of the ``revision``
+* for ``content``: all those above, plus ``directory``
+  (the core SWHID of a directory)
+
+All keys are optional, but should be provided whenever possible.
+The dictionary may be empty, if metadata is fully independent from context.
+
+In all cases, ``visit`` should only be provided if ``origin`` is
+(as visit ids are only unique with respect to an origin).
diff --git a/docs/images/.gitignore b/docs/images/.gitignore
new file mode 100644
index 00000000..542dcd32
--- /dev/null
+++ b/docs/images/.gitignore
@@ -0,0 +1,2 @@
+swh-archive-copies.pdf
+swh-archive-copies.svg
diff --git a/docs/images/Makefile b/docs/images/Makefile
new file mode 100644
index 00000000..59782050
--- /dev/null
+++ b/docs/images/Makefile
@@ -0,0 +1,16 @@
+
+BUILD_TARGETS =
+BUILD_TARGETS += swh-archive-copies.pdf swh-archive-copies.svg
+
+all: $(BUILD_TARGETS)
+
+
+%.svg: %.dia
+	inkscape -l $@ $<
+
+%.pdf: %.dia
+	inkscape -A $@ $<
+
+
+clean:
+	-rm -f $(BUILD_TARGETS)
diff --git a/docs/images/swh-archive-copies.dia b/docs/images/swh-archive-copies.dia
new file mode 100644
index 00000000..bb64fb00
Binary files /dev/null and b/docs/images/swh-archive-copies.dia differ
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 00000000..502967a3
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,45 @@
+.. _swh-storage:
+
+Software Heritage - Storage
+===========================
+
+Abstraction layer over the archive, allowing access to all stored source code
+artifacts as well as their metadata.
+
+
+The Software Heritage storage consists of a high-level storage layer
+(:mod:`swh.storage`) that exposes a client/server API
+(:mod:`swh.storage.api`). The API is exposed by a server
+(:mod:`swh.storage.api.server`) and accessible via a client
+(:mod:`swh.storage.api.client`).
+
+The low-level implementation of the storage is split between an object storage
+(:ref:`swh.objstorage <swh-objstorage>`), which stores all "blobs" (i.e., the
+leaves of the :ref:`data-model`) and a SQL representation of the rest of the
+graph (:mod:`swh.storage.storage`).
+
+
+Database schema
+---------------
+
+* :ref:`sql-storage`
+
+
+Archive copies
+--------------
+
+* :ref:`archive-copies`
+
+Specifications
+--------------
+
+* :ref:`extrinsic-metadata-specification`
+
+
+Reference Documentation
+-----------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   /apidoc/swh.storage
diff --git a/docs/sql-storage.rst b/docs/sql-storage.rst
new file mode 100644
index 00000000..01cc2e61
--- /dev/null
+++ b/docs/sql-storage.rst
@@ -0,0 +1,16 @@
+:orphan:
+
+.. _sql-storage:
+
+SQL storage
+===========
+
+Postgres DB schema
+------------------
+
+.. _swh-storage-db-schema:
+.. figure:: ../sql/doc/sql/db-schema.svg
+   :width: 1024px
+   :align: center
+
+   Postgres DB schema of high-level Software Heritage storage (click to zoom).
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 00000000..99c0bcc6
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,60 @@
+[mypy]
+namespace_packages = True
+
+# due to the conditional import logic on swh.journal, in some cases a specific
+# type: ignore is needed, in others it isn't...
+warn_unused_ignores = False
+
+# support for sqlalchemy magic: see https://github.com/dropbox/sqlalchemy-stubs
+plugins = sqlmypy
+
+
+# 3rd party libraries without stubs (yet)
+
+[mypy-cassandra.*]
+ignore_missing_imports = True
+
+[mypy-confluent_kafka.*]
+ignore_missing_imports = True
+
+[mypy-deprecated.*]
+ignore_missing_imports = True
+
+# only shipped indirectly via hypothesis
+[mypy-django.*]
+ignore_missing_imports = True
+
+[mypy-msgpack.*]
+ignore_missing_imports = True
+
+[mypy-multiprocessing.util]
+ignore_missing_imports = True
+
+[mypy-pkg_resources.*]
+ignore_missing_imports = True
+
+[mypy-psycopg2.*]
+ignore_missing_imports = True
+
+[mypy-pytest.*]
+ignore_missing_imports = True
+
+[mypy-pytest_cov.*]
+ignore_missing_imports = True
+
+[mypy-pytest_kafka.*]
+ignore_missing_imports = True
+
+[mypy-systemd.daemon.*]
+ignore_missing_imports = True
+
+[mypy-tenacity.*]
+ignore_missing_imports = True
+
+# temporary work-around for landing typing support in spite of the current
+# journal<->storage dependency loop
+[mypy-swh.journal.*]
+ignore_missing_imports = True
+
+[mypy-pytest_postgresql.*]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
index 1f37b14f..72480105 100755
--- a/setup.py
+++ b/setup.py
@@ -1,77 +1,79 @@
 #!/usr/bin/env python3
-# Copyright (C) 2015-2018  The Software Heritage developers
+# Copyright (C) 2015-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from setuptools import setup, find_packages
 
 from os import path
 from io import open
 
 here = path.abspath(path.dirname(__file__))
 
 # Get the long description from the README file
 with open(path.join(here, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
 
 
 def parse_requirements(name=None):
     if name:
         reqf = "requirements-%s.txt" % name
     else:
         reqf = "requirements.txt"
 
     requirements = []
     if not path.exists(reqf):
         return requirements
 
     with open(reqf) as f:
         for line in f.readlines():
             line = line.strip()
             if not line or line.startswith("#"):
                 continue
             requirements.append(line)
     return requirements
 
 
 setup(
     name="swh.storage",
     description="Software Heritage storage manager",
     long_description=long_description,
     long_description_content_type="text/markdown",
     python_requires=">=3.7",
     author="Software Heritage developers",
     author_email="swh-devel@inria.fr",
     url="https://forge.softwareheritage.org/diffusion/DSTO/",
+    setup_requires=["setuptools-scm"],
     packages=find_packages(),
+    use_scm_version=True,
     scripts=["bin/swh-storage-add-dir",],
     entry_points="""
         [console_scripts]
         swh-storage=swh.storage.cli:main
         [swh.cli.subcommands]
         storage=swh.storage.cli:storage
+        [pytest11]
+        pytest_swh_storage=swh.storage.pytest_plugin
     """,
-    setup_requires=["vcversioner"],
     install_requires=parse_requirements() + parse_requirements("swh"),
     extras_require={
         "testing": (parse_requirements("test") + parse_requirements("swh-journal")),
         "schemata": ["SQLAlchemy"],
         "journal": parse_requirements("swh-journal"),
     },
-    vcversioner={},
     include_package_data=True,
     classifiers=[
         "Programming Language :: Python :: 3",
         "Intended Audience :: Developers",
         "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
         "Operating System :: OS Independent",
         "Development Status :: 5 - Production/Stable",
     ],
     project_urls={
         "Bug Reports": "https://forge.softwareheritage.org/maniphest",
         "Funding": "https://www.softwareheritage.org/donate",
         "Source": "https://forge.softwareheritage.org/source/swh-storage",
         "Documentation": "https://docs.softwareheritage.org/devel/swh-storage/",
     },
 )
diff --git a/swh.storage.egg-info/PKG-INFO b/swh.storage.egg-info/PKG-INFO
index d0aedf47..1976d1a4 100644
--- a/swh.storage.egg-info/PKG-INFO
+++ b/swh.storage.egg-info/PKG-INFO
@@ -1,216 +1,216 @@
 Metadata-Version: 2.1
 Name: swh.storage
-Version: 0.10.0
+Version: 0.10.1
 Summary: Software Heritage storage manager
 Home-page: https://forge.softwareheritage.org/diffusion/DSTO/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-storage
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-storage/
 Description: swh-storage
         ===========
         
         Abstraction layer over the archive, allowing to access all stored source code
         artifacts as well as their metadata.
         
         See the
         [documentation](https://docs.softwareheritage.org/devel/swh-storage/index.html)
         for more details.
         
         ## Quick start
         
         ### Dependencies
         
         Python tests for this module include tests that cannot be run without a local
         Postgresql database, so you need the Postgresql server executable on your
         machine (no need to have a running Postgresql server). They also expect a
         cassandra server.
         
         #### Debian-like host
         
         ```
         $ sudo apt install libpq-dev postgresql-11 cassandra
         ```
         
         #### Non Debian-like host
         
         The tests expects `/usr/sbin/cassandra` to exist.
         
         Optionally, you can avoid running the cassandra tests.
         
         ```
         (swh) :~/swh-storage$ tox -- -m 'not cassandra'
         ```
         
         ### Installation
         
         It is strongly recommended to use a virtualenv. In the following, we
         consider you work in a virtualenv named `swh`. See the
         [developer setup guide](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup)
         for a more details on how to setup a working environment.
         
         
         You can install the package directly from
         [pypi](https://pypi.org/p/swh.storage):
         
         ```
         (swh) :~$ pip install swh.storage
         [...]
         ```
         
         Or from sources:
         
         ```
         (swh) :~$ git clone https://forge.softwareheritage.org/source/swh-storage.git
         [...]
         (swh) :~$ cd swh-storage
         (swh) :~/swh-storage$ pip install .
         [...]
         ```
         
         Then you can check it's properly installed:
         ```
         (swh) :~$ swh storage --help
         Usage: swh storage [OPTIONS] COMMAND [ARGS]...
         
           Software Heritage Storage tools.
         
         Options:
           -h, --help  Show this message and exit.
         
         Commands:
           rpc-serve  Software Heritage Storage RPC server.
         ```
         
         
         ## Tests
         
         The best way of running Python tests for this module is to use
         [tox](https://tox.readthedocs.io/).
         
         ```
         (swh) :~$ pip install tox
         ```
         
         ### tox
         
         From the sources directory, simply use tox:
         
         ```
         (swh) :~/swh-storage$ tox
         [...]
         ========= 315 passed, 6 skipped, 15 warnings in 40.86 seconds ==========
         _______________________________ summary ________________________________
           flake8: commands succeeded
           py3: commands succeeded
           congratulations :)
         ```
         
         ## Development
         
         The storage server can be locally started. It requires a configuration file and
         a running Postgresql database.
         
         ### Sample configuration
         
         A typical configuration `storage.yml` file is:
         
         ```
         storage:
           cls: local
           args:
             db: "dbname=softwareheritage-dev user=<user> password=<pwd>"
             objstorage:
               cls: pathslicing
               args:
                 root: /tmp/swh-storage/
                 slicing: 0:2/2:4/4:6
         ```
         
         which means, this uses:
         
         - a local storage instance whose db connection is to
           `softwareheritage-dev` local instance,
         
         - the objstorage uses a local objstorage instance whose:
         
           - `root` path is /tmp/swh-storage,
         
           - slicing scheme is `0:2/2:4/4:6`. This means that the identifier of
             the content (sha1) which will be stored on disk at first level
             with the first 2 hex characters, the second level with the next 2
             hex characters and the third level with the next 2 hex
             characters. And finally the complete hash file holding the raw
             content. For example: 00062f8bd330715c4f819373653d97b3cd34394c
             will be stored at 00/06/2f/00062f8bd330715c4f819373653d97b3cd34394c
         
         Note that the `root` path should exist on disk before starting the server.
         
         
         ### Starting the storage server
         
         If the python package has been properly installed (e.g. in a virtual env), you
         should be able to use the command:
         
         ```
         (swh) :~/swh-storage$ swh storage rpc-serve storage.yml
         ```
         
         This runs a local swh-storage api at 5002 port.
         
         ```
         (swh) :~/swh-storage$ curl http://127.0.0.1:5002
         <html>
         <head><title>Software Heritage storage server</title></head>
         <body>
         <p>You have reached the
         <a href="https://www.softwareheritage.org/">Software Heritage</a>
         storage server.<br />
         See its
         <a href="https://docs.softwareheritage.org/devel/swh-storage/">documentation
         and API</a> for more information</p>
         ```
         
         ### And then what?
         
         In your upper layer
         ([loader-git](https://forge.softwareheritage.org/source/swh-loader-git/),
         [loader-svn](https://forge.softwareheritage.org/source/swh-loader-svn/),
         etc...), you can define a remote storage with this snippet of yaml
         configuration.
         
         ```
         storage:
           cls: remote
           args:
             url: http://localhost:5002/
         ```
         
         You could directly define a local storage with the following snippet:
         
         ```
         storage:
           cls: local
           args:
             db: service=swh-dev
             objstorage:
               cls: pathslicing
               args:
                 root: /home/storage/swh-storage/
                 slicing: 0:2/2:4/4:6
         ```
         
 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 Provides-Extra: schemata
 Provides-Extra: journal
diff --git a/swh.storage.egg-info/SOURCES.txt b/swh.storage.egg-info/SOURCES.txt
index 623fcdc1..6827b03e 100644
--- a/swh.storage.egg-info/SOURCES.txt
+++ b/swh.storage.egg-info/SOURCES.txt
@@ -1,266 +1,293 @@
+.gitignore
+.pre-commit-config.yaml
+AUTHORS
+CODE_OF_CONDUCT.md
+CONTRIBUTORS
+LICENSE
 MANIFEST.in
 Makefile
 Makefile.local
 README.md
+mypy.ini
 pyproject.toml
 pytest.ini
+requirements-swh-journal.txt
+requirements-swh.txt
+requirements-test.txt
+requirements.txt
 setup.cfg
 setup.py
 tox.ini
 version.txt
 ./requirements-swh-journal.txt
 ./requirements-swh.txt
 ./requirements-test.txt
 ./requirements.txt
 bin/swh-storage-add-dir
+docs/.gitignore
+docs/Makefile
+docs/Makefile.local
+docs/archive-copies.rst
+docs/conf.py
+docs/extrinsic-metadata-specification.rst
+docs/index.rst
+docs/sql-storage.rst
+docs/_static/.placeholder
+docs/_templates/.placeholder
+docs/images/.gitignore
+docs/images/Makefile
+docs/images/swh-archive-copies.dia
 sql/.gitignore
 sql/Makefile
 sql/TODO
 sql/clusters.dot
 sql/bin/db-upgrade
 sql/bin/dot_add_content
+sql/doc/json
 sql/doc/json/.gitignore
 sql/doc/json/Makefile
 sql/doc/json/entity.lister_metadata.schema.json
 sql/doc/json/entity.metadata.schema.json
 sql/doc/json/entity_history.lister_metadata.schema.json
 sql/doc/json/entity_history.metadata.schema.json
 sql/doc/json/fetch_history.result.schema.json
 sql/doc/json/list_history.result.schema.json
 sql/doc/json/listable_entity.list_params.schema.json
 sql/doc/json/origin_visit.metadata.json
 sql/doc/json/tool.tool_configuration.schema.json
 sql/json/.gitignore
 sql/json/Makefile
 sql/json/entity.lister_metadata.schema.json
 sql/json/entity.metadata.schema.json
 sql/json/entity_history.lister_metadata.schema.json
 sql/json/entity_history.metadata.schema.json
 sql/json/fetch_history.result.schema.json
 sql/json/list_history.result.schema.json
 sql/json/listable_entity.list_params.schema.json
 sql/json/origin_visit.metadata.json
 sql/json/tool.tool_configuration.schema.json
 sql/upgrades/015.sql
 sql/upgrades/016.sql
 sql/upgrades/017.sql
 sql/upgrades/018.sql
 sql/upgrades/019.sql
 sql/upgrades/020.sql
 sql/upgrades/021.sql
 sql/upgrades/022.sql
 sql/upgrades/023.sql
 sql/upgrades/024.sql
 sql/upgrades/025.sql
 sql/upgrades/026.sql
 sql/upgrades/027.sql
 sql/upgrades/028.sql
 sql/upgrades/029.sql
 sql/upgrades/030.sql
 sql/upgrades/032.sql
 sql/upgrades/033.sql
 sql/upgrades/034.sql
 sql/upgrades/035.sql
 sql/upgrades/036.sql
 sql/upgrades/037.sql
 sql/upgrades/038.sql
 sql/upgrades/039.sql
 sql/upgrades/040.sql
 sql/upgrades/041.sql
 sql/upgrades/042.sql
 sql/upgrades/043.sql
 sql/upgrades/044.sql
 sql/upgrades/045.sql
 sql/upgrades/046.sql
 sql/upgrades/047.sql
 sql/upgrades/048.sql
 sql/upgrades/049.sql
 sql/upgrades/050.sql
 sql/upgrades/051.sql
 sql/upgrades/052.sql
 sql/upgrades/053.sql
 sql/upgrades/054.sql
 sql/upgrades/055.sql
 sql/upgrades/056.sql
 sql/upgrades/057.sql
 sql/upgrades/058.sql
 sql/upgrades/059.sql
 sql/upgrades/060.sql
 sql/upgrades/061.sql
 sql/upgrades/062.sql
 sql/upgrades/063.sql
 sql/upgrades/064.sql
 sql/upgrades/065.sql
 sql/upgrades/066.sql
 sql/upgrades/067.sql
 sql/upgrades/068.sql
 sql/upgrades/069.sql
 sql/upgrades/070.sql
 sql/upgrades/071.sql
 sql/upgrades/072.sql
 sql/upgrades/073.sql
 sql/upgrades/074.sql
 sql/upgrades/075.sql
 sql/upgrades/076.sql
 sql/upgrades/077.sql
 sql/upgrades/078.sql
 sql/upgrades/079.sql
 sql/upgrades/080.sql
 sql/upgrades/081.sql
 sql/upgrades/082.sql
 sql/upgrades/083.sql
 sql/upgrades/084.sql
 sql/upgrades/085.sql
 sql/upgrades/086.sql
 sql/upgrades/087.sql
 sql/upgrades/088.sql
 sql/upgrades/089.sql
 sql/upgrades/090.sql
 sql/upgrades/091.sql
 sql/upgrades/092.sql
 sql/upgrades/093.sql
 sql/upgrades/094.sql
 sql/upgrades/095.sql
 sql/upgrades/096.sql
 sql/upgrades/097.sql
 sql/upgrades/098.sql
 sql/upgrades/099.sql
 sql/upgrades/100.sql
 sql/upgrades/101.sql
 sql/upgrades/102.sql
 sql/upgrades/103.sql
 sql/upgrades/104.sql
 sql/upgrades/105.sql
 sql/upgrades/106.sql
 sql/upgrades/107.sql
 sql/upgrades/108.sql
 sql/upgrades/109.sql
 sql/upgrades/110.sql
 sql/upgrades/111.sql
 sql/upgrades/112.sql
 sql/upgrades/113.sql
 sql/upgrades/114.sql
 sql/upgrades/115.sql
 sql/upgrades/116.sql
 sql/upgrades/117.sql
 sql/upgrades/118.sql
 sql/upgrades/119.sql
 sql/upgrades/120.sql
 sql/upgrades/121.sql
 sql/upgrades/122.sql
 sql/upgrades/123.sql
 sql/upgrades/124.sql
 sql/upgrades/125.sql
 sql/upgrades/126.sql
 sql/upgrades/127.sql
 sql/upgrades/128.sql
 sql/upgrades/129.sql
 sql/upgrades/130.sql
 sql/upgrades/131.sql
 sql/upgrades/132.sql
 sql/upgrades/133.sql
 sql/upgrades/134.sql
 sql/upgrades/135.sql
 sql/upgrades/136.sql
 sql/upgrades/137.sql
 sql/upgrades/138.sql
 sql/upgrades/139.sql
 sql/upgrades/140.sql
 sql/upgrades/141.sql
 sql/upgrades/142.sql
 sql/upgrades/143.sql
 sql/upgrades/144.sql
 sql/upgrades/145.sql
 sql/upgrades/146.sql
 sql/upgrades/147.sql
 sql/upgrades/148.sql
 sql/upgrades/149.sql
 sql/upgrades/150.sql
 sql/upgrades/151.sql
 sql/upgrades/152.sql
 sql/upgrades/153.sql
 sql/upgrades/154.sql
 sql/upgrades/155.sql
 sql/upgrades/156.sql
 sql/upgrades/157.sql
 sql/upgrades/158.sql
 swh/__init__.py
 swh.storage.egg-info/PKG-INFO
 swh.storage.egg-info/SOURCES.txt
 swh.storage.egg-info/dependency_links.txt
 swh.storage.egg-info/entry_points.txt
 swh.storage.egg-info/requires.txt
 swh.storage.egg-info/top_level.txt
 swh/storage/__init__.py
 swh/storage/backfill.py
 swh/storage/buffer.py
 swh/storage/cli.py
 swh/storage/common.py
 swh/storage/converters.py
 swh/storage/db.py
 swh/storage/exc.py
 swh/storage/extrinsic_metadata.py
 swh/storage/filter.py
 swh/storage/fixer.py
 swh/storage/in_memory.py
 swh/storage/interface.py
 swh/storage/metrics.py
 swh/storage/objstorage.py
 swh/storage/py.typed
+swh/storage/pytest_plugin.py
 swh/storage/replay.py
 swh/storage/retry.py
 swh/storage/storage.py
 swh/storage/utils.py
 swh/storage/validate.py
 swh/storage/writer.py
 swh/storage/algos/__init__.py
 swh/storage/algos/diff.py
 swh/storage/algos/dir_iterators.py
 swh/storage/algos/origin.py
 swh/storage/algos/revisions_walker.py
 swh/storage/algos/snapshot.py
 swh/storage/api/__init__.py
 swh/storage/api/client.py
 swh/storage/api/serializers.py
 swh/storage/api/server.py
 swh/storage/cassandra/__init__.py
 swh/storage/cassandra/common.py
 swh/storage/cassandra/converters.py
 swh/storage/cassandra/cql.py
 swh/storage/cassandra/schema.py
 swh/storage/cassandra/storage.py
 swh/storage/sql/10-swh-init.sql
 swh/storage/sql/20-swh-enums.sql
 swh/storage/sql/30-swh-schema.sql
 swh/storage/sql/40-swh-func.sql
 swh/storage/sql/60-swh-indexes.sql
 swh/storage/tests/__init__.py
 swh/storage/tests/conftest.py
 swh/storage/tests/generate_data_test.py
 swh/storage/tests/storage_data.py
 swh/storage/tests/test_api_client.py
 swh/storage/tests/test_backfill.py
 swh/storage/tests/test_buffer.py
 swh/storage/tests/test_cassandra.py
 swh/storage/tests/test_cassandra_converters.py
 swh/storage/tests/test_cli.py
 swh/storage/tests/test_converters.py
 swh/storage/tests/test_db.py
 swh/storage/tests/test_exception.py
 swh/storage/tests/test_filter.py
 swh/storage/tests/test_in_memory.py
 swh/storage/tests/test_init.py
 swh/storage/tests/test_kafka_writer.py
 swh/storage/tests/test_metrics.py
 swh/storage/tests/test_replay.py
 swh/storage/tests/test_retry.py
 swh/storage/tests/test_revision_bw_compat.py
 swh/storage/tests/test_server.py
 swh/storage/tests/test_storage.py
 swh/storage/tests/test_utils.py
 swh/storage/tests/algos/__init__.py
 swh/storage/tests/algos/test_diff.py
 swh/storage/tests/algos/test_dir_iterator.py
 swh/storage/tests/algos/test_origin.py
 swh/storage/tests/algos/test_revisions_walker.py
-swh/storage/tests/algos/test_snapshot.py
\ No newline at end of file
+swh/storage/tests/algos/test_snapshot.py
+swh/storage/tests/data/storage.yml
\ No newline at end of file
diff --git a/swh.storage.egg-info/entry_points.txt b/swh.storage.egg-info/entry_points.txt
index a3379a55..c1dba848 100644
--- a/swh.storage.egg-info/entry_points.txt
+++ b/swh.storage.egg-info/entry_points.txt
@@ -1,6 +1,8 @@
 
         [console_scripts]
         swh-storage=swh.storage.cli:main
         [swh.cli.subcommands]
         storage=swh.storage.cli:storage
+        [pytest11]
+        pytest_swh_storage=swh.storage.pytest_plugin
     
\ No newline at end of file
diff --git a/swh/storage/tests/conftest.py b/swh/storage/pytest_plugin.py
similarity index 74%
copy from swh/storage/tests/conftest.py
copy to swh/storage/pytest_plugin.py
index 52d9b4f9..1b010923 100644
--- a/swh/storage/tests/conftest.py
+++ b/swh/storage/pytest_plugin.py
@@ -1,272 +1,208 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import glob
-import pytest
-import multiprocessing.util
-from typing import Union
 
 from os import path, environ
-from hypothesis import settings
-from typing import Dict
+from typing import Dict, Union
 
-try:
-    import pytest_cov.embed
-except ImportError:
-    pytest_cov = None
+import pytest
+
+import swh.storage
 
 from pytest_postgresql import factories
 from pytest_postgresql.janitor import DatabaseJanitor, psycopg2, Version
 
-import swh.storage
-
 from swh.core.utils import numfile_sortkey as sortkey
-from swh.model.tests.generate_testdata import gen_contents, gen_origins
-from swh.model.model import (
-    Content,
-    Directory,
-    Origin,
-    OriginVisit,
-    Release,
-    Revision,
-    SkippedContent,
-    Snapshot,
-)
-
-
-OBJECT_FACTORY = {
-    "content": Content.from_dict,
-    "directory": Directory.from_dict,
-    "origin": Origin.from_dict,
-    "origin_visit": OriginVisit.from_dict,
-    "release": Release.from_dict,
-    "revision": Revision.from_dict,
-    "skipped_content": SkippedContent.from_dict,
-    "snapshot": Snapshot.from_dict,
-}
+from swh.storage import get_storage
+from swh.storage.tests.storage_data import data
+
 
 SQL_DIR = path.join(path.dirname(swh.storage.__file__), "sql")
 
 environ["LC_ALL"] = "C.UTF-8"
 
 DUMP_FILES = path.join(SQL_DIR, "*.sql")
 
-# define tests profile. Full documentation is at:
-# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
-settings.register_profile("fast", max_examples=5, deadline=5000)
-settings.register_profile("slow", max_examples=20, deadline=5000)
-
-
-if pytest_cov is not None:
-    # pytest_cov + multiprocessing can cause a segmentation fault when starting
-    # the child process <https://forge.softwareheritage.org/P706>; so we're
-    # removing pytest-coverage's hook that runs when a child process starts.
-    # This means code run in child processes won't be counted in the coverage
-    # report, but this is not an issue because the only code that runs only in
-    # child processes is the RPC server.
-    for (key, value) in multiprocessing.util._afterfork_registry.items():
-        if value is pytest_cov.embed.multiprocessing_start:
-            del multiprocessing.util._afterfork_registry[key]
-            break
-    else:
-        assert False, "missing pytest_cov.embed.multiprocessing_start?"
-
 
 @pytest.fixture
 def swh_storage_backend_config(postgresql_proc, swh_storage_postgresql):
     yield {
         "cls": "local",
         "db": "postgresql://{user}@{host}:{port}/{dbname}".format(
             host=postgresql_proc.host,
             port=postgresql_proc.port,
             user="postgres",
             dbname="tests",
         ),
         "objstorage": {"cls": "memory", "args": {}},
         "journal_writer": {"cls": "memory",},
     }
 
 
 @pytest.fixture
 def swh_storage(swh_storage_backend_config):
-    return swh.storage.get_storage(cls="validate", storage=swh_storage_backend_config)
-
-
-@pytest.fixture
-def swh_contents(swh_storage):
-    contents = gen_contents(n=20)
-    swh_storage.content_add([c for c in contents if c["status"] != "absent"])
-    swh_storage.skipped_content_add([c for c in contents if c["status"] == "absent"])
-    return contents
-
-
-@pytest.fixture
-def swh_origins(swh_storage):
-    origins = gen_origins(n=100)
-    swh_storage.origin_add(origins)
-    return origins
+    return get_storage(cls="validate", storage=swh_storage_backend_config)
 
 
 # the postgres_fact factory fixture below is mostly a copy of the code
 # from pytest-postgresql. We need a custom version here to be able to
 # specify our version of the DBJanitor we use.
 def postgresql_fact(process_fixture_name, db_name=None, dump_files=DUMP_FILES):
     @pytest.fixture
     def postgresql_factory(request):
         """
         Fixture factory for PostgreSQL.
 
         :param FixtureRequest request: fixture request object
         :rtype: psycopg2.connection
         :returns: postgresql client
         """
         config = factories.get_config(request)
         if not psycopg2:
             raise ImportError("No module named psycopg2. Please install it.")
         proc_fixture = request.getfixturevalue(process_fixture_name)
 
         # _, config = try_import('psycopg2', request)
         pg_host = proc_fixture.host
         pg_port = proc_fixture.port
         pg_user = proc_fixture.user
         pg_options = proc_fixture.options
         pg_db = db_name or config["dbname"]
         with SwhDatabaseJanitor(
             pg_user,
             pg_host,
             pg_port,
             pg_db,
             proc_fixture.version,
             dump_files=dump_files,
         ):
             connection = psycopg2.connect(
                 dbname=pg_db,
                 user=pg_user,
                 host=pg_host,
                 port=pg_port,
                 options=pg_options,
             )
             yield connection
             connection.close()
 
     return postgresql_factory
 
 
 swh_storage_postgresql = postgresql_fact("postgresql_proc")
 
 
 # This version of the DatabaseJanitor implement a different setup/teardown
 # behavior than than the stock one: instead of dropping, creating and
 # initializing the database for each test, it create and initialize the db only
 # once, then it truncate the tables. This is needed to have acceptable test
 # performances.
 class SwhDatabaseJanitor(DatabaseJanitor):
     def __init__(
         self,
         user: str,
         host: str,
         port: str,
         db_name: str,
         version: Union[str, float, Version],
         dump_files: str = DUMP_FILES,
     ) -> None:
         super().__init__(user, host, port, db_name, version)
         self.dump_files = sorted(glob.glob(dump_files), key=sortkey)
 
     def db_setup(self):
         with psycopg2.connect(
             dbname=self.db_name, user=self.user, host=self.host, port=self.port,
         ) as cnx:
             with cnx.cursor() as cur:
                 for fname in self.dump_files:
                     with open(fname) as fobj:
                         sql = fobj.read().replace("concurrently", "").strip()
                         if sql:
                             cur.execute(sql)
             cnx.commit()
 
     def db_reset(self):
         with psycopg2.connect(
             dbname=self.db_name, user=self.user, host=self.host, port=self.port,
         ) as cnx:
             with cnx.cursor() as cur:
                 cur.execute(
                     "SELECT table_name FROM information_schema.tables "
                     "WHERE table_schema = %s",
                     ("public",),
                 )
                 tables = set(table for (table,) in cur.fetchall())
                 for table in tables:
                     cur.execute("truncate table %s cascade" % table)
 
                 cur.execute(
                     "SELECT sequence_name FROM information_schema.sequences "
                     "WHERE sequence_schema = %s",
                     ("public",),
                 )
                 seqs = set(seq for (seq,) in cur.fetchall())
                 for seq in seqs:
                     cur.execute("ALTER SEQUENCE %s RESTART;" % seq)
             cnx.commit()
 
     def init(self):
         with self.cursor() as cur:
             cur.execute(
                 "SELECT COUNT(1) FROM pg_database WHERE datname=%s;", (self.db_name,)
             )
             db_exists = cur.fetchone()[0] == 1
             if db_exists:
                 cur.execute(
                     "UPDATE pg_database SET datallowconn=true " "WHERE datname = %s;",
                     (self.db_name,),
                 )
 
         if db_exists:
             self.db_reset()
         else:
             with self.cursor() as cur:
                 cur.execute('CREATE DATABASE "{}";'.format(self.db_name))
             self.db_setup()
 
     def drop(self):
         pid_column = "pid"
         with self.cursor() as cur:
             cur.execute(
                 "UPDATE pg_database SET datallowconn=false " "WHERE datname = %s;",
                 (self.db_name,),
             )
             cur.execute(
                 "SELECT pg_terminate_backend(pg_stat_activity.{})"
                 "FROM pg_stat_activity "
                 "WHERE pg_stat_activity.datname = %s;".format(pid_column),
                 (self.db_name,),
             )
 
 
 @pytest.fixture
 def sample_data() -> Dict:
     """Pre-defined sample storage object data to manipulate
 
     Returns:
         Dict of data (keys: content, directory, revision, release, person,
         origin)
 
     """
-    from .storage_data import data
-
     return {
         "content": [data.cont, data.cont2],
         "content_metadata": [data.cont3],
         "skipped_content": [data.skipped_cont, data.skipped_cont2],
         "person": [data.person],
         "directory": [data.dir2, data.dir],
         "revision": [data.revision, data.revision2, data.revision3],
         "release": [data.release, data.release2, data.release3],
         "snapshot": [data.snapshot],
         "origin": [data.origin, data.origin2],
         "fetcher": [data.metadata_fetcher],
         "authority": [data.metadata_authority],
         "origin_metadata": [data.origin_metadata, data.origin_metadata2],
     }
diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py
index 52d9b4f9..7598d9a1 100644
--- a/swh/storage/tests/conftest.py
+++ b/swh/storage/tests/conftest.py
@@ -1,272 +1,75 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import glob
 import pytest
 import multiprocessing.util
-from typing import Union
 
-from os import path, environ
 from hypothesis import settings
-from typing import Dict
 
 try:
     import pytest_cov.embed
 except ImportError:
     pytest_cov = None
 
-from pytest_postgresql import factories
-from pytest_postgresql.janitor import DatabaseJanitor, psycopg2, Version
-
-import swh.storage
-
-from swh.core.utils import numfile_sortkey as sortkey
 from swh.model.tests.generate_testdata import gen_contents, gen_origins
 from swh.model.model import (
     Content,
     Directory,
     Origin,
     OriginVisit,
     Release,
     Revision,
     SkippedContent,
     Snapshot,
 )
+from swh.storage.pytest_plugin import *  # noqa  # for backward compatibility
 
 
 OBJECT_FACTORY = {
     "content": Content.from_dict,
     "directory": Directory.from_dict,
     "origin": Origin.from_dict,
     "origin_visit": OriginVisit.from_dict,
     "release": Release.from_dict,
     "revision": Revision.from_dict,
     "skipped_content": SkippedContent.from_dict,
     "snapshot": Snapshot.from_dict,
 }
 
-SQL_DIR = path.join(path.dirname(swh.storage.__file__), "sql")
-
-environ["LC_ALL"] = "C.UTF-8"
-
-DUMP_FILES = path.join(SQL_DIR, "*.sql")
 
 # define tests profile. Full documentation is at:
 # https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
 settings.register_profile("fast", max_examples=5, deadline=5000)
 settings.register_profile("slow", max_examples=20, deadline=5000)
 
 
 if pytest_cov is not None:
     # pytest_cov + multiprocessing can cause a segmentation fault when starting
     # the child process <https://forge.softwareheritage.org/P706>; so we're
     # removing pytest-coverage's hook that runs when a child process starts.
     # This means code run in child processes won't be counted in the coverage
     # report, but this is not an issue because the only code that runs only in
     # child processes is the RPC server.
     for (key, value) in multiprocessing.util._afterfork_registry.items():
         if value is pytest_cov.embed.multiprocessing_start:
             del multiprocessing.util._afterfork_registry[key]
             break
     else:
         assert False, "missing pytest_cov.embed.multiprocessing_start?"
 
 
-@pytest.fixture
-def swh_storage_backend_config(postgresql_proc, swh_storage_postgresql):
-    yield {
-        "cls": "local",
-        "db": "postgresql://{user}@{host}:{port}/{dbname}".format(
-            host=postgresql_proc.host,
-            port=postgresql_proc.port,
-            user="postgres",
-            dbname="tests",
-        ),
-        "objstorage": {"cls": "memory", "args": {}},
-        "journal_writer": {"cls": "memory",},
-    }
-
-
-@pytest.fixture
-def swh_storage(swh_storage_backend_config):
-    return swh.storage.get_storage(cls="validate", storage=swh_storage_backend_config)
-
-
 @pytest.fixture
 def swh_contents(swh_storage):
     contents = gen_contents(n=20)
     swh_storage.content_add([c for c in contents if c["status"] != "absent"])
     swh_storage.skipped_content_add([c for c in contents if c["status"] == "absent"])
     return contents
 
 
 @pytest.fixture
 def swh_origins(swh_storage):
     origins = gen_origins(n=100)
     swh_storage.origin_add(origins)
     return origins
-
-
-# the postgres_fact factory fixture below is mostly a copy of the code
-# from pytest-postgresql. We need a custom version here to be able to
-# specify our version of the DBJanitor we use.
-def postgresql_fact(process_fixture_name, db_name=None, dump_files=DUMP_FILES):
-    @pytest.fixture
-    def postgresql_factory(request):
-        """
-        Fixture factory for PostgreSQL.
-
-        :param FixtureRequest request: fixture request object
-        :rtype: psycopg2.connection
-        :returns: postgresql client
-        """
-        config = factories.get_config(request)
-        if not psycopg2:
-            raise ImportError("No module named psycopg2. Please install it.")
-        proc_fixture = request.getfixturevalue(process_fixture_name)
-
-        # _, config = try_import('psycopg2', request)
-        pg_host = proc_fixture.host
-        pg_port = proc_fixture.port
-        pg_user = proc_fixture.user
-        pg_options = proc_fixture.options
-        pg_db = db_name or config["dbname"]
-        with SwhDatabaseJanitor(
-            pg_user,
-            pg_host,
-            pg_port,
-            pg_db,
-            proc_fixture.version,
-            dump_files=dump_files,
-        ):
-            connection = psycopg2.connect(
-                dbname=pg_db,
-                user=pg_user,
-                host=pg_host,
-                port=pg_port,
-                options=pg_options,
-            )
-            yield connection
-            connection.close()
-
-    return postgresql_factory
-
-
-swh_storage_postgresql = postgresql_fact("postgresql_proc")
-
-
-# This version of the DatabaseJanitor implement a different setup/teardown
-# behavior than than the stock one: instead of dropping, creating and
-# initializing the database for each test, it create and initialize the db only
-# once, then it truncate the tables. This is needed to have acceptable test
-# performances.
-class SwhDatabaseJanitor(DatabaseJanitor):
-    def __init__(
-        self,
-        user: str,
-        host: str,
-        port: str,
-        db_name: str,
-        version: Union[str, float, Version],
-        dump_files: str = DUMP_FILES,
-    ) -> None:
-        super().__init__(user, host, port, db_name, version)
-        self.dump_files = sorted(glob.glob(dump_files), key=sortkey)
-
-    def db_setup(self):
-        with psycopg2.connect(
-            dbname=self.db_name, user=self.user, host=self.host, port=self.port,
-        ) as cnx:
-            with cnx.cursor() as cur:
-                for fname in self.dump_files:
-                    with open(fname) as fobj:
-                        sql = fobj.read().replace("concurrently", "").strip()
-                        if sql:
-                            cur.execute(sql)
-            cnx.commit()
-
-    def db_reset(self):
-        with psycopg2.connect(
-            dbname=self.db_name, user=self.user, host=self.host, port=self.port,
-        ) as cnx:
-            with cnx.cursor() as cur:
-                cur.execute(
-                    "SELECT table_name FROM information_schema.tables "
-                    "WHERE table_schema = %s",
-                    ("public",),
-                )
-                tables = set(table for (table,) in cur.fetchall())
-                for table in tables:
-                    cur.execute("truncate table %s cascade" % table)
-
-                cur.execute(
-                    "SELECT sequence_name FROM information_schema.sequences "
-                    "WHERE sequence_schema = %s",
-                    ("public",),
-                )
-                seqs = set(seq for (seq,) in cur.fetchall())
-                for seq in seqs:
-                    cur.execute("ALTER SEQUENCE %s RESTART;" % seq)
-            cnx.commit()
-
-    def init(self):
-        with self.cursor() as cur:
-            cur.execute(
-                "SELECT COUNT(1) FROM pg_database WHERE datname=%s;", (self.db_name,)
-            )
-            db_exists = cur.fetchone()[0] == 1
-            if db_exists:
-                cur.execute(
-                    "UPDATE pg_database SET datallowconn=true " "WHERE datname = %s;",
-                    (self.db_name,),
-                )
-
-        if db_exists:
-            self.db_reset()
-        else:
-            with self.cursor() as cur:
-                cur.execute('CREATE DATABASE "{}";'.format(self.db_name))
-            self.db_setup()
-
-    def drop(self):
-        pid_column = "pid"
-        with self.cursor() as cur:
-            cur.execute(
-                "UPDATE pg_database SET datallowconn=false " "WHERE datname = %s;",
-                (self.db_name,),
-            )
-            cur.execute(
-                "SELECT pg_terminate_backend(pg_stat_activity.{})"
-                "FROM pg_stat_activity "
-                "WHERE pg_stat_activity.datname = %s;".format(pid_column),
-                (self.db_name,),
-            )
-
-
-@pytest.fixture
-def sample_data() -> Dict:
-    """Pre-defined sample storage object data to manipulate
-
-    Returns:
-        Dict of data (keys: content, directory, revision, release, person,
-        origin)
-
-    """
-    from .storage_data import data
-
-    return {
-        "content": [data.cont, data.cont2],
-        "content_metadata": [data.cont3],
-        "skipped_content": [data.skipped_cont, data.skipped_cont2],
-        "person": [data.person],
-        "directory": [data.dir2, data.dir],
-        "revision": [data.revision, data.revision2, data.revision3],
-        "release": [data.release, data.release2, data.release3],
-        "snapshot": [data.snapshot],
-        "origin": [data.origin, data.origin2],
-        "fetcher": [data.metadata_fetcher],
-        "authority": [data.metadata_authority],
-        "origin_metadata": [data.origin_metadata, data.origin_metadata2],
-    }
diff --git a/swh/storage/tests/data/storage.yml b/swh/storage/tests/data/storage.yml
new file mode 100644
index 00000000..97f9f67b
--- /dev/null
+++ b/swh/storage/tests/data/storage.yml
@@ -0,0 +1,13 @@
+storage:
+  cls: local
+  args:
+    db: dbname=%s
+
+    objstorage:
+      cls: pathslicing
+      args:
+        root: TMPDIR
+        slicing: "0:1/1:5"
+
+    journal_writer:
+      cls: inmemory