diff --git a/PKG-INFO b/PKG-INFO index 23b2f2b..a5473b6 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,36 +1,42 @@ Metadata-Version: 2.1 Name: swh.vault -Version: 1.3.0 +Version: 1.4.2 Summary: Software Heritage vault Home-page: https://forge.softwareheritage.org/diffusion/DVAU/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-vault Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-vault/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 -Description-Content-Type: text/markdown +Description-Content-Type: text/x-rst Provides-Extra: testing Provides-Extra: graph License-File: LICENSE License-File: AUTHORS -swh-vault -========= +Software Heritage - Vault +========================= User-facing service that allows to retrieve parts of the archive as -self-contained bundles. +self-contained bundles (e.g., individual releases, entire repository snapshots, +etc.) +The creation of a bundle is called "cooking" a bundle. -See the -[documentation](https://docs.softwareheritage.org/devel/swh-vault/index.html) -for more details. +Architecture +------------ + +The vault is made of two main parts: + +1. a stateful RPC server called the **backend** +2. Celery tasks, called **cookers** diff --git a/README.md b/README.md deleted file mode 100644 index d1bb762..0000000 --- a/README.md +++ /dev/null @@ -1,9 +0,0 @@ -swh-vault -========= - -User-facing service that allows to retrieve parts of the archive as -self-contained bundles. - -See the -[documentation](https://docs.softwareheritage.org/devel/swh-vault/index.html) -for more details. diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..f9aee20 --- /dev/null +++ b/README.rst @@ -0,0 +1,15 @@ +Software Heritage - Vault +========================= + +User-facing service that allows to retrieve parts of the archive as +self-contained bundles (e.g., individual releases, entire repository snapshots, +etc.) +The creation of a bundle is called "cooking" a bundle. + +Architecture +------------ + +The vault is made of two main parts: + +1. a stateful RPC server called the **backend** +2. Celery tasks, called **cookers** diff --git a/debian/changelog b/debian/changelog index 17257c3..46faed9 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,510 +1,512 @@ -swh-vault (1.3.0-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-vault (1.4.2-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 1.4.2 - (tagged by Valentin Lorentz + on 2022-02-08 11:43:01 +0100) + * Upstream changes: - v1.4.2 - * Fix PyPI upload - -- Software Heritage autobuilder (on jenkins-debian1) Tue, 16 Nov 2021 14:06:52 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Tue, 08 Feb 2022 10:45:44 +0000 swh-vault (1.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.0 - (tagged by Valentin Lorentz on 2021-11-16 15:02:00 +0100) * Upstream changes: - v1.3.0 - * docs: Various fixes - * Remove references to swh.model.identifiers - * git_bare: Send progress updates while cooking -- Software Heritage autobuilder (on jenkins-debian1) Tue, 16 Nov 2021 14:05:28 +0000 swh-vault (1.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.0 - (tagged by Valentin Lorentz on 2021-09-17 17:44:51 +0200) * Upstream changes: - v1.2.0 - * Add support for custom SMTP configuration - * Add the whole traceback in error messages. - * Make object corruption non-fatal. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Sep 2021 15:47:52 +0000 swh-vault (1.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.1.0 - (tagged by Valentin Lorentz on 2021-09-08 14:29:18 +0200) * Upstream changes: - v1.1.0 - * git_bare: Remove sample git hooks from output - * git_bare: Fix crash on submodules -- Software Heritage autobuilder (on jenkins-debian1) Wed, 08 Sep 2021 12:32:28 +0000 swh-vault (1.0.2-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.2 - (tagged by Valentin Lorentz on 2021-08-26 14:23:53 +0200) * Upstream changes: - v1.0.2 - * Fix compatibility with dulwich 0.19.11 - (needed for builds on debian 10) -- Software Heritage autobuilder (on jenkins-debian1) Thu, 26 Aug 2021 12:27:30 +0000 swh-vault (1.0.1-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.1 - (tagged by Valentin Lorentz on 2021-08-26 14:13:53 +0200) * Upstream changes: - v1.0.1 - * Re-add pytest.mark.graph to fix debian builds. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 26 Aug 2021 12:17:30 +0000 swh-vault (1.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.0 - (tagged by Valentin Lorentz on 2021-08-26 11:54:21 +0200) * Upstream changes: - v1.0.0 - * Feature-complete git-bare cooker - * Rename bundle types and use SWHIDs everywhere instead of raw sha1_git -- Software Heritage autobuilder (on jenkins-debian1) Thu, 26 Aug 2021 09:57:07 +0000 swh-vault (0.6.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.4 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-29 13:18:27 +0200) * Upstream changes: - v0.6.4 - Fix tests when the umask is not 022 - tests: Fix support of Dulwich < 0.20 - conftest: Use postgresql keyword for the configuration -- Software Heritage autobuilder (on jenkins-debian1) Tue, 29 Jun 2021 11:20:54 +0000 swh-vault (0.6.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.3 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-29 11:55:43 +0200) * Upstream changes: - v0.6.3 - git_bare: Add support for filtered content with Git >= 2.21 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 29 Jun 2021 09:59:47 +0000 swh-vault (0.6.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.2 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-29 11:08:44 +0200) * Upstream changes: - v0.6.2 - Make swh.graph dependency optional 2/2 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 29 Jun 2021 09:12:28 +0000 swh-vault (0.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-29 10:12:19 +0200) * Upstream changes: - v0.6.1 - Make swh.graph dependency optional -- Software Heritage autobuilder (on jenkins-debian1) Tue, 29 Jun 2021 08:15:52 +0000 swh-vault (0.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-28 12:18:51 +0200) * Upstream changes: - v0.6.0 - git_bare: Add support for skipped/missing/absent/hidden contents - git_bare: Optionally access the objstorage directly - git_bare: Use batched content_get() instead of content_find() - git_bare: Use directory_get_entries instead of directory_ls, it should be faster - git_bare: Refactor the graph descent using explicit stacks instead of the call stack. - git_bare: When possible, use swh-graph instead of swh-storage to query revision - history - git_bare: Deduplicate object downloads and writes - Add a naive git bare cooker - cli: Add 'cook' command, to run cookers without Celery - tests: Run all directory tests on the gitfast cooker - tests: Add in_memory_backend.py - tests: Make test_directory_bogus_perms/test_revision_bogus_perms/ actually test the - cookers -- Software Heritage autobuilder (on jenkins-debian1) Mon, 28 Jun 2021 10:25:48 +0000 swh-vault (0.5.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.1 - (tagged by Antoine Lambert on 2021-04-29 14:42:43 +0200) * Upstream changes: - version 0.5.1 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 29 Apr 2021 12:48:13 +0000 swh-vault (0.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-12-08 15:58:26 +0100) * Upstream changes: - v0.5.0 - vault: Remove deprecated services default config - cli: Remove deprecated logging configuration -- Software Heritage autobuilder (on jenkins-debian1) Tue, 08 Dec 2020 15:01:11 +0000 swh-vault (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-23 13:50:22 +0100) * Upstream changes: - v0.4.0 - requirements-test.txt: Drop no longer needed test dependency - swh.vault.tests.conftest: Drop dead code -- Software Heritage autobuilder (on jenkins-debian1) Mon, 23 Nov 2020 12:52:25 +0000 swh-vault (0.3.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.4 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-23 11:35:47 +0100) * Upstream changes: - v0.3.4 - test_server: Fix exception structure - conftest: Explicitely declare aiohttp pytest plugin use -- Software Heritage autobuilder (on jenkins-debian1) Mon, 23 Nov 2020 10:37:50 +0000 swh-vault (0.3.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.3 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-18 18:02:35 +0100) * Upstream changes: - v0.3.3 - Fix api.server configuration adaptation issue -- Software Heritage autobuilder (on jenkins-debian1) Wed, 18 Nov 2020 18:40:45 +0000 swh-vault (0.3.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-17 17:46:37 +0100) * Upstream changes: - v0.3.1 - test_server: Simplify test server initialization to the minimum -- Software Heritage autobuilder (on jenkins-debian1) Tue, 17 Nov 2020 16:54:22 +0000 swh-vault (0.3.0-1~swh2) unstable-swh; urgency=medium * Fix dependency release -- Antoine R. Dumont (@ardumont) Tue, 17 Nov 2020 16:54:03 +0000 swh-vault (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-13 12:10:09 +0100) * Upstream changes: - v0.3.0 - Refactor vault configuration without the args indirection - vault.server: Introduce typed VaultInterface - Replace file modes literals to DentryPerms enum - Add tests on current configuration check for cooker instantiation - api.server: Add types and tests on configuration checks - swh.vault: Unify get_vault factory function with other factories - vault.tests: Make postgresql fixture faster -- Software Heritage autobuilder (on jenkins-debian1) Tue, 17 Nov 2020 16:22:52 +0000 swh-vault (0.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-19 09:52:04 +0200) * Upstream changes: - v0.2.0 - vault.config: Adapt scheduler configuration structure - test_cookers: Turn git_loader into a pytest fixture - tests: Fix loader git instantiation - tox.ini: pin black to the pre-commit version (19.10b0) to avoid flip- flops - Run isort after the CLI import changes -- Software Heritage autobuilder (on jenkins-debian1) Mon, 19 Oct 2020 07:54:03 +0000 swh-vault (0.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.0 - (tagged by David Douard on 2020-09-25 12:34:43 +0200) * Upstream changes: - v0.1.0 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Sep 2020 10:37:22 +0000 swh-vault (0.0.35-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.35 - (tagged by David Douard on 2020-09-11 15:15:26 +0200) * Upstream changes: - v0.0.35 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 11 Sep 2020 13:18:50 +0000 swh-vault (0.0.34-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.34 - (tagged by Antoine Lambert on 2020-08-18 13:55:51 +0200) * Upstream changes: - version 0.0.34 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 18 Aug 2020 11:58:22 +0000 swh-vault (0.0.33-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.33 - (tagged by Valentin Lorentz on 2020-05-05 17:56:33 +0200) * Upstream changes: - v0.0.33 - * Use swh-storage validation proxy. - * Use model objects to send to storage - * Add a pyproject.toml file to target py37 for black - * setup: Update the minimum required runtime python3 version - * setup.py: add documentation link - * Raise NotFoundExc within our RPC framework instead of returning 404. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 05 May 2020 15:59:51 +0000 swh-vault (0.0.32-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.32 - (tagged by Antoine Lambert on 2020-02-05 13:00:19 +0100) * Upstream changes: - version 0.0.32 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 05 Feb 2020 12:16:16 +0000 swh-vault (0.0.31-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.31 - (tagged by Stefano Zacchiroli on 2019-11-05 17:24:43 +0100) * Upstream changes: - v0.0.31 - * typing: minimal changes to make a no-op mypy run pass - * Remove indirection swh.vault.api.wsgi - * tox.ini: Fix py3 environment to use packaged tests - * CLI: drop obsolete alias "serve" for "rpc- serve" -- Software Heritage autobuilder (on jenkins-debian1) Tue, 05 Nov 2019 16:44:29 +0000 swh-vault (0.0.30-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.30 - (tagged by Antoine Lambert on 2019-07-29 11:17:23 +0200) * Upstream changes: - version 0.0.30 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 29 Jul 2019 09:22:02 +0000 swh-vault (0.0.29-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.29 - (tagged by Antoine Lambert on 2019-05-23 11:39:12 +0200) * Upstream changes: - version 0.0.29 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 May 2019 09:46:57 +0000 swh-vault (0.0.28-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.28 - (tagged by Antoine Lambert on 2019-05-23 11:00:51 +0200) * Upstream changes: - version 0.0.28 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 May 2019 09:05:34 +0000 swh-vault (0.0.27-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.27 - (tagged by Antoine Lambert on 2019-05-07 14:44:26 +0200) * Upstream changes: - version 0.0.27 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 07 May 2019 12:54:35 +0000 swh-vault (0.0.26-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.26 - (tagged by Antoine Lambert on 2019-04-26 11:59:23 +0200) * Upstream changes: - version 0.0.26 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 26 Apr 2019 10:06:45 +0000 swh-vault (0.0.25-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.25 - (tagged by Antoine R. Dumont (@ardumont) on 2019-03-29 12:19:19 +0100) * Upstream changes: - v0.0.25 - master vault.backend: Migrate email address to bot@swh.org - API: use default's APIError exception instead of the VaultAPIError - Remove debian packaging from master branch -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Mar 2019 11:28:28 +0000 swh-vault (0.0.24-1~swh3) unstable-swh; urgency=low * d/control: Update missing build dependency on postgresql-contrib -- Antoine Romain Dumont Mon, 18 Feb 2019 16:20:50 +0100 swh-vault (0.0.24-1~swh2) unstable-swh; urgency=low * d/control: Update missing build dependency on git * d/rules: Sanitize build locale -- Antoine Romain Dumont Mon, 18 Feb 2019 16:04:50 +0100 swh-vault (0.0.24-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.24 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-18 15:21:31 +0100) * Upstream changes: - v0.0.24 - MANIFEST.in: Fix packaging to include the sql schema definitions -- Software Heritage autobuilder (on jenkins-debian1) Mon, 18 Feb 2019 14:25:33 +0000 swh-vault (0.0.23-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.23 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-18 14:39:25 +0100) * Upstream changes: - v0.0.23 - test_cookers: Fix commit behavior when committing to another branch - Rewrite tests using pytest's fixtures and adapt them to recent refactorings - Normalize the configuration of VaultBackend and cooker - Make it possible to specify the config file via SWH_CONFIG_FILENAME env var - Refactor the VaultBackend to use BaseDb and pool-based db access - Add a swh.vault.api.wsgi module to instanciate the (singleton) wsgi app object -- Software Heritage autobuilder (on jenkins-debian1) Mon, 18 Feb 2019 13:48:28 +0000 swh-vault (0.0.22-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.22 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-14 10:12:41 +0100) * Upstream changes: - v0.0.22 - api/server: Do not read configuration at each request -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 09:16:23 +0000 swh-vault (0.0.21-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.21 - (tagged by David Douard on 2019-02-07 17:38:49 +0100) * Upstream changes: - v0.0.21 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 07 Feb 2019 16:44:51 +0000 swh-vault (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * swh.vault: Open a get_vault instantiation function * swh.vault.api.client: Permit to specify the query timeout option * swh.storage doesn't expose a db attribute any longer -- Antoine R. Dumont (@ardumont) Thu, 24 May 2018 12:31:50 +0200 swh-vault (0.0.19-1~swh1) unstable-swh; urgency=medium * version 0.0.19 -- Antoine Pietri Thu, 03 May 2018 17:49:18 +0200 swh-vault (0.0.18-1~swh1) unstable-swh; urgency=medium * version 0.0.18 -- Antoine Pietri Thu, 03 May 2018 17:10:24 +0200 swh-vault (0.0.17-1~swh1) unstable-swh; urgency=medium * version 0.0.17 -- Antoine Pietri Thu, 03 May 2018 13:16:59 +0200 swh-vault (0.0.16-1~swh1) unstable-swh; urgency=medium * version 0.0.16 -- Antoine Pietri Wed, 02 May 2018 13:41:05 +0200 swh-vault (0.0.15-1~swh1) unstable-swh; urgency=medium * version 0.0.15 -- Antoine Pietri Fri, 27 Apr 2018 18:46:06 +0200 swh-vault (0.0.14-1~swh1) unstable-swh; urgency=medium * version 0.0.14 -- Antoine Pietri Fri, 27 Apr 2018 17:11:50 +0200 swh-vault (0.0.13-1~swh1) unstable-swh; urgency=medium * version 0.0.13 -- Antoine Pietri Wed, 25 Apr 2018 15:52:33 +0200 swh-vault (0.0.12-1~swh1) unstable-swh; urgency=medium * version 0.0.12 -- Antoine Pietri Wed, 21 Feb 2018 15:30:25 +0100 swh-vault (0.0.11-1~swh1) unstable-swh; urgency=medium * version 0.0.11 -- Antoine Pietri Fri, 16 Feb 2018 16:09:10 +0100 swh-vault (0.0.10-1~swh1) unstable-swh; urgency=medium * version 0.0.10 -- Antoine Pietri Thu, 15 Feb 2018 16:08:05 +0100 swh-vault (0.0.9-1~swh1) unstable-swh; urgency=medium * version 0.0.9 -- Antoine Pietri Thu, 01 Feb 2018 18:21:29 +0100 swh-vault (0.0.8-1~swh1) unstable-swh; urgency=medium * version 0.0.8 -- Antoine Pietri Wed, 31 Jan 2018 17:54:55 +0100 swh-vault (0.0.7-1~swh1) unstable-swh; urgency=medium * version 0.0.7 -- Antoine Pietri Tue, 30 Jan 2018 18:21:07 +0100 swh-vault (0.0.6-1~swh1) unstable-swh; urgency=medium * version 0.0.6 -- Antoine Pietri Tue, 09 Jan 2018 16:37:41 +0100 swh-vault (0.0.5-1~swh1) unstable-swh; urgency=medium * version 0.0.5 -- Antoine Pietri Thu, 14 Dec 2017 19:33:01 +0100 swh-vault (0.0.4-1~swh1) unstable-swh; urgency=medium * version 0.0.4 -- Antoine Pietri Fri, 08 Dec 2017 15:33:54 +0100 swh-vault (0.0.3-1~swh1) unstable-swh; urgency=medium * version 0.0.3 -- Antoine Pietri Fri, 01 Dec 2017 15:31:34 +0100 swh-vault (0.0.2-1~swh1) unstable-swh; urgency=medium * version 0.0.2 -- Antoine Pietri Thu, 30 Nov 2017 15:50:43 +0100 swh-vault (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * version 0.0.1 -- Antoine Pietri Mon, 13 Nov 2017 16:22:47 +0100 diff --git a/docs/README.rst b/docs/README.rst new file mode 100644 index 0000000..f9aee20 --- /dev/null +++ b/docs/README.rst @@ -0,0 +1,15 @@ +Software Heritage - Vault +========================= + +User-facing service that allows to retrieve parts of the archive as +self-contained bundles (e.g., individual releases, entire repository snapshots, +etc.) +The creation of a bundle is called "cooking" a bundle. + +Architecture +------------ + +The vault is made of two main parts: + +1. a stateful RPC server called the **backend** +2. Celery tasks, called **cookers** diff --git a/docs/api.rst b/docs/api.rst index 832d5be..2d1559f 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,173 +1,172 @@ .. _vault-api-ref: Vault API Reference =================== Software source code **objects**---e.g., individual files, directories, commits, tagged releases, etc.---are stored in the Software Heritage (SWH) Archive in fully deduplicated form. That allows direct access to individual artifacts, but require some preparation ("cooking") when fast access to a large set of related objects (e.g., an entire repository) is required. The **Software Heritage Vault** takes care of that preparation by asynchronously assembling **bundles** of related source code objects, caching, and garbage collecting them as needed. The Vault is accessible via a RPC API documented below. -All endpoints are mounted at API root, which is currently at -https://archive.softwareheritage.org/api/1/. +All endpoints are mounted at API root, which is currently at :swh_web:`api/1/`. Unless otherwise stated, API endpoints respond to HTTP GET method. Object identification --------------------- The vault stores bundles corresponding to different kinds of objects (see :ref:`data-model`). The URL fragment ``:bundletype/:swhid`` is used throughout the vault API to identify vault objects. See :ref:`persistent-identifiers` for details on the syntax and meaning of ``:swhid``. Bundle types ------------ Flat ~~~~ Flat bundles are simple tarballs that can be read without any specialized software. When cooking directories, they are (very close to) the original directories that were ingested. When cooking other types of objects, they have multiple root directories, each corresponding to an original object (revision, ...) This is typically only useful to cook directories; cooking other types of objects (revisions, releases, snapshots) are usually done with ``git-bare`` as it is more efficient and closer to the original repository. You can extract the resulting bundle using: .. code:: shell tar xaf bundle.tar.gz gitfast ~~~~~~~ A gzip-compressed `git fast-export `_. You can extract the resulting bundle using: .. code:: shell git init zcat bundle.gitfast.gz | git fast-import git checkout HEAD git-bare ~~~~~~~~ A tarball that can be decompressed to get a real git repository. It is without a checkout, so it is the equivalent of what one would get with ``git clone --bare``. This is the most flexible bundle type, as it allow to perfectly recreate original git repositories, including branches. You can extract the resulting bundle using: .. code:: shell tar xaf bundle.tar.gz Then explore its content like a normal ("non-bare") git repository by cloning it: .. code:: shell git clone path/to/extracted/:swhid Cooking and status checking --------------------------- Vault bundles might be ready for retrieval or not. When they are not, they will need to be **cooked** before they can be retrieved. A cooked bundle will remain around until it expires; after expiration, it will need to be cooked again before it can be retrieved. Cooking is idempotent, and a no-op in between a previous cooking operation and expiration. .. http:post:: /vault/:bundletype/:swhid .. http:get:: /vault/:bundletype/:swhid **Request body**: optionally, an ``email`` POST parameter containing an e-mail to notify when the bundle cooking has ended. **Allowed HTTP Methods:** - :http:method:`post` to **request** a bundle cooking - :http:method:`get` to check the progress and status of the cooking - :http:method:`head` - :http:method:`options` **Response:** :statuscode 200: bundle available for cooking, status of the cooking :statuscode 400: malformed SWHID :statuscode 404: unavailable bundle or object not found .. sourcecode:: http HTTP/1.1 200 OK Content-Type: application/json { "id": 42, "fetch_url": "/api/1/vault/flat/:swhid/raw/", "swhid": ":swhid", "progress_message": "Creating tarball...", "status": "pending" } After a cooking request has been started, all subsequent GET and POST requests to the cooking URL return some JSON data containing information about the progress of the bundle creation. The JSON contains the following keys: - ``id``: the ID of the cooking request - ``fetch_url``: the URL that can be used for the retrieval of the bundle - ``swhid``: the identifier of the requested bundle - ``progress_message``: a string describing the current progress of the cooking. If the cooking failed, ``progress_message`` will contain the reason of the failure. - ``status``: one of the following values: - ``new``: the bundle request was created - ``pending``: the bundle is being cooked - ``done``: the bundle has been cooked and is ready for retrieval - ``failed``: the bundle cooking failed and can be retried Retrieval --------- Retrieve a specific bundle from the vault with: .. http:get:: /vault/:bundletype/:swhid/raw **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, :http:method:`options` **Response**: :statuscode 200: bundle available; response body is the bundle. :statuscode 404: unavailable bundle; client should request its cooking. diff --git a/docs/index.rst b/docs/index.rst index 74ab82d..7855d45 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,26 +1,49 @@ .. _swh-vault: -Software Heritage - Vault -========================= +.. include:: README.rst -User-facing service that allows to retrieve parts of the archive as -self-contained bundles (e.g., individual releases, entire repository snapshots, -etc.) +The Vault backend +~~~~~~~~~~~~~~~~~ +The Vault backend is the RPC server other |swh| components (mainly +:ref:`swh-web `) interact with. + +It is in charge of receiving cooking requests, scheduling corresponding tasks (via +:ref:`swh-scheduler ` and Celery), getting heartbeats and final +results from these, cooking tasks, and finally serving the results. + +It uses the same RPC protocol as the other components of the archive, and +its interface is described in :mod:`swh.vault.interface`. + +The cookers +~~~~~~~~~~~ + +Cookers are Python modules/classes, each in charge of cooking a type of bundle. +The main ones are :mod:`swh.vault.cookers.directory` for flat tarballs of directories, +and :mod:`swh.vault.cookers.git_bare` for bare ``.git`` repositories of any +type of git object. +They all derive from :class:`swh.vault.cookers.base.BaseVaultCooker`. + +The base cooker first notifies the backend the cooking task is in progress, +then runs the cooker (which does the bundle-specific handling and uploads the result), +then notifies the backend of the final result (success/failure). + +Cookers may notify the backend of the progress, so they can be displayed in +swh-web's vault interface, which polls the status from the vault backend. .. toctree:: :maxdepth: 2 :caption: Contents: getting-started.rst api.rst Reference Documentation ----------------------- .. toctree:: :maxdepth: 2 cli /apidoc/swh.vault diff --git a/requirements-swh-graph.txt b/requirements-swh-graph.txt index e7307fd..79e6a9b 100644 --- a/requirements-swh-graph.txt +++ b/requirements-swh-graph.txt @@ -1 +1 @@ -swh.graph >= v0.3.2 +swh.graph >= 0.3.2 diff --git a/requirements-swh.txt b/requirements-swh.txt index 70a99b0..9d34234 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core[db,http] >= 0.14.4 swh.model >= 3.0.0 swh.objstorage >= 0.0.17 swh.scheduler >= 0.7.0 -swh.storage >= 0.29.0 +swh.storage >= 0.43.1 diff --git a/requirements-test.txt b/requirements-test.txt index 9c23fb1..4d5fa7a 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,10 +1,10 @@ -pytest +pytest < 7.0.0 # v7.0.0 removed _pytest.tmpdir.TempdirFactory, which is used by some of the pytest plugins we use dulwich >= 0.18.7 swh.loader.core swh.loader.git >= 0.8 swh.storage[testing] pytest-mock types-click types-python-dateutil types-pyyaml types-requests diff --git a/setup.py b/setup.py index 5c94d3d..832416c 100755 --- a/setup.py +++ b/setup.py @@ -1,74 +1,74 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import open from os import path from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: +with open(path.join(here, "README.rst"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.vault", description="Software Heritage vault", long_description=long_description, - long_description_content_type="text/markdown", + long_description_content_type="text/x-rst", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DVAU/", packages=find_packages(), install_requires=parse_requirements() + parse_requirements("swh"), setup_requires=["setuptools-scm"], use_scm_version=True, extras_require={ "testing": parse_requirements("test"), "graph": parse_requirements("swh-graph"), }, include_package_data=True, zip_safe=False, entry_points=""" [swh.cli.subcommands] vault=swh.vault.cli """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-vault", "Documentation": "https://docs.softwareheritage.org/devel/swh-vault/", }, ) diff --git a/swh.vault.egg-info/PKG-INFO b/swh.vault.egg-info/PKG-INFO index 23b2f2b..a5473b6 100644 --- a/swh.vault.egg-info/PKG-INFO +++ b/swh.vault.egg-info/PKG-INFO @@ -1,36 +1,42 @@ Metadata-Version: 2.1 Name: swh.vault -Version: 1.3.0 +Version: 1.4.2 Summary: Software Heritage vault Home-page: https://forge.softwareheritage.org/diffusion/DVAU/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-vault Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-vault/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 -Description-Content-Type: text/markdown +Description-Content-Type: text/x-rst Provides-Extra: testing Provides-Extra: graph License-File: LICENSE License-File: AUTHORS -swh-vault -========= +Software Heritage - Vault +========================= User-facing service that allows to retrieve parts of the archive as -self-contained bundles. +self-contained bundles (e.g., individual releases, entire repository snapshots, +etc.) +The creation of a bundle is called "cooking" a bundle. -See the -[documentation](https://docs.softwareheritage.org/devel/swh-vault/index.html) -for more details. +Architecture +------------ + +The vault is made of two main parts: + +1. a stateful RPC server called the **backend** +2. Celery tasks, called **cookers** diff --git a/swh.vault.egg-info/SOURCES.txt b/swh.vault.egg-info/SOURCES.txt index 4d0fc51..0185d5f 100644 --- a/swh.vault.egg-info/SOURCES.txt +++ b/swh.vault.egg-info/SOURCES.txt @@ -1,74 +1,75 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile -README.md +README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh-graph.txt requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile +docs/README.rst docs/api.rst docs/cli.rst docs/conf.py docs/getting-started.rst docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder sql/upgrades/002.sql sql/upgrades/003.sql swh/__init__.py swh.vault.egg-info/PKG-INFO swh.vault.egg-info/SOURCES.txt swh.vault.egg-info/dependency_links.txt swh.vault.egg-info/entry_points.txt swh.vault.egg-info/not-zip-safe swh.vault.egg-info/requires.txt swh.vault.egg-info/top_level.txt swh/vault/__init__.py swh/vault/backend.py swh/vault/cache.py swh/vault/cli.py swh/vault/cooking_tasks.py swh/vault/exc.py swh/vault/in_memory_backend.py swh/vault/interface.py swh/vault/py.typed swh/vault/to_disk.py swh/vault/api/__init__.py swh/vault/api/client.py swh/vault/api/serializers.py swh/vault/api/server.py swh/vault/cookers/__init__.py swh/vault/cookers/base.py swh/vault/cookers/directory.py swh/vault/cookers/git_bare.py swh/vault/cookers/revision_flat.py swh/vault/cookers/revision_gitfast.py swh/vault/cookers/utils.py swh/vault/sql/30-schema.sql swh/vault/tests/__init__.py swh/vault/tests/conftest.py swh/vault/tests/test_backend.py swh/vault/tests/test_cache.py swh/vault/tests/test_cli.py swh/vault/tests/test_cookers.py swh/vault/tests/test_cookers_base.py swh/vault/tests/test_git_bare_cooker.py swh/vault/tests/test_init.py swh/vault/tests/test_init_cookers.py swh/vault/tests/test_server.py swh/vault/tests/test_to_disk.py swh/vault/tests/vault_testing.py \ No newline at end of file diff --git a/swh.vault.egg-info/requires.txt b/swh.vault.egg-info/requires.txt index e328356..c9a8f15 100644 --- a/swh.vault.egg-info/requires.txt +++ b/swh.vault.egg-info/requires.txt @@ -1,26 +1,26 @@ click flask psycopg2 python-dateutil fastimport typing-extensions swh.core[db,http]>=0.14.4 swh.model>=3.0.0 swh.objstorage>=0.0.17 swh.scheduler>=0.7.0 -swh.storage>=0.29.0 +swh.storage>=0.43.1 [graph] -swh.graph>=v0.3.2 +swh.graph>=0.3.2 [testing] -pytest +pytest<7.0.0 dulwich>=0.18.7 swh.loader.core swh.loader.git>=0.8 swh.storage[testing] pytest-mock types-click types-python-dateutil types-pyyaml types-requests diff --git a/swh/__init__.py b/swh/__init__.py index 8d9f151..b36383a 100644 --- a/swh/__init__.py +++ b/swh/__init__.py @@ -1,4 +1,3 @@ from pkgutil import extend_path -from typing import List -__path__: List[str] = extend_path(__path__, __name__) +__path__ = extend_path(__path__, __name__) diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py index 58aa728..c583695 100644 --- a/swh/vault/cookers/__init__.py +++ b/swh/vault/cookers/__init__.py @@ -1,131 +1,129 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations import os from typing import Any, Dict, List, Type from swh.core.config import load_named_config from swh.core.config import read as read_config from swh.model.swhids import CoreSWHID, ObjectType from swh.storage import get_storage from swh.vault import get_vault from swh.vault.cookers.base import DEFAULT_CONFIG, DEFAULT_CONFIG_PATH, BaseVaultCooker from swh.vault.cookers.directory import DirectoryCooker from swh.vault.cookers.git_bare import GitBareCooker from swh.vault.cookers.revision_flat import RevisionFlatCooker from swh.vault.cookers.revision_gitfast import RevisionGitfastCooker _COOKER_CLS: List[Type[BaseVaultCooker]] = [ DirectoryCooker, RevisionFlatCooker, RevisionGitfastCooker, GitBareCooker, ] COOKER_TYPES: Dict[str, List[Type[BaseVaultCooker]]] = {} for _cooker_cls in _COOKER_CLS: COOKER_TYPES.setdefault(_cooker_cls.BUNDLE_TYPE, []).append(_cooker_cls) def get_cooker_cls(bundle_type: str, object_type: ObjectType): cookers = COOKER_TYPES.get(bundle_type) if not cookers: raise ValueError(f"{bundle_type} is not a valid bundle type.") for cooker in cookers: try: cooker.check_object_type(object_type) except ValueError: pass else: return cooker raise ValueError( f"{object_type.name.lower()} objects do not have a {bundle_type} cooker" ) def check_config(cfg: Dict[str, Any]) -> Dict[str, Any]: """Ensure the configuration is ok to run a vault worker, and propagate defaults Raises: EnvironmentError if the configuration is not for remote instance ValueError if one of the following keys is missing: vault, storage Returns: New configuration dict to instantiate a vault worker instance """ cfg = cfg.copy() if "vault" not in cfg: raise ValueError("missing 'vault' configuration") vcfg = cfg["vault"] if vcfg["cls"] != "remote": raise EnvironmentError( "This vault backend can only be a 'remote' configuration" ) # TODO: Soft-deprecation of args key. Remove when ready. vcfg.update(vcfg.get("args", {})) # Default to top-level value if any if "storage" not in vcfg: vcfg["storage"] = cfg.get("storage") if not vcfg.get("storage"): raise ValueError("invalid configuration: missing 'storage' config entry.") return cfg def get_cooker(bundle_type: str, swhid: CoreSWHID): """Instantiate a cooker class of type bundle_type. Returns: Cooker class in charge of cooking the bundle_type with id swhid. Raises: ValueError in case of a missing top-level vault key configuration or a storage key. EnvironmentError in case the vault configuration reference a non remote class. """ if "SWH_CONFIG_FILENAME" in os.environ: cfg = read_config(os.environ["SWH_CONFIG_FILENAME"], DEFAULT_CONFIG) else: cfg = load_named_config(DEFAULT_CONFIG_PATH, DEFAULT_CONFIG) cooker_cls = get_cooker_cls(bundle_type, swhid.object_type) cfg = check_config(cfg) vcfg = cfg["vault"] storage = get_storage(**vcfg.pop("storage")) backend = get_vault(**vcfg) try: from swh.graph.client import RemoteGraphClient # optional dependency graph = RemoteGraphClient(**vcfg["graph"]) if vcfg.get("graph") else None except ModuleNotFoundError: if vcfg.get("graph"): raise EnvironmentError( "Graph configuration required but module is not installed." ) else: graph = None - return cooker_cls( - swhid, - backend=backend, - storage=storage, - graph=graph, - max_bundle_size=cfg["max_bundle_size"], - ) + kwargs = { + k: v for (k, v) in cfg.items() if k in ("max_bundle_size", "thread_pool_size") + } + + return cooker_cls(swhid, backend=backend, storage=storage, graph=graph, **kwargs,) diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py index 6108309..38549db 100644 --- a/swh/vault/cookers/base.py +++ b/swh/vault/cookers/base.py @@ -1,157 +1,159 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import io import logging import traceback from typing import ClassVar, Set from psycopg2.extensions import QueryCanceledError from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.interface import StorageInterface MAX_BUNDLE_SIZE = 2 ** 29 # 512 MiB DEFAULT_CONFIG_PATH = "vault/cooker" DEFAULT_CONFIG = { "max_bundle_size": ("int", MAX_BUNDLE_SIZE), } class PolicyError(Exception): """Raised when the bundle violates the cooking policy.""" pass class BundleTooLargeError(PolicyError): """Raised when the bundle is too large to be cooked.""" pass class BytesIOBundleSizeLimit(io.BytesIO): def __init__(self, *args, size_limit=None, **kwargs): super().__init__(*args, **kwargs) self.size_limit = size_limit def write(self, chunk): if ( self.size_limit is not None and self.getbuffer().nbytes + len(chunk) > self.size_limit ): raise BundleTooLargeError( "The requested bundle exceeds the maximum allowed " "size of {} bytes.".format(self.size_limit) ) return super().write(chunk) class BaseVaultCooker(metaclass=abc.ABCMeta): """Abstract base class for the vault's bundle creators This class describes a common API for the cookers. To define a new cooker, inherit from this class and override: - CACHE_TYPE_KEY: key to use for the bundle to reference in cache - def cook(): cook the object into a bundle """ SUPPORTED_OBJECT_TYPES: ClassVar[Set[ObjectType]] BUNDLE_TYPE: ClassVar[str] def __init__( self, swhid: CoreSWHID, backend, storage: StorageInterface, graph=None, objstorage=None, max_bundle_size: int = MAX_BUNDLE_SIZE, + thread_pool_size: int = 10, ): """Initialize the cooker. The type of the object represented by the id depends on the concrete class. Very likely, each type of bundle will have its own cooker class. Args: swhid: id of the object to be cooked into a bundle. backend: the vault backend (swh.vault.backend.VaultBackend). """ self.check_object_type(swhid.object_type) self.swhid = swhid self.obj_id = swhid.object_id self.backend = backend self.storage = storage self.objstorage = objstorage self.graph = graph self.max_bundle_size = max_bundle_size + self.thread_pool_size = thread_pool_size @classmethod def check_object_type(cls, object_type: ObjectType) -> None: if object_type not in cls.SUPPORTED_OBJECT_TYPES: raise ValueError(f"{cls.__name__} does not support {object_type} objects.") @abc.abstractmethod def check_exists(self): """Checks that the requested object exists and can be cooked. Override this in the cooker implementation. """ raise NotImplementedError @abc.abstractmethod def prepare_bundle(self): """Implementation of the cooker. Yields chunks of the bundle bytes. Override this with the cooker implementation. """ raise NotImplementedError def cache_type_key(self) -> str: assert self.BUNDLE_TYPE return self.BUNDLE_TYPE def write(self, chunk): self.fileobj.write(chunk) def cook(self): """Cook the requested object into a bundle """ self.backend.set_status(self.BUNDLE_TYPE, self.swhid, "pending") self.backend.set_progress(self.BUNDLE_TYPE, self.swhid, "Processing...") self.fileobj = BytesIOBundleSizeLimit(size_limit=self.max_bundle_size) try: try: self.prepare_bundle() except QueryCanceledError: raise PolicyError( "Timeout reached while assembling the requested bundle" ) bundle = self.fileobj.getvalue() # TODO: use proper content streaming instead of put_bundle() self.backend.put_bundle(self.cache_type_key(), self.swhid, bundle) except PolicyError as e: logging.info("Bundle cooking violated policy: %s", e) self.backend.set_status(self.BUNDLE_TYPE, self.swhid, "failed") self.backend.set_progress(self.BUNDLE_TYPE, self.swhid, str(e)) except Exception: self.backend.set_status(self.BUNDLE_TYPE, self.swhid, "failed") tb = traceback.format_exc() self.backend.set_progress( self.BUNDLE_TYPE, self.swhid, f"Internal Server Error. This incident will be reported.\n" f"The full error was:\n\n{tb}", ) logging.exception("Bundle cooking failed.") else: self.backend.set_status(self.BUNDLE_TYPE, self.swhid, "done") self.backend.set_progress(self.BUNDLE_TYPE, self.swhid, None) finally: self.backend.send_notif(self.BUNDLE_TYPE, self.swhid) diff --git a/swh/vault/cookers/git_bare.py b/swh/vault/cookers/git_bare.py index 8e0dc15..de0ac1c 100644 --- a/swh/vault/cookers/git_bare.py +++ b/swh/vault/cookers/git_bare.py @@ -1,620 +1,691 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ This cooker creates tarballs containing a bare .git directory, that can be unpacked and cloned like any git repository. It works in three steps: 1. Write objects one by one in :file:`.git/objects/` 2. Calls ``git repack`` to pack all these objects into git packfiles. 3. Creates a tarball of the resulting repository It keeps a set of all written (or about-to-be-written) object hashes in memory to avoid downloading and writing the same objects twice. + +The first step is the most complex. When swh-graph is available, this roughly does +the following: + +1. Find all the revisions and releases in the induced subgraph, adds them to + todo-lists +2. Grab a batch from (release/revision/directory/content) todo-lists, and load them. + Add directory and content objects they reference to the todo-list +3. If any todo-list is not empty, goto 1 + +When swh-graph is not available, steps 1 and 2 are merged, because revisions need +to be loaded in order to compute the subgraph. """ import datetime import enum import glob import logging +import multiprocessing.dummy import os.path import re import subprocess import tarfile import tempfile from typing import Any, Dict, Iterable, Iterator, List, NoReturn, Optional, Set, Tuple import zlib from swh.core.api.classes import stream_results_optional from swh.model import git_objects from swh.model.hashutil import hash_to_bytehex, hash_to_hex from swh.model.model import ( Person, Release, Revision, RevisionType, Sha1Git, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) -from swh.model.model import Content, DirectoryEntry +from swh.model.model import Content, Directory, DirectoryEntry from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos.revisions_walker import DFSRevisionsWalker from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.vault.cookers.base import BaseVaultCooker from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE RELEASE_BATCH_SIZE = 10000 REVISION_BATCH_SIZE = 10000 DIRECTORY_BATCH_SIZE = 10000 CONTENT_BATCH_SIZE = 100 logger = logging.getLogger(__name__) class RootObjectType(enum.Enum): DIRECTORY = "directory" REVISION = "revision" SNAPSHOT = "snapshot" def assert_never(value: NoReturn, msg) -> NoReturn: """mypy makes sure this function is never called, through exhaustive checking of ``value`` in the parent function. See https://mypy.readthedocs.io/en/latest/literal_types.html#exhaustive-checks for details. """ assert False, msg class GitBareCooker(BaseVaultCooker): BUNDLE_TYPE = "git_bare" SUPPORTED_OBJECT_TYPES = {ObjectType[obj_type.name] for obj_type in RootObjectType} use_fsck = True obj_type: RootObjectType def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.obj_type = RootObjectType[self.swhid.object_type.name] def check_exists(self) -> bool: + """Returns whether the root object is present in the archive.""" if self.obj_type is RootObjectType.REVISION: return not list(self.storage.revision_missing([self.obj_id])) elif self.obj_type is RootObjectType.DIRECTORY: return not list(self.storage.directory_missing([self.obj_id])) elif self.obj_type is RootObjectType.SNAPSHOT: return not list(self.storage.snapshot_missing([self.obj_id])) else: assert_never(self.obj_type, f"Unexpected root object type: {self.obj_type}") def _push(self, stack: List[Sha1Git], obj_ids: Iterable[Sha1Git]) -> None: + """Adds all the given ``obj_ids`` to the given ``stack``, unless they are + already in ``self._seen``, and adds them to ``self._seen``.""" assert not isinstance(obj_ids, bytes) revision_ids = [id_ for id_ in obj_ids if id_ not in self._seen] self._seen.update(revision_ids) stack.extend(revision_ids) def _pop(self, stack: List[Sha1Git], n: int) -> List[Sha1Git]: + """Removes ``n`` object from the ``stack`` and returns them.""" obj_ids = stack[-n:] stack[-n:] = [] return obj_ids def prepare_bundle(self): - # Objects we will visit soon: + """Main entry point. Initializes the state, creates the bundle, and + sends it to the backend.""" + # Objects we will visit soon (aka. "todo-lists"): self._rel_stack: List[Sha1Git] = [] self._rev_stack: List[Sha1Git] = [] self._dir_stack: List[Sha1Git] = [] self._cnt_stack: List[Sha1Git] = [] # Set of objects already in any of the stacks: self._seen: Set[Sha1Git] = set() self._walker_state: Optional[Any] = None # Set of errors we expect git-fsck to raise at the end: self._expected_fsck_errors = set() with tempfile.TemporaryDirectory(prefix="swh-vault-gitbare-") as workdir: # Initialize a Git directory self.workdir = workdir self.gitdir = os.path.join(workdir, "clone.git") os.mkdir(self.gitdir) self.init_git() + self.nb_loaded = 0 + # Add the root object to the stack of objects to visit self.push_subgraph(self.obj_type, self.obj_id) # Load and write all the objects to disk self.load_objects() self.backend.set_progress( self.BUNDLE_TYPE, self.swhid, "Writing references..." ) # Write the root object as a ref (this step is skipped if it's a snapshot) # This must be done before repacking; git-repack ignores orphan objects. self.write_refs() self.backend.set_progress( self.BUNDLE_TYPE, self.swhid, "Checking content integrity" ) if self.use_fsck: self.git_fsck() self.backend.set_progress( self.BUNDLE_TYPE, self.swhid, "Creating final bundle" ) self.repack() self.write_archive() self.backend.set_progress(self.BUNDLE_TYPE, self.swhid, "Uploading bundle") def init_git(self) -> None: + """Creates an empty :file:`.git` directory.""" subprocess.run(["git", "-C", self.gitdir, "init", "--bare"], check=True) self.create_object_dirs() # Remove example hooks; they take ~40KB and we don't use them for filename in glob.glob(os.path.join(self.gitdir, "hooks", "*.sample")): os.unlink(filename) def create_object_dirs(self) -> None: + """Creates all possible subdirectories of :file:`.git/objects/`""" # Create all possible dirs ahead of time, so we don't have to check for # existence every time. for byte in range(256): try: os.mkdir(os.path.join(self.gitdir, "objects", f"{byte:02x}")) except FileExistsError: pass def repack(self) -> None: - # Add objects we wrote in a pack + """Moves all objects from :file:`.git/objects/` to a packfile.""" try: subprocess.run(["git", "-C", self.gitdir, "repack", "-d"], check=True) except subprocess.CalledProcessError: logging.exception("git-repack failed with:") # Remove their non-packed originals subprocess.run(["git", "-C", self.gitdir, "prune-packed"], check=True) def git_fsck(self) -> None: + """Runs git-fsck and ignores expected errors (eg. because of missing + objects).""" proc = subprocess.run( ["git", "-C", self.gitdir, "fsck"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env={"LANG": "C.utf8"}, ) # Split on newlines not followed by a space errors = re.split("\n(?! )", proc.stdout.decode()) errors = [ error for error in errors if error and not error.startswith("warning ") ] unexpected_errors = set(errors) - self._expected_fsck_errors if unexpected_errors: logging.error( "Unexpected errors from git-fsck after cooking %s: %s", self.swhid, "\n".join(sorted(unexpected_errors)), ) def write_refs(self, snapshot=None): + """Writes all files in :file:`.git/refs/`. + + For non-snapshot objects, this is only ``master``.""" refs: Dict[bytes, bytes] # ref name -> target if self.obj_type == RootObjectType.DIRECTORY: # We need a synthetic revision pointing to the directory author = Person.from_fullname( b"swh-vault, git-bare cooker " ) dt = datetime.datetime.now(tz=datetime.timezone.utc) dt = dt.replace(microsecond=0) # not supported by git date = TimestampWithTimezone.from_datetime(dt) revision = Revision( author=author, committer=author, date=date, committer_date=date, message=b"Initial commit", type=RevisionType.GIT, directory=self.obj_id, synthetic=True, ) - self.write_revision_node(revision.to_dict()) + self.write_revision_node(revision) refs = {b"refs/heads/master": hash_to_bytehex(revision.id)} elif self.obj_type == RootObjectType.REVISION: refs = {b"refs/heads/master": hash_to_bytehex(self.obj_id)} elif self.obj_type == RootObjectType.SNAPSHOT: if snapshot is None: # refs were already written in a previous step return branches = [] for (branch_name, branch) in snapshot.branches.items(): if branch is None: logging.error( "%s has dangling branch: %r", snapshot.swhid(), branch_name ) else: branches.append((branch_name, branch)) refs = { branch_name: ( b"ref: " + branch.target if branch.target_type == TargetType.ALIAS else hash_to_bytehex(branch.target) ) for (branch_name, branch) in branches } else: assert_never(self.obj_type, f"Unexpected root object type: {self.obj_type}") for (ref_name, ref_target) in refs.items(): path = os.path.join(self.gitdir.encode(), ref_name) os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "wb") as fd: fd.write(ref_target) def write_archive(self): + """Creates the final .tar file.""" with tarfile.TarFile(mode="w", fileobj=self.fileobj) as tf: tf.add(self.gitdir, arcname=f"{self.swhid}.git", recursive=True) def _obj_path(self, obj_id: Sha1Git): + """Returns the absolute path of file (in :file:`.git/objects/`) that will + contain the git object identified by the ``obj_id``.""" return os.path.join(self.gitdir, self._obj_relative_path(obj_id)) def _obj_relative_path(self, obj_id: Sha1Git): + """Same as :meth:`_obj_path`, but relative.""" obj_id_hex = hash_to_hex(obj_id) directory = obj_id_hex[0:2] filename = obj_id_hex[2:] return os.path.join("objects", directory, filename) def object_exists(self, obj_id: Sha1Git) -> bool: + """Returns whether the object identified by the given ``obj_id`` was already + written to a file in :file:`.git/object/`. + + This function ignores objects contained in a git pack.""" return os.path.exists(self._obj_path(obj_id)) def write_object(self, obj_id: Sha1Git, obj: bytes) -> bool: """Writes a git object on disk. Returns whether it was already written.""" # Git requires objects to be zlib-compressed; but repacking decompresses and # removes them, so we don't need to compress them too much. data = zlib.compress(obj, level=1) with open(self._obj_path(obj_id), "wb") as fd: fd.write(data) return True def push_subgraph(self, obj_type: RootObjectType, obj_id) -> None: + """Adds graph induced by the given ``obj_id`` without recursing through + directories, to the todo-lists. + + If swh-graph is not available, this immediately loads revisions, as they + need to be fetched in order to compute the subgraph, and fetching them + immediately avoids duplicate fetches.""" if self.obj_type is RootObjectType.REVISION: self.push_revision_subgraph(obj_id) elif self.obj_type is RootObjectType.DIRECTORY: self._push(self._dir_stack, [obj_id]) elif self.obj_type is RootObjectType.SNAPSHOT: self.push_snapshot_subgraph(obj_id) else: assert_never(self.obj_type, f"Unexpected root object type: {self.obj_type}") def load_objects(self) -> None: + """Repeatedly loads objects in the todo-lists, until all lists are empty.""" while self._rel_stack or self._rev_stack or self._dir_stack or self._cnt_stack: nb_remaining = ( len(self._rel_stack) + len(self._rev_stack) + len(self._dir_stack) + len(self._cnt_stack) ) # We assume assume nb_remaining is a lower bound. # When the snapshot was loaded with swh-graph, this should be the exact # value, though. self.backend.set_progress( self.BUNDLE_TYPE, self.swhid, - f"Processing... {len(self._seen)} objects processed\n" + f"Processing... {self.nb_loaded} objects processed\n" f"Over {nb_remaining} remaining", ) release_ids = self._pop(self._rel_stack, RELEASE_BATCH_SIZE) if release_ids: self.load_releases(release_ids) + self.nb_loaded += len(release_ids) revision_ids = self._pop(self._rev_stack, REVISION_BATCH_SIZE) if revision_ids: self.load_revisions(revision_ids) + self.nb_loaded += len(revision_ids) directory_ids = self._pop(self._dir_stack, DIRECTORY_BATCH_SIZE) if directory_ids: self.load_directories(directory_ids) + self.nb_loaded += len(directory_ids) content_ids = self._pop(self._cnt_stack, CONTENT_BATCH_SIZE) if content_ids: self.load_contents(content_ids) + self.nb_loaded += len(content_ids) def push_revision_subgraph(self, obj_id: Sha1Git) -> None: - """Fetches a revision and all its children, and writes them to disk""" + """Fetches the graph of revisions induced by the given ``obj_id`` and adds + them to ``self._rev_stack``. + + If swh-graph is not available, this requires fetching the revisions themselves, + so they are directly loaded instead.""" loaded_from_graph = False if self.graph: from swh.graph.client import GraphArgumentException # First, try to cook using swh-graph, as it is more efficient than # swh-storage for querying the history obj_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id,) try: revision_ids = ( swhid.object_id for swhid in map( CoreSWHID.from_string, self.graph.visit_nodes(str(obj_swhid), edges="rev:rev"), ) ) self._push(self._rev_stack, revision_ids) except GraphArgumentException as e: logger.info( "Revision %s not found in swh-graph, falling back to fetching " "history using swh-storage. %s", hash_to_hex(obj_id), e.args[0], ) else: loaded_from_graph = True if not loaded_from_graph: # If swh-graph is not available, or the revision is not yet in # swh-graph, fall back to self.storage.revision_log. # self.storage.revision_log also gives us the full revisions, # so we load them right now instead of just pushing them on the stack. - walker = DFSRevisionsWalker(self.storage, obj_id, state=self._walker_state) + walker = DFSRevisionsWalker( + self.storage, obj_id, state=self._walker_state, ignore_displayname=True + ) for revision in walker: - self.write_revision_node(revision) + self.write_revision_node(Revision.from_dict(revision)) + self.nb_loaded += 1 self._push(self._dir_stack, [revision["directory"]]) # Save the state, so the next call to the walker won't return the same # revisions self._walker_state = walker.export_state() def push_snapshot_subgraph(self, obj_id: Sha1Git) -> None: - """Fetches a snapshot and all its children, and writes them to disk""" + """Fetches a snapshot and all its children, excluding directories and contents, + and pushes them to the todo-lists. + + Also loads revisions if swh-graph is not available, see + :meth:`push_revision_subgraph`.""" loaded_from_graph = False if self.graph: revision_ids = [] release_ids = [] directory_ids = [] content_ids = [] from swh.graph.client import GraphArgumentException # First, try to cook using swh-graph, as it is more efficient than # swh-storage for querying the history obj_swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=obj_id,) try: swhids: Iterable[CoreSWHID] = map( CoreSWHID.from_string, self.graph.visit_nodes(str(obj_swhid), edges="snp:*,rel:*,rev:rev"), ) for swhid in swhids: if swhid.object_type is ObjectType.REVISION: revision_ids.append(swhid.object_id) elif swhid.object_type is ObjectType.RELEASE: release_ids.append(swhid.object_id) elif swhid.object_type is ObjectType.DIRECTORY: directory_ids.append(swhid.object_id) elif swhid.object_type is ObjectType.CONTENT: content_ids.append(swhid.object_id) elif swhid.object_type is ObjectType.SNAPSHOT: assert ( swhid.object_id == obj_id ), f"Snapshot {obj_id.hex()} references a different snapshot" else: assert_never( swhid.object_type, f"Unexpected SWHID object type: {swhid}" ) except GraphArgumentException as e: logger.info( "Snapshot %s not found in swh-graph, falling back to fetching " "history for each branch. %s", hash_to_hex(obj_id), e.args[0], ) else: self._push(self._rev_stack, revision_ids) self._push(self._rel_stack, release_ids) self._push(self._dir_stack, directory_ids) self._push(self._cnt_stack, content_ids) loaded_from_graph = True # TODO: when self.graph is available and supports edge labels, use it # directly to get branch names. snapshot: Optional[Snapshot] = snapshot_get_all_branches(self.storage, obj_id) assert snapshot, "Unknown snapshot" # should have been caught by check_exists() for branch in snapshot.branches.values(): if not loaded_from_graph: if branch is None: logging.warning("Dangling branch: %r", branch) continue assert isinstance(branch, SnapshotBranch) # for mypy if branch.target_type is TargetType.REVISION: self.push_revision_subgraph(branch.target) elif branch.target_type is TargetType.RELEASE: self.push_releases_subgraphs([branch.target]) elif branch.target_type is TargetType.ALIAS: # Nothing to do, this for loop also iterates on the target branch # (if it exists) pass elif branch.target_type is TargetType.DIRECTORY: self._push(self._dir_stack, [branch.target]) elif branch.target_type is TargetType.CONTENT: self._push(self._cnt_stack, [branch.target]) elif branch.target_type is TargetType.SNAPSHOT: if swhid.object_id != obj_id: raise NotImplementedError( f"{swhid} has a snapshot as a branch." ) else: assert_never( branch.target_type, f"Unexpected target type: {self.obj_type}" ) self.write_refs(snapshot=snapshot) def load_revisions(self, obj_ids: List[Sha1Git]) -> None: """Given a list of revision ids, loads these revisions and their directories; - but not their parent revisions.""" - ret: List[Optional[Revision]] = self.storage.revision_get(obj_ids) + but not their parent revisions (ie. this is not recursive).""" + ret: List[Optional[Revision]] = self.storage.revision_get( + obj_ids, ignore_displayname=True + ) revisions: List[Revision] = list(filter(None, ret)) if len(ret) != len(revisions): logger.error("Missing revision(s), ignoring them.") for revision in revisions: - self.write_revision_node(revision.to_dict()) + self.write_revision_node(revision) self._push(self._dir_stack, (rev.directory for rev in revisions)) - def write_revision_node(self, revision: Dict[str, Any]) -> bool: + def write_revision_node(self, revision: Revision) -> bool: """Writes a revision object to disk""" - git_object = git_objects.revision_git_object(revision) - return self.write_object(revision["id"], git_object) + git_object = revision.raw_manifest or git_objects.revision_git_object(revision) + return self.write_object(revision.id, git_object) def load_releases(self, obj_ids: List[Sha1Git]) -> List[Release]: """Loads release objects, and returns them.""" - ret = self.storage.release_get(obj_ids) + ret = self.storage.release_get(obj_ids, ignore_displayname=True) releases = list(filter(None, ret)) if len(ret) != len(releases): logger.error("Missing release(s), ignoring them.") for release in releases: - self.write_release_node(release.to_dict()) + self.write_release_node(release) return releases def push_releases_subgraphs(self, obj_ids: List[Sha1Git]) -> None: """Given a list of release ids, loads these releases and adds their target to the list of objects to visit""" for release in self.load_releases(obj_ids): + self.nb_loaded += 1 assert release.target, "{release.swhid(}) has no target" if release.target_type is ModelObjectType.REVISION: self.push_revision_subgraph(release.target) elif release.target_type is ModelObjectType.DIRECTORY: self._push(self._dir_stack, [release.target]) elif release.target_type is ModelObjectType.CONTENT: self._push(self._cnt_stack, [release.target]) elif release.target_type is ModelObjectType.RELEASE: self.push_releases_subgraphs([release.target]) elif release.target_type is ModelObjectType.SNAPSHOT: raise NotImplementedError( f"{release.swhid()} targets a snapshot: {release.target!r}" ) else: assert_never( release.target_type, f"Unexpected release target type: {release.target_type}", ) - def write_release_node(self, release: Dict[str, Any]) -> bool: + def write_release_node(self, release: Release) -> bool: """Writes a release object to disk""" - git_object = git_objects.release_git_object(release) - return self.write_object(release["id"], git_object) + git_object = release.raw_manifest or git_objects.release_git_object(release) + return self.write_object(release.id, git_object) def load_directories(self, obj_ids: List[Sha1Git]) -> None: - for obj_id in obj_ids: - self.load_directory(obj_id) + if not obj_ids: + return - def load_directory(self, obj_id: Sha1Git) -> None: + raw_manifests = self.storage.directory_get_raw_manifest(obj_ids) + + with multiprocessing.dummy.Pool(min(self.thread_pool_size, len(obj_ids))) as p: + for _ in p.imap_unordered( + lambda obj_id: self.load_directory(obj_id, raw_manifests.get(obj_id)), + obj_ids, + ): + pass + + def load_directory(self, obj_id: Sha1Git, raw_manifest: Optional[bytes]) -> None: # Load the directory entries_it: Optional[Iterable[DirectoryEntry]] = stream_results_optional( self.storage.directory_get_entries, obj_id ) if entries_it is None: logger.error("Missing swh:1:dir:%s, ignoring.", hash_to_hex(obj_id)) return - entries = [entry.to_dict() for entry in entries_it] - directory = {"id": obj_id, "entries": entries} - git_object = git_objects.directory_git_object(directory) + directory = Directory( + id=obj_id, entries=tuple(entries_it), raw_manifest=raw_manifest + ) + git_object = raw_manifest or git_objects.directory_git_object(directory) self.write_object(obj_id, git_object) # Add children to the stack entry_loaders: Dict[str, Optional[List[Sha1Git]]] = { "file": self._cnt_stack, "dir": self._dir_stack, "rev": None, # Do not include submodule targets (rejected by git-fsck) } - for entry in directory["entries"]: - stack = entry_loaders[entry["type"]] + for entry in directory.entries: + stack = entry_loaders[entry.type] if stack is not None: - self._push(stack, [entry["target"]]) + self._push(stack, [entry.target]) def load_contents(self, obj_ids: List[Sha1Git]) -> None: # TODO: add support of filtered objects, somehow? # It's tricky, because, by definition, we can't write a git object with # the expected hash, so git-fsck *will* choke on it. contents = self.storage.content_get(obj_ids, "sha1_git") visible_contents = [] for (obj_id, content) in zip(obj_ids, contents): if content is None: # FIXME: this may also happen for missing content self.write_content(obj_id, SKIPPED_MESSAGE) self._expect_mismatched_object_error(obj_id) elif content.status == "visible": visible_contents.append(content) elif content.status == "hidden": self.write_content(obj_id, HIDDEN_MESSAGE) self._expect_mismatched_object_error(obj_id) elif content.status == "absent": assert False, f"content_get returned absent content {content.swhid()}" else: # TODO: When content.status will have type Literal, replace this with # assert_never assert False, f"{content.swhid} has status: {content.status!r}" contents_and_data: Iterator[Tuple[Content, Optional[bytes]]] if self.objstorage is None: contents_and_data = ( (content, self.storage.content_get_data(content.sha1)) for content in visible_contents ) else: contents_and_data = zip( visible_contents, self.objstorage.get_batch(c.sha1 for c in visible_contents), ) for (content, datum) in contents_and_data: if datum is None: logger.error( "%s is visible, but is missing data. Skipping.", content.swhid() ) continue self.write_content(content.sha1_git, datum) def write_content(self, obj_id: Sha1Git, content: bytes) -> None: header = git_objects.git_object_header("blob", len(content)) self.write_object(obj_id, header + content) def _expect_mismatched_object_error(self, obj_id): obj_id_hex = hash_to_hex(obj_id) obj_path = self._obj_relative_path(obj_id) # For Git < 2.21: self._expected_fsck_errors.add( f"error: sha1 mismatch for ./{obj_path} (expected {obj_id_hex})" ) # For Git >= 2.21: self._expected_fsck_errors.add( f"error: hash mismatch for ./{obj_path} (expected {obj_id_hex})" ) self._expected_fsck_errors.add( f"error: {obj_id_hex}: object corrupt or missing: ./{obj_path}" ) self._expected_fsck_errors.add(f"missing blob {obj_id_hex}") diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py index 2121962..12844b2 100644 --- a/swh/vault/tests/test_cookers.py +++ b/swh/vault/tests/test_cookers.py @@ -1,1078 +1,1189 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import contextlib import datetime import glob import gzip import io import os import pathlib import shutil import subprocess import tarfile import tempfile import unittest import unittest.mock import dulwich.fastexport import dulwich.index import dulwich.objects import dulwich.porcelain import dulwich.repo import pytest from swh.loader.git.from_disk import GitLoaderFromDisk from swh.model import from_disk, hashutil from swh.model.model import ( - Directory, - DirectoryEntry, Person, + Release, Revision, RevisionType, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, TimestampWithTimezone, ) +from swh.model.model import Content, Directory, DirectoryEntry +from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ObjectType from swh.vault.cookers import DirectoryCooker, GitBareCooker, RevisionGitfastCooker from swh.vault.tests.vault_testing import hash_content from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE class TestRepo: """A tiny context manager for a test git repository, with some utility functions to perform basic git stuff. """ def __init__(self, repo_dir=None): self.repo_dir = repo_dir def __enter__(self): if self.repo_dir: self.tmp_dir = None self.repo = dulwich.repo.Repo(self.repo_dir) else: self.tmp_dir = tempfile.TemporaryDirectory(prefix="tmp-vault-repo-") self.repo_dir = self.tmp_dir.__enter__() self.repo = dulwich.repo.Repo.init(self.repo_dir) self.author_name = b"Test Author" self.author_email = b"test@softwareheritage.org" self.author = b"%s <%s>" % (self.author_name, self.author_email) self.base_date = 258244200 self.counter = 0 return pathlib.Path(self.repo_dir) def __exit__(self, exc, value, tb): if self.tmp_dir is not None: self.tmp_dir.__exit__(exc, value, tb) self.repo_dir = None def checkout(self, rev_sha): rev = self.repo[rev_sha] dulwich.index.build_index_from_tree( str(self.repo_dir), self.repo.index_path(), self.repo.object_store, rev.tree ) def git_shell(self, *cmd, stdout=subprocess.DEVNULL, **kwargs): name = self.author_name email = self.author_email date = "%d +0000" % (self.base_date + self.counter) env = { # Set git commit format "GIT_AUTHOR_NAME": name, "GIT_AUTHOR_EMAIL": email, "GIT_AUTHOR_DATE": date, "GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, "GIT_COMMITTER_DATE": date, # Ignore all the system-wide and user configurations "GIT_CONFIG_NOSYSTEM": "1", "HOME": str(self.tmp_dir), "XDG_CONFIG_HOME": str(self.tmp_dir), } kwargs.setdefault("env", {}).update(env) subprocess.check_call( ("git", "-C", self.repo_dir) + cmd, stdout=stdout, **kwargs ) def commit(self, message="Commit test\n", ref=b"HEAD"): """Commit the current working tree in a new commit with message on the branch 'ref'. At the end of the commit, the reference should stay the same and the index should be clean. """ paths = [ os.path.relpath(path, self.repo_dir) for path in glob.glob(self.repo_dir + "/**/*", recursive=True) ] self.repo.stage(paths) message = message.encode() + b"\n" ret = self.repo.do_commit( message=message, committer=self.author, commit_timestamp=self.base_date + self.counter, commit_timezone=0, ref=ref, ) self.counter += 1 # committing on another branch leaves # dangling files in index if ref != b"HEAD": # XXX this should work (but does not) # dulwich.porcelain.reset(self.repo, 'hard') self.git_shell("reset", "--hard", "HEAD") return ret def tag(self, name, target=b"HEAD", message=None): dulwich.porcelain.tag_create( self.repo, name, message=message, annotated=message is not None, objectish=target, ) def merge(self, parent_sha_list, message="Merge branches."): self.git_shell( "merge", "--allow-unrelated-histories", "-m", message, *[p.decode() for p in parent_sha_list], ) self.counter += 1 return self.repo.refs[b"HEAD"] def print_debug_graph(self, reflog=False): args = ["log", "--all", "--graph", "--decorate"] if reflog: args.append("--reflog") self.git_shell(*args, stdout=None) @pytest.fixture def git_loader(swh_storage,): """Instantiate a Git Loader using the storage instance as storage. """ def _create_loader(directory): return GitLoaderFromDisk( swh_storage, "fake_origin", directory=directory, visit_date=datetime.datetime.now(datetime.timezone.utc), ) return _create_loader @contextlib.contextmanager def cook_extract_directory_dircooker(storage, swhid, fsck=True): """Context manager that cooks a directory and extract it.""" backend = unittest.mock.MagicMock() backend.storage = storage cooker = DirectoryCooker(swhid, backend=backend, storage=storage) cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: tar.extractall(td) yield pathlib.Path(td) / str(swhid) cooker.storage = None @contextlib.contextmanager def cook_extract_directory_gitfast(storage, swhid, fsck=True): """Context manager that cooks a revision containing a directory and extract it, using RevisionGitfastCooker""" test_repo = TestRepo() with test_repo as p: date = TimestampWithTimezone.from_datetime( datetime.datetime.now(datetime.timezone.utc) ) revision = Revision( directory=swhid.object_id, message=b"dummy message", author=Person.from_fullname(b"someone"), committer=Person.from_fullname(b"someone"), date=date, committer_date=date, type=RevisionType.GIT, synthetic=False, ) storage.revision_add([revision]) with cook_stream_revision_gitfast( storage, revision.swhid() ) as stream, test_repo as p: processor = dulwich.fastexport.GitImportProcessor(test_repo.repo) processor.import_stream(stream) test_repo.checkout(b"HEAD") shutil.rmtree(p / ".git") yield p @contextlib.contextmanager def cook_extract_directory_git_bare(storage, swhid, fsck=True, direct_objstorage=False): """Context manager that cooks a revision and extract it, using GitBareCooker""" backend = unittest.mock.MagicMock() backend.storage = storage # Cook the object cooker = GitBareCooker( swhid, backend=backend, storage=storage, objstorage=storage.objstorage if direct_objstorage else None, ) cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) # Extract it with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: tar.extractall(td) # Clone it with Dulwich with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir: clone_dir = pathlib.Path(clone_dir) subprocess.check_call( ["git", "clone", os.path.join(td, f"{swhid}.git"), clone_dir,] ) shutil.rmtree(clone_dir / ".git") yield clone_dir @pytest.fixture( scope="module", params=[ cook_extract_directory_dircooker, cook_extract_directory_gitfast, cook_extract_directory_git_bare, ], ) def cook_extract_directory(request): """A fixture that is instantiated as either cook_extract_directory_dircooker or cook_extract_directory_git_bare.""" return request.param @contextlib.contextmanager def cook_stream_revision_gitfast(storage, swhid): """Context manager that cooks a revision and stream its fastexport.""" backend = unittest.mock.MagicMock() backend.storage = storage cooker = RevisionGitfastCooker(swhid, backend=backend, storage=storage) cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) fastexport_stream = gzip.GzipFile(fileobj=cooker.fileobj) yield fastexport_stream cooker.storage = None @contextlib.contextmanager def cook_extract_revision_gitfast(storage, swhid, fsck=True): """Context manager that cooks a revision and extract it, using RevisionGitfastCooker""" test_repo = TestRepo() with cook_stream_revision_gitfast(storage, swhid) as stream, test_repo as p: processor = dulwich.fastexport.GitImportProcessor(test_repo.repo) processor.import_stream(stream) yield test_repo, p @contextlib.contextmanager def cook_extract_git_bare(storage, swhid, fsck=True): """Context manager that cooks a revision and extract it, using GitBareCooker""" backend = unittest.mock.MagicMock() backend.storage = storage # Cook the object cooker = GitBareCooker(swhid, backend=backend, storage=storage) cooker.use_fsck = fsck # Some tests try edge-cases that git-fsck rejects cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) # Extract it with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: tar.extractall(td) # Clone it with Dulwich with tempfile.TemporaryDirectory(prefix="tmp-vault-clone-") as clone_dir: clone_dir = pathlib.Path(clone_dir) subprocess.check_call( ["git", "clone", os.path.join(td, f"{swhid}.git"), clone_dir,] ) test_repo = TestRepo(clone_dir) with test_repo: yield test_repo, clone_dir @contextlib.contextmanager def cook_extract_revision_git_bare(storage, swhid, fsck=True): with cook_extract_git_bare(storage, swhid, fsck=fsck,) as res: yield res @pytest.fixture( scope="module", params=[cook_extract_revision_gitfast, cook_extract_revision_git_bare], ) def cook_extract_revision(request): """A fixture that is instantiated as either cook_extract_revision_gitfast or cook_extract_revision_git_bare.""" return request.param @contextlib.contextmanager def cook_extract_snapshot_git_bare(storage, swhid, fsck=True): with cook_extract_git_bare(storage, swhid, fsck=fsck,) as res: yield res @pytest.fixture( scope="module", params=[cook_extract_snapshot_git_bare], ) def cook_extract_snapshot(request): """Equivalent to cook_extract_snapshot_git_bare; but analogous to cook_extract_revision in case we ever have more cookers supporting snapshots""" return request.param TEST_CONTENT = ( " test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces " ) TEST_EXECUTABLE = b"\x42\x40\x00\x00\x05" class TestDirectoryCooker: def test_directory_simple(self, git_loader, cook_extract_directory): repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o755) (rp / "link").symlink_to("file") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) c = repo.commit() loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id) with cook_extract_directory(loader.storage, swhid) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "file").read_text() == TEST_CONTENT assert (p / "executable").stat().st_mode == 0o100755 assert (p / "executable").read_bytes() == TEST_EXECUTABLE assert (p / "link").is_symlink() assert os.readlink(str(p / "link")) == "file" assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT directory = from_disk.Directory.from_disk(path=bytes(p)) assert obj_id_hex == hashutil.hash_to_hex(directory.hash) def test_directory_filtered_objects(self, git_loader, cook_extract_directory): repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") file_2, id_2 = hash_content(b"test2") file_3, id_3 = hash_content(b"test3") (rp / "file").write_bytes(file_1) (rp / "hidden_file").write_bytes(file_2) (rp / "absent_file").write_bytes(file_3) c = repo.commit() loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id) # FIXME: storage.content_update() should be changed to allow things # like that with loader.storage.get_db().transaction() as cur: cur.execute( """update content set status = 'visible' where sha1 = %s""", (id_1,), ) cur.execute( """update content set status = 'hidden' where sha1 = %s""", (id_2,), ) cur.execute( """ insert into skipped_content (sha1, sha1_git, sha256, blake2s256, length, reason) select sha1, sha1_git, sha256, blake2s256, length, 'no reason' from content where sha1 = %s """, (id_3,), ) cur.execute("delete from content where sha1 = %s", (id_3,)) with cook_extract_directory(loader.storage, swhid) as p: assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE def test_directory_bogus_perms(self, git_loader, cook_extract_directory): # Some early git repositories have 664/775 permissions... let's check # if all the weird modes are properly normalized in the directory # cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "file").chmod(0o664) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o775) (rp / "wat").write_text(TEST_CONTENT) (rp / "wat").chmod(0o604) # Disable mode cleanup with unittest.mock.patch("dulwich.index.cleanup_mode", lambda mode: mode): c = repo.commit() # Make sure Dulwich didn't normalize the permissions itself. # (if it did, then the test can't check the cooker normalized them) tree_id = repo.repo[c].tree assert {entry.mode for entry in repo.repo[tree_id].items()} == { 0o100775, 0o100664, 0o100604, } # Disable mode checks with unittest.mock.patch("dulwich.objects.Tree.check", lambda self: None): loader = git_loader(str(rp)) loader.load() # Make sure swh-loader didn't normalize them either dir_entries = loader.storage.directory_ls(hashutil.bytehex_to_hash(tree_id)) assert {entry["perms"] for entry in dir_entries} == { 0o100664, 0o100775, 0o100604, } obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id) with cook_extract_directory(loader.storage, swhid) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 @pytest.mark.parametrize("direct_objstorage", [True, False]) def test_directory_objstorage( self, swh_storage, git_loader, mocker, direct_objstorage ): """Like test_directory_simple, but using swh_objstorage directly, without going through swh_storage.content_get_data()""" repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o755) (rp / "link").symlink_to("file") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) c = repo.commit() loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id) # Set-up spies storage_content_get_data = mocker.patch.object( swh_storage, "content_get_data", wraps=swh_storage.content_get_data ) objstorage_content_batch = mocker.patch.object( swh_storage.objstorage, "get_batch", wraps=swh_storage.objstorage.get_batch ) with cook_extract_directory_git_bare( loader.storage, swhid, direct_objstorage=direct_objstorage ) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "file").read_text() == TEST_CONTENT assert (p / "executable").stat().st_mode == 0o100755 assert (p / "executable").read_bytes() == TEST_EXECUTABLE assert (p / "link").is_symlink() assert os.readlink(str(p / "link")) == "file" assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT directory = from_disk.Directory.from_disk(path=bytes(p)) assert obj_id_hex == hashutil.hash_to_hex(directory.hash) if direct_objstorage: storage_content_get_data.assert_not_called() objstorage_content_batch.assert_called() else: storage_content_get_data.assert_called() objstorage_content_batch.assert_not_called() def test_directory_revision_data(self, swh_storage): target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd" dir = Directory( entries=( DirectoryEntry( name=b"submodule", type="rev", target=hashutil.hash_to_bytes(target_rev), perms=0o100644, ), ), ) swh_storage.directory_add([dir]) with cook_extract_directory_dircooker( swh_storage, dir.swhid(), fsck=False ) as p: assert (p / "submodule").is_symlink() assert os.readlink(str(p / "submodule")) == target_rev class RepoFixtures: """Shared loading and checking methods that can be reused by different types of tests.""" def load_repo_simple(self, git_loader): # # 1--2--3--4--5--6--7 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) repo.commit("add file1") (rp / "file2").write_text(TEST_CONTENT) repo.commit("add file2") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) (rp / "bin1").write_bytes(TEST_EXECUTABLE) (rp / "bin1").chmod(0o755) repo.commit("add bin1") (rp / "link1").symlink_to("file1") repo.commit("link link1 to file1") (rp / "file2").unlink() repo.commit("remove file2") (rp / "bin1").rename(rp / "bin") repo.commit("rename bin1 to bin") loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) return (loader, swhid) def check_revision_simple(self, ert, p, swhid): ert.checkout(b"HEAD") assert (p / "file1").stat().st_mode == 0o100644 assert (p / "file1").read_text() == TEST_CONTENT assert (p / "link1").is_symlink() assert os.readlink(str(p / "link1")) == "file1" assert (p / "bin").stat().st_mode == 0o100755 assert (p / "bin").read_bytes() == TEST_EXECUTABLE assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex() def load_repo_two_roots(self, git_loader): # # 1----3---4 # / # 2---- # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Add file1") del repo.repo.refs[b"refs/heads/master"] # git update-ref -d HEAD (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") repo.merge([c1]) (rp / "file3").write_text(TEST_CONTENT) repo.commit("add file3") obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_revision_two_roots(self, ert, p, swhid): assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex() (c3,) = ert.repo[hashutil.hash_to_bytehex(swhid.object_id)].parents assert len(ert.repo[c3].parents) == 2 def load_repo_two_heads(self, git_loader): # # 1---2----4 <-- master and b1 # \ # ----3 <-- b2 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) repo.commit("Add file1") (rp / "file2").write_text(TEST_CONTENT) c2 = repo.commit("Add file2") repo.repo.refs[b"refs/heads/b2"] = c2 # branch b2 from master (rp / "file3").write_text(TEST_CONTENT) repo.commit("add file3", ref=b"refs/heads/b2") (rp / "file4").write_text(TEST_CONTENT) c4 = repo.commit("add file4", ref=b"refs/heads/master") repo.repo.refs[b"refs/heads/b1"] = c4 # branch b1 from master obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_snapshot_two_heads(self, ert, p, swhid): assert ( hashutil.hash_to_bytehex(swhid.object_id) == ert.repo.refs[b"HEAD"] == ert.repo.refs[b"refs/heads/master"] == ert.repo.refs[b"refs/remotes/origin/HEAD"] == ert.repo.refs[b"refs/remotes/origin/master"] == ert.repo.refs[b"refs/remotes/origin/b1"] ) c4_id = hashutil.hash_to_bytehex(swhid.object_id) c3_id = ert.repo.refs[b"refs/remotes/origin/b2"] assert ert.repo[c3_id].parents == ert.repo[c4_id].parents def load_repo_two_double_fork_merge(self, git_loader): # # 2---4---6 # / / / # 1---3---5 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Add file1") # create commit 1 repo.repo.refs[b"refs/heads/c1"] = c1 # branch c1 from master (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") # create commit 2 (rp / "file3").write_text(TEST_CONTENT) c3 = repo.commit("Add file3", ref=b"refs/heads/c1") # create commit 3 on c1 repo.repo.refs[b"refs/heads/c3"] = c3 # branch c3 from c1 repo.merge([c3]) # create commit 4 (rp / "file5").write_text(TEST_CONTENT) c5 = repo.commit("Add file3", ref=b"refs/heads/c3") # create commit 5 on c3 repo.merge([c5]) # create commit 6 obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_revision_two_double_fork_merge(self, ert, p, swhid): assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex() def check_snapshot_two_double_fork_merge(self, ert, p, swhid): assert ( hashutil.hash_to_bytehex(swhid.object_id) == ert.repo.refs[b"HEAD"] == ert.repo.refs[b"refs/heads/master"] == ert.repo.refs[b"refs/remotes/origin/HEAD"] == ert.repo.refs[b"refs/remotes/origin/master"] ) (c4_id, c5_id) = ert.repo[swhid.object_id.hex().encode()].parents assert c5_id == ert.repo.refs[b"refs/remotes/origin/c3"] (c2_id, c3_id) = ert.repo[c4_id].parents assert c3_id == ert.repo.refs[b"refs/remotes/origin/c1"] def load_repo_triple_merge(self, git_loader): # # .---.---5 # / / / # 2 3 4 # / / / # 1---.---. # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Commit 1") repo.repo.refs[b"refs/heads/b1"] = c1 repo.repo.refs[b"refs/heads/b2"] = c1 repo.commit("Commit 2") c3 = repo.commit("Commit 3", ref=b"refs/heads/b1") c4 = repo.commit("Commit 4", ref=b"refs/heads/b2") repo.merge([c3, c4]) obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_revision_triple_merge(self, ert, p, swhid): assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex() def check_snapshot_triple_merge(self, ert, p, swhid): assert ( hashutil.hash_to_bytehex(swhid.object_id) == ert.repo.refs[b"HEAD"] == ert.repo.refs[b"refs/heads/master"] == ert.repo.refs[b"refs/remotes/origin/HEAD"] == ert.repo.refs[b"refs/remotes/origin/master"] ) (c2_id, c3_id, c4_id) = ert.repo[swhid.object_id.hex().encode()].parents assert c3_id == ert.repo.refs[b"refs/remotes/origin/b1"] assert c4_id == ert.repo.refs[b"refs/remotes/origin/b2"] assert ( ert.repo[c2_id].parents == ert.repo[c3_id].parents == ert.repo[c4_id].parents ) def load_repo_filtered_objects(self, git_loader): repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") file_2, id_2 = hash_content(b"test2") file_3, id_3 = hash_content(b"test3") (rp / "file").write_bytes(file_1) (rp / "hidden_file").write_bytes(file_2) (rp / "absent_file").write_bytes(file_3) repo.commit() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() # FIXME: storage.content_update() should be changed to allow things # like that with loader.storage.get_db().transaction() as cur: cur.execute( """update content set status = 'visible' where sha1 = %s""", (id_1,), ) cur.execute( """update content set status = 'hidden' where sha1 = %s""", (id_2,), ) cur.execute( """ insert into skipped_content (sha1, sha1_git, sha256, blake2s256, length, reason) select sha1, sha1_git, sha256, blake2s256, length, 'no reason' from content where sha1 = %s """, (id_3,), ) cur.execute("delete from content where sha1 = %s", (id_3,)) return (loader, swhid) def check_revision_filtered_objects(self, ert, p, swhid): ert.checkout(b"HEAD") assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE def load_repo_null_fields(self, git_loader): # Our schema doesn't enforce a lot of non-null revision fields. We need # to check these cases don't break the cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) c = repo.commit("initial commit") loader = git_loader(str(rp)) loader.load() repo.repo.refs[b"HEAD"].decode() dir_id_hex = repo.repo[c].tree.decode() dir_id = hashutil.hash_to_bytes(dir_id_hex) test_revision = Revision( message=b"", author=Person(name=None, email=None, fullname=b""), date=None, committer=Person(name=None, email=None, fullname=b""), committer_date=None, parents=(), type=RevisionType.GIT, directory=dir_id, metadata={}, synthetic=True, ) storage = loader.storage storage.revision_add([test_revision]) return (loader, test_revision.swhid()) def check_revision_null_fields(self, ert, p, swhid): ert.checkout(b"HEAD") assert (p / "file").stat().st_mode == 0o100644 def load_repo_tags(self, git_loader): # v-- t2 # # 1---2----5 <-- master, t5, and t5a (annotated) # \ # ----3----4 <-- t4a (annotated) # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) repo.commit("Add file1") (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") # create c2 repo.tag(b"t2") (rp / "file3").write_text(TEST_CONTENT) repo.commit("add file3") (rp / "file4").write_text(TEST_CONTENT) repo.commit("add file4") repo.tag(b"t4a", message=b"tag 4") # Go back to c2 repo.git_shell("reset", "--hard", "HEAD^^") (rp / "file5").write_text(TEST_CONTENT) repo.commit("add file5") # create c5 repo.tag(b"t5") repo.tag(b"t5a", message=b"tag 5") obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id) loader = git_loader(str(rp)) loader.load() return (loader, swhid) def check_snapshot_tags(self, ert, p, swhid): assert ( hashutil.hash_to_bytehex(swhid.object_id) == ert.repo.refs[b"HEAD"] == ert.repo.refs[b"refs/heads/master"] == ert.repo.refs[b"refs/remotes/origin/HEAD"] == ert.repo.refs[b"refs/remotes/origin/master"] == ert.repo.refs[b"refs/tags/t5"] ) c2_id = ert.repo.refs[b"refs/tags/t2"] c5_id = hashutil.hash_to_bytehex(swhid.object_id) assert ert.repo[c5_id].parents == [c2_id] t5a = ert.repo[ert.repo.refs[b"refs/tags/t5a"]] # TODO: investigate why new dulwich adds \n assert t5a.message in (b"tag 5", b"tag 5\n") assert t5a.object == (dulwich.objects.Commit, c5_id) t4a = ert.repo[ert.repo.refs[b"refs/tags/t4a"]] (_, c4_id) = t4a.object assert ert.repo[c4_id].message == b"add file4\n" # TODO: ditto (c3_id,) = ert.repo[c4_id].parents assert ert.repo[c3_id].message == b"add file3\n" # TODO: ditto assert ert.repo[c3_id].parents == [c2_id] class TestRevisionCooker(RepoFixtures): def test_revision_simple(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_simple(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_simple(ert, p, swhid) def test_revision_two_roots(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_two_roots(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_two_roots(ert, p, swhid) def test_revision_two_double_fork_merge(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_two_double_fork_merge(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_two_double_fork_merge(ert, p, swhid) def test_revision_triple_merge(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_triple_merge(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_triple_merge(ert, p, swhid) def test_revision_filtered_objects(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_filtered_objects(git_loader) with cook_extract_revision(loader.storage, swhid) as (ert, p): self.check_revision_filtered_objects(ert, p, swhid) def test_revision_null_fields(self, git_loader, cook_extract_revision): (loader, swhid) = self.load_repo_null_fields(git_loader) with cook_extract_revision(loader.storage, swhid, fsck=False) as (ert, p): self.check_revision_null_fields(ert, p, swhid) @pytest.mark.parametrize("ingest_target_revision", [False, True]) def test_revision_submodule( self, swh_storage, cook_extract_revision, ingest_target_revision ): date = TimestampWithTimezone.from_datetime( datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0) ) target_rev = Revision( message=b"target_rev", author=Person.from_fullname(b"me "), date=date, committer=Person.from_fullname(b"me "), committer_date=date, parents=(), type=RevisionType.GIT, directory=bytes.fromhex("3333333333333333333333333333333333333333"), metadata={}, synthetic=True, ) if ingest_target_revision: swh_storage.revision_add([target_rev]) dir = Directory( entries=( DirectoryEntry( name=b"submodule", type="rev", target=target_rev.id, perms=0o160000, ), ), ) swh_storage.directory_add([dir]) rev = Revision( message=b"msg", author=Person.from_fullname(b"me "), date=date, committer=Person.from_fullname(b"me "), committer_date=date, parents=(), type=RevisionType.GIT, directory=dir.id, metadata={}, synthetic=True, ) swh_storage.revision_add([rev]) with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p): ert.checkout(b"HEAD") pattern = b"160000 submodule\x00%s" % target_rev.id tree = ert.repo[b"HEAD"].tree assert pattern in ert.repo[tree].as_raw_string() class TestSnapshotCooker(RepoFixtures): def test_snapshot_simple(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_simple(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_simple(ert, p, main_rev_id) def test_snapshot_two_roots(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_two_roots(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_two_roots(ert, p, main_rev_id) def test_snapshot_two_heads(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_two_heads(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_snapshot_two_heads(ert, p, main_rev_id) def test_snapshot_two_double_fork_merge(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_two_double_fork_merge(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_two_double_fork_merge(ert, p, main_rev_id) self.check_snapshot_two_double_fork_merge(ert, p, main_rev_id) def test_snapshot_triple_merge(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_triple_merge(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_triple_merge(ert, p, main_rev_id) self.check_snapshot_triple_merge(ert, p, main_rev_id) def test_snapshot_filtered_objects(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_filtered_objects(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_revision_filtered_objects(ert, p, main_rev_id) def test_snapshot_tags(self, git_loader, cook_extract_snapshot): (loader, main_rev_id) = self.load_repo_tags(git_loader) snp_id = loader.loaded_snapshot_id swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id) with cook_extract_snapshot(loader.storage, swhid) as (ert, p): self.check_snapshot_tags(ert, p, main_rev_id) + + def test_original_malformed_objects(self, swh_storage, cook_extract_snapshot): + """Tests that objects that were originally malformed: + + * are still interpreted somewhat correctly (if the loader could make sense of + them), especially that they still have links to children + * have their original manifest in the bundle + """ + date = TimestampWithTimezone.from_numeric_offset( + Timestamp(1643819927, 0), 0, False + ) + + content = Content.from_data(b"foo") + swh_storage.content_add([content]) + + # disordered + # fmt: off + malformed_dir_manifest = ( + b"" + + b"100644 file2\x00" + content.sha1_git + + b"100644 file1\x00" + content.sha1_git + ) + # fmt: on + directory = Directory( + entries=( + DirectoryEntry( + name=b"file1", type="file", perms=0o100644, target=content.sha1_git + ), + DirectoryEntry( + name=b"file2", type="file", perms=0o100644, target=content.sha1_git + ), + ), + raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode() + + malformed_dir_manifest, + ) + swh_storage.directory_add([directory]) + + # 'committer' and 'author' swapped + # fmt: off + malformed_rev_manifest = ( + b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n" + + b"committer me 1643819927 +0000\n" + + b"author me 1643819927 +0000\n" + + b"\n" + + b"rev" + ) + # fmt: on + revision = Revision( + message=b"rev", + author=Person.from_fullname(b"me "), + date=date, + committer=Person.from_fullname(b"me "), + committer_date=date, + parents=(), + type=RevisionType.GIT, + directory=directory.id, + synthetic=True, + raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode() + + malformed_rev_manifest, + ) + swh_storage.revision_add([revision]) + + # 'tag' and 'tagger' swapped + # fmt: off + malformed_rel_manifest = ( + b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n" + + b"type commit\n" + + b"tagger me 1643819927 +0000\n" + + b"tag v1.1.0\n" + ) + # fmt: on + + release = Release( + name=b"v1.1.0", + message=None, + author=Person.from_fullname(b"me "), + date=date, + target=revision.id, + target_type=ModelObjectType.REVISION, + synthetic=True, + raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode() + + malformed_rel_manifest, + ) + swh_storage.release_add([release]) + + snapshot = Snapshot( + branches={ + b"refs/tags/v1.1.0": SnapshotBranch( + target=release.id, target_type=TargetType.RELEASE + ), + b"HEAD": SnapshotBranch( + target=revision.id, target_type=TargetType.REVISION + ), + } + ) + swh_storage.snapshot_add([snapshot]) + + with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p): + tag = ert.repo[b"refs/tags/v1.1.0"] + assert tag.as_raw_string() == malformed_rel_manifest + + commit = ert.repo[tag.object[1]] + assert commit.as_raw_string() == malformed_rev_manifest + + tree = ert.repo[commit.tree] + assert tree.as_raw_string() == malformed_dir_manifest diff --git a/swh/vault/tests/test_git_bare_cooker.py b/swh/vault/tests/test_git_bare_cooker.py index 6eadb5e..8a7c008 100644 --- a/swh/vault/tests/test_git_bare_cooker.py +++ b/swh/vault/tests/test_git_bare_cooker.py @@ -1,434 +1,567 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ This module contains additional tests for the bare cooker. Generic cooker tests (eg. without swh-graph) in test_cookers.py also run on the bare cooker. """ import datetime import io import subprocess import tarfile import tempfile import unittest.mock import attr +import dulwich.repo import pytest from pytest import param from swh.model.from_disk import DentryPerms from swh.model.model import ( Content, Directory, DirectoryEntry, ObjectType, Person, Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, + Timestamp, TimestampWithTimezone, ) from swh.vault.cookers.git_bare import GitBareCooker from swh.vault.in_memory_backend import InMemoryVaultBackend @pytest.mark.graph @pytest.mark.parametrize( "snapshot,up_to_date_graph,tag,weird_branches", [ # 'no snp' implies no tag or tree, because there can only be one root object param( False, False, False, False, id="no snp, outdated graph, no tag/tree/blob" ), param(False, True, False, False, id="no snp, updated graph, no tag/tree/blob"), param(True, False, False, False, id="snp, outdated graph, no tag/tree/blob"), param(True, True, False, False, id="snp, updated graph, no tag/tree/blob"), param(True, False, True, False, id="snp, outdated graph, w/ tag, no tree/blob"), param(True, True, True, False, id="snp, updated graph, w/ tag, no tree/blob"), param( True, False, True, True, id="snp, outdated graph, w/ tag, tree, and blob" ), param(True, True, True, True, id="snp, updated graph, w/ tag, tree, and blob"), ], ) def test_graph_revisions(swh_storage, up_to_date_graph, snapshot, tag, weird_branches): r""" Build objects:: snp /|||\ / ||| \ rel2 <----° /|\ \----> rel4 | / | \ | v / v \ v rev1 <------ rev2 <----° dir4 \ rel3 | | | \ | v v v \ | dir1 dir2 dir3 | | | / | | | | v / v v v v cnt1 <----° cnt2 cnt3 cnt4 cnt5 If up_to_date_graph is true, then swh-graph contains all objects. Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph. If tag is False, rel2 is excluded. If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded. """ from swh.graph.naive_client import NaiveClient as GraphClient # Create objects: date = TimestampWithTimezone.from_datetime( datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc) ) author = Person.from_fullname(b"Foo ") cnt1 = Content.from_data(b"correct") cnt2 = Content.from_data(b"horse") cnt3 = Content.from_data(b"battery") cnt4 = Content.from_data(b"staple") cnt5 = Content.from_data(b"Tr0ub4dor&3") dir1 = Directory( entries=( DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), ) ) dir2 = Directory( entries=( DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), DirectoryEntry( name=b"file2", type="file", perms=DentryPerms.content, target=cnt2.sha1_git, ), ) ) dir3 = Directory( entries=( DirectoryEntry( name=b"file3", type="file", perms=DentryPerms.content, target=cnt3.sha1_git, ), ) ) dir4 = Directory( entries=( DirectoryEntry( name=b"directory3", type="dir", perms=DentryPerms.directory, target=dir3.id, ), ) ) rev1 = Revision( message=b"msg1", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, type=RevisionType.GIT, synthetic=True, ) rev2 = Revision( message=b"msg2", date=date, committer_date=date, author=author, committer=author, directory=dir2.id, parents=(rev1.id,), type=RevisionType.GIT, synthetic=True, ) rel2 = Release( name=b"1.0.0", message=b"tag2", target_type=ObjectType.REVISION, target=rev2.id, synthetic=True, ) rel3 = Release( name=b"1.0.0-blob", message=b"tagged-blob", target_type=ObjectType.CONTENT, target=cnt5.sha1_git, synthetic=True, ) rel4 = Release( name=b"1.0.0-weird", message=b"weird release", target_type=ObjectType.RELEASE, target=rel3.id, synthetic=True, ) # Create snapshot: branches = { b"refs/heads/master": SnapshotBranch( target=rev2.id, target_type=TargetType.REVISION ), } if tag: branches[b"refs/tags/1.0.0"] = SnapshotBranch( target=rel2.id, target_type=TargetType.RELEASE ) if weird_branches: branches[b"refs/heads/tree-ref"] = SnapshotBranch( target=dir4.id, target_type=TargetType.DIRECTORY ) branches[b"refs/heads/blob-ref"] = SnapshotBranch( target=cnt4.sha1_git, target_type=TargetType.CONTENT ) branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch( target=rel4.id, target_type=TargetType.RELEASE ) snp = Snapshot(branches=branches) # "Fill" swh-graph if up_to_date_graph: nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp] edges = [ (dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (rev1, dir1), (rev2, dir2), (rev2, rev1), (snp, rev2), ] if tag: nodes.append(rel2) edges.append((rel2, rev2)) edges.append((snp, rel2)) if weird_branches: nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4]) edges.extend( [ (dir3, cnt3), (dir4, dir3), (snp, dir4), (snp, cnt4), (snp, rel4), (rel4, rel3), (rel3, cnt5), ] ) else: nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1] edges = [ (dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (dir3, cnt3), (rev1, dir1), ] if tag: nodes.append(rel2) if weird_branches: nodes.extend([cnt3, dir3]) edges.extend([(dir3, cnt3)]) nodes = [str(n.swhid()) for n in nodes] edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges] # Add all objects to storage swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5]) swh_storage.directory_add([dir1, dir2, dir3, dir4]) swh_storage.revision_add([rev1, rev2]) swh_storage.release_add([rel2, rel3, rel4]) swh_storage.snapshot_add([snp]) # Add spy on swh_storage, to make sure revision_log is not called # (the graph must be used instead) swh_storage = unittest.mock.MagicMock(wraps=swh_storage) # Add all objects to graph swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges)) # Cook backend = InMemoryVaultBackend() if snapshot: cooked_swhid = snp.swhid() else: cooked_swhid = rev2.swhid() cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=swh_graph, ) if weird_branches: # git-fsck now rejects refs pointing to trees and blobs, # but some old git repos have them. cooker.use_fsck = False cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) output = subprocess.check_output( [ "git", "-C", f"{tempdir}/{cooked_swhid}.git", "log", "--format=oneline", "--decorate=", ] ) assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n" # Make sure the graph was used instead of swh_storage.revision_log if snapshot: if up_to_date_graph: # The graph has everything, so the first call succeeds and returns # all objects transitively pointed by the snapshot swh_graph.visit_nodes.assert_has_calls( [unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"),] ) else: # The graph does not have everything, so the first call returns nothing. # However, the second call (on the top rev) succeeds and returns # all objects but the rev and the rel swh_graph.visit_nodes.assert_has_calls( [ unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"), unittest.mock.call(str(rev2.swhid()), edges="rev:rev"), ] ) else: swh_graph.visit_nodes.assert_has_calls( [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")] ) if up_to_date_graph: swh_storage.revision_log.assert_not_called() swh_storage.revision_shortlog.assert_not_called() else: swh_storage.revision_log.assert_called() @pytest.mark.parametrize( "mismatch_on", ["content", "directory", "revision1", "revision2", "none"] ) def test_checksum_mismatch(swh_storage, mismatch_on): date = TimestampWithTimezone.from_datetime( datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc) ) author = Person.from_fullname(b"Foo ") wrong_hash = b"\x12\x34" * 10 cnt1 = Content.from_data(b"Tr0ub4dor&3") if mismatch_on == "content": cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash) dir1 = Directory( entries=( DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), ) ) if mismatch_on == "directory": dir1 = attr.evolve(dir1, id=wrong_hash) rev1 = Revision( message=b"msg1", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, type=RevisionType.GIT, synthetic=True, ) if mismatch_on == "revision1": rev1 = attr.evolve(rev1, id=wrong_hash) rev2 = Revision( message=b"msg2", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, parents=(rev1.id,), type=RevisionType.GIT, synthetic=True, ) if mismatch_on == "revision2": rev2 = attr.evolve(rev2, id=wrong_hash) cooked_swhid = rev2.swhid() swh_storage.content_add([cnt1]) swh_storage.directory_add([dir1]) swh_storage.revision_add([rev1, rev2]) backend = InMemoryVaultBackend() cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=None, ) cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) if mismatch_on != "revision2": # git-log fails if the head revision is corrupted # TODO: we need to find a way to make this somewhat usable output = subprocess.check_output( [ "git", "-C", f"{tempdir}/{cooked_swhid}.git", "log", "--format=oneline", "--decorate=", ] ) assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n" + + +@pytest.mark.parametrize( + "use_graph", + [ + pytest.param(False, id="without-graph"), + pytest.param(True, id="with-graph", marks=pytest.mark.graph), + ], +) +def test_ignore_displayname(swh_storage, use_graph): + """Tests the original authorship information is used instead of + configured display names; otherwise objects would not match their hash, + and git-fsck/git-clone would fail. + + This tests both with and without swh-graph, as both configurations use different + code paths to fetch revisions. + """ + + date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0), 0, False) + legacy_person = Person.from_fullname(b"old me ") + current_person = Person.from_fullname(b"me ") + + content = Content.from_data(b"foo") + swh_storage.content_add([content]) + + directory = Directory( + entries=( + DirectoryEntry( + name=b"file1", type="file", perms=0o100644, target=content.sha1_git + ), + ), + ) + swh_storage.directory_add([directory]) + + revision = Revision( + message=b"rev", + author=legacy_person, + date=date, + committer=legacy_person, + committer_date=date, + parents=(), + type=RevisionType.GIT, + directory=directory.id, + synthetic=True, + ) + swh_storage.revision_add([revision]) + + release = Release( + name=b"v1.1.0", + message=None, + author=legacy_person, + date=date, + target=revision.id, + target_type=ObjectType.REVISION, + synthetic=True, + ) + swh_storage.release_add([release]) + + snapshot = Snapshot( + branches={ + b"refs/tags/v1.1.0": SnapshotBranch( + target=release.id, target_type=TargetType.RELEASE + ), + b"HEAD": SnapshotBranch( + target=revision.id, target_type=TargetType.REVISION + ), + } + ) + swh_storage.snapshot_add([snapshot]) + + # Add all objects to graph + if use_graph: + from swh.graph.naive_client import NaiveClient as GraphClient + + nodes = [ + str(x.swhid()) for x in [content, directory, revision, release, snapshot] + ] + edges = [ + (str(x.swhid()), str(y.swhid())) + for (x, y) in [ + (directory, content), + (revision, directory), + (release, revision), + (snapshot, release), + (snapshot, revision), + ] + ] + swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges)) + else: + swh_graph = None + + # Set a display name + with swh_storage.db() as db: + with db.transaction() as cur: + cur.execute( + "UPDATE person set displayname = %s where fullname = %s", + (current_person.fullname, legacy_person.fullname), + ) + + # Check the display name did apply in the storage + assert swh_storage.revision_get([revision.id])[0] == attr.evolve( + revision, author=current_person, committer=current_person, + ) + + # Cook + cooked_swhid = snapshot.swhid() + backend = InMemoryVaultBackend() + cooker = GitBareCooker( + cooked_swhid, backend=backend, storage=swh_storage, graph=swh_graph, + ) + + cooker.cook() + + # Get bundle + bundle = backend.fetch("git_bare", cooked_swhid) + + # Extract bundle and make sure both revisions are in it + with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: + with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: + tf.extractall(tempdir) + + # If we are here, it means git-fsck succeeded when called by cooker.cook(), + # so we already know the original person was used. Let's double-check. + + repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git") + + tag = repo[b"refs/tags/v1.1.0"] + assert tag.tagger == legacy_person.fullname + + commit = repo[tag.object[1]] + assert commit.author == legacy_person.fullname diff --git a/tox.ini b/tox.ini index d776af5..7c64f9e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,76 +1,76 @@ [tox] envlist=black,flake8,mypy,py3 [testenv] extras = testing graph deps = pytest-cov commands = pytest --cov={envsitepackagesdir}/swh/vault \ {envsitepackagesdir}/swh/vault \ --cov-branch {posargs} [testenv:black] skip_install = true deps = black==19.10b0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = flake8 commands = {envpython} -m flake8 [testenv:mypy] extras = testing graph deps = - mypy + mypy==0.920 commands = mypy swh # build documentation outside swh-environment using the current # git HEAD of swh-docs, is executed on CI for each diff to prevent # breaking doc build [testenv:sphinx] whitelist_externals = make usedevelop = true extras = testing graph deps = # fetch and install swh-docs in develop mode -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors SPHINXOPTS = -W commands = make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs # build documentation only inside swh-environment using local state # of swh-docs package [testenv:sphinx-dev] whitelist_externals = make usedevelop = true extras = testing graph deps = # install swh-docs in develop mode -e ../swh-docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors SPHINXOPTS = -W commands = make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs