diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1a2dcf9c..b80dd837 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,57 +1,49 @@ repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.4.0 - hooks: - - id: trailing-whitespace - - id: check-json - - id: check-yaml + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: trailing-whitespace + - id: check-json + - id: check-yaml -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 - hooks: - - id: flake8 + - repo: https://gitlab.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 -- repo: https://github.com/codespell-project/codespell - rev: v1.16.0 - hooks: - - id: codespell - exclude: TODO - args: [-L iff, -L gae] + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + name: Check source code spelling + args: [-L iff, -L gae, -L sur] + stages: [commit] + - id: codespell + name: Check commit message spelling + stages: [commit-msg] -- repo: local - hooks: - - id: mypy - name: mypy - entry: mypy - args: [swh] - pass_filenames: false - language: system - types: [python] + - repo: local + hooks: + - id: mypy + name: mypy + entry: mypy + args: [swh] + pass_filenames: false + language: system + types: [python] + - id: check-bumped-dbversion + name: check-bumped-dbversion + files: 'sql/upgrades/.*\.sql' + entry: grep + args: ["insert into dbversion"] + language: system - - id: check-bumped-dbversion - name: check-bumped-dbversion - files: 'sql/upgrades/.*\.sql' - entry: grep - args: ['insert into dbversion'] - language: system + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 + hooks: + - id: isort -- repo: https://github.com/PyCQA/isort - rev: 5.5.2 - hooks: - - id: isort - -- repo: https://github.com/python/black - rev: 19.10b0 - hooks: - - id: black - -# unfortunately, we are far from being able to enable this... -#- repo: https://github.com/PyCQA/pydocstyle.git -# rev: 4.0.0 -# hooks: -# - id: pydocstyle -# name: pydocstyle -# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. 
-# entry: pydocstyle --convention=google -# language: python -# types: [python] + - repo: https://github.com/python/black + rev: 19.10b0 + hooks: + - id: black diff --git a/PKG-INFO b/PKG-INFO index a4dc5adc..b0e720db 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,250 +1,250 @@ Metadata-Version: 2.1 Name: swh.storage -Version: 0.43.1 +Version: 1.0.0 Summary: Software Heritage storage manager Home-page: https://forge.softwareheritage.org/diffusion/DSTO/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-storage Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-storage/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: journal License-File: LICENSE License-File: AUTHORS swh-storage =========== Abstraction layer over the archive, allowing to access all stored source code artifacts as well as their metadata. See the [documentation](https://docs.softwareheritage.org/devel/swh-storage/index.html) for more details. ## Quick start ### Dependencies Python tests for this module include tests that cannot be run without a local Postgresql database, so you need the Postgresql server executable on your machine (no need to have a running Postgresql server). They also expect a cassandra server. #### Debian-like host ``` $ sudo apt install libpq-dev postgresql-11 cassandra ``` #### Non Debian-like host The tests expects the path to `cassandra` to either be unspecified, it is then looked up at `/usr/sbin/cassandra`, either specified through the environment variable `SWH_CASSANDRA_BIN`. Optionally, you can avoid running the cassandra tests. ``` (swh) :~/swh-storage$ tox -- -m 'not cassandra' ``` ### Installation It is strongly recommended to use a virtualenv. In the following, we consider you work in a virtualenv named `swh`. See the [developer setup guide](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup) for a more details on how to setup a working environment. You can install the package directly from [pypi](https://pypi.org/p/swh.storage): ``` (swh) :~$ pip install swh.storage [...] ``` Or from sources: ``` (swh) :~$ git clone https://forge.softwareheritage.org/source/swh-storage.git [...] (swh) :~$ cd swh-storage (swh) :~/swh-storage$ pip install . [...] ``` Then you can check it's properly installed: ``` (swh) :~$ swh storage --help Usage: swh storage [OPTIONS] COMMAND [ARGS]... Software Heritage Storage tools. Options: -h, --help Show this message and exit. Commands: rpc-serve Software Heritage Storage RPC server. ``` ## Tests The best way of running Python tests for this module is to use [tox](https://tox.readthedocs.io/). ``` (swh) :~$ pip install tox ``` ### tox From the sources directory, simply use tox: ``` (swh) :~/swh-storage$ tox [...] 
========= 315 passed, 6 skipped, 15 warnings in 40.86 seconds ========== _______________________________ summary ________________________________ flake8: commands succeeded py3: commands succeeded congratulations :) ``` Note: it is possible to set the `JAVA_HOME` environment variable to specify the version of the JVM to be used by Cassandra. For example, at the time of writing this, Cassandra does not support java 14, so one may want to use for example java 11: ``` (swh) :~/swh-storage$ export JAVA_HOME=/usr/lib/jvm/java-14-openjdk-amd64/bin/java (swh) :~/swh-storage$ tox [...] ``` ## Development The storage server can be locally started. It requires a configuration file and a running Postgresql database. ### Sample configuration A typical configuration `storage.yml` file is: ``` storage: cls: postgresql db: "dbname=softwareheritage-dev user= password=" objstorage: cls: pathslicing root: /tmp/swh-storage/ slicing: 0:2/2:4/4:6 ``` which means, this uses: - a local storage instance whose db connection is to `softwareheritage-dev` local instance, - the objstorage uses a local objstorage instance whose: - `root` path is /tmp/swh-storage, - slicing scheme is `0:2/2:4/4:6`. This means that the identifier of the content (sha1) which will be stored on disk at first level with the first 2 hex characters, the second level with the next 2 hex characters and the third level with the next 2 hex characters. And finally the complete hash file holding the raw content. For example: 00062f8bd330715c4f819373653d97b3cd34394c will be stored at 00/06/2f/00062f8bd330715c4f819373653d97b3cd34394c Note that the `root` path should exist on disk before starting the server. ### Starting the storage server If the python package has been properly installed (e.g. in a virtual env), you should be able to use the command: ``` (swh) :~/swh-storage$ swh storage rpc-serve storage.yml ``` This runs a local swh-storage api at 5002 port. ``` (swh) :~/swh-storage$ curl http://127.0.0.1:5002 Software Heritage storage server

You have reached the Software Heritage storage server.
See its documentation and API for more information

``` ### And then what? In your upper layer ([loader-git](https://forge.softwareheritage.org/source/swh-loader-git/), [loader-svn](https://forge.softwareheritage.org/source/swh-loader-svn/), etc...), you can define a remote storage with this snippet of yaml configuration. ``` storage: cls: remote url: http://localhost:5002/ ``` You could directly define a postgresql storage with the following snippet: ``` storage: cls: postgresql db: service=swh-dev objstorage: cls: pathslicing root: /home/storage/swh-storage/ slicing: 0:2/2:4/4:6 ``` ## Cassandra As an alternative to PostgreSQL, swh-storage can use Cassandra as a database backend. It can be used like this: ``` storage: cls: cassandra hosts: - localhost objstorage: cls: pathslicing root: /home/storage/swh-storage/ slicing: 0:2/2:4/4:6 ``` The Cassandra swh-storage implementation supports both Cassandra >= 4.0-alpha2 and ScyllaDB >= 4.4 (and possibly earlier versions, but this is untested). While the main code supports both transparently, running tests or configuring the schema requires specific code when using ScyllaDB, enabled by setting the `SWH_USE_SCYLLADB=1` environment variable. diff --git a/requirements-swh.txt b/requirements-swh.txt index 8d375057..bacba9c8 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ -swh.core[db,http] >= 0.14.0 +swh.core[db,http] >= 2 swh.counters >= v0.8.0 swh.model >= 4.4.0 swh.objstorage >= 0.2.2 diff --git a/requirements-test.txt b/requirements-test.txt index a33e143a..44d0d3f2 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,16 +1,17 @@ hypothesis >= 3.11.0 pytest < 7.0.0 # v7.0.0 removed _pytest.tmpdir.TempdirFactory, which is used by some of the pytest plugins we use pytest-mock # pytz is in fact a dep of swh.model[testing] and should not be necessary, but # the dep on swh.model in the main requirements-swh.txt file shadows this one # adding the [testing] extra. swh.model[testing] >= 0.0.50 pytz pytest-redis pytest-xdist types-python-dateutil types-pytz types-pyyaml types-redis types-requests +types-toml diff --git a/swh.storage.egg-info/PKG-INFO b/swh.storage.egg-info/PKG-INFO index a4dc5adc..b0e720db 100644 --- a/swh.storage.egg-info/PKG-INFO +++ b/swh.storage.egg-info/PKG-INFO @@ -1,250 +1,250 @@ Metadata-Version: 2.1 Name: swh.storage -Version: 0.43.1 +Version: 1.0.0 Summary: Software Heritage storage manager Home-page: https://forge.softwareheritage.org/diffusion/DSTO/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-storage Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-storage/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: journal License-File: LICENSE License-File: AUTHORS swh-storage =========== Abstraction layer over the archive, allowing to access all stored source code artifacts as well as their metadata. See the [documentation](https://docs.softwareheritage.org/devel/swh-storage/index.html) for more details. 
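The `storage:` yaml snippets in the README above map onto the `get_storage` factory defined in `swh/storage/__init__.py` (also part of this diff), which additionally accepts a `pipeline` class for stacking the proxy storages in front of a backend. A minimal sketch, assuming an RPC server is already listening on localhost:5002 and using an illustrative filter/buffer proxy stack (not something prescribed by this change):

```
# Sketch only: the URL and the proxy combination are illustrative assumptions.
from swh.storage import get_storage

# Equivalent of the "remote" yaml snippet shown above:
storage = get_storage(cls="remote", url="http://localhost:5002/")

# A "pipeline" configuration, where each step wraps the next one and the
# last step is the actual backend:
storage = get_storage(
    cls="pipeline",
    steps=[
        {"cls": "filter"},
        {"cls": "buffer"},
        {"cls": "remote", "url": "http://localhost:5002/"},
    ],
)
```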
## Quick start ### Dependencies Python tests for this module include tests that cannot be run without a local Postgresql database, so you need the Postgresql server executable on your machine (no need to have a running Postgresql server). They also expect a cassandra server. #### Debian-like host ``` $ sudo apt install libpq-dev postgresql-11 cassandra ``` #### Non Debian-like host The tests expects the path to `cassandra` to either be unspecified, it is then looked up at `/usr/sbin/cassandra`, either specified through the environment variable `SWH_CASSANDRA_BIN`. Optionally, you can avoid running the cassandra tests. ``` (swh) :~/swh-storage$ tox -- -m 'not cassandra' ``` ### Installation It is strongly recommended to use a virtualenv. In the following, we consider you work in a virtualenv named `swh`. See the [developer setup guide](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup) for a more details on how to setup a working environment. You can install the package directly from [pypi](https://pypi.org/p/swh.storage): ``` (swh) :~$ pip install swh.storage [...] ``` Or from sources: ``` (swh) :~$ git clone https://forge.softwareheritage.org/source/swh-storage.git [...] (swh) :~$ cd swh-storage (swh) :~/swh-storage$ pip install . [...] ``` Then you can check it's properly installed: ``` (swh) :~$ swh storage --help Usage: swh storage [OPTIONS] COMMAND [ARGS]... Software Heritage Storage tools. Options: -h, --help Show this message and exit. Commands: rpc-serve Software Heritage Storage RPC server. ``` ## Tests The best way of running Python tests for this module is to use [tox](https://tox.readthedocs.io/). ``` (swh) :~$ pip install tox ``` ### tox From the sources directory, simply use tox: ``` (swh) :~/swh-storage$ tox [...] ========= 315 passed, 6 skipped, 15 warnings in 40.86 seconds ========== _______________________________ summary ________________________________ flake8: commands succeeded py3: commands succeeded congratulations :) ``` Note: it is possible to set the `JAVA_HOME` environment variable to specify the version of the JVM to be used by Cassandra. For example, at the time of writing this, Cassandra does not support java 14, so one may want to use for example java 11: ``` (swh) :~/swh-storage$ export JAVA_HOME=/usr/lib/jvm/java-14-openjdk-amd64/bin/java (swh) :~/swh-storage$ tox [...] ``` ## Development The storage server can be locally started. It requires a configuration file and a running Postgresql database. ### Sample configuration A typical configuration `storage.yml` file is: ``` storage: cls: postgresql db: "dbname=softwareheritage-dev user= password=" objstorage: cls: pathslicing root: /tmp/swh-storage/ slicing: 0:2/2:4/4:6 ``` which means, this uses: - a local storage instance whose db connection is to `softwareheritage-dev` local instance, - the objstorage uses a local objstorage instance whose: - `root` path is /tmp/swh-storage, - slicing scheme is `0:2/2:4/4:6`. This means that the identifier of the content (sha1) which will be stored on disk at first level with the first 2 hex characters, the second level with the next 2 hex characters and the third level with the next 2 hex characters. And finally the complete hash file holding the raw content. For example: 00062f8bd330715c4f819373653d97b3cd34394c will be stored at 00/06/2f/00062f8bd330715c4f819373653d97b3cd34394c Note that the `root` path should exist on disk before starting the server. ### Starting the storage server If the python package has been properly installed (e.g. 
in a virtual env), you should be able to use the command: ``` (swh) :~/swh-storage$ swh storage rpc-serve storage.yml ``` This runs a local swh-storage api at 5002 port. ``` (swh) :~/swh-storage$ curl http://127.0.0.1:5002 Software Heritage storage server

You have reached the Software Heritage storage server.
See its documentation and API for more information

``` ### And then what? In your upper layer ([loader-git](https://forge.softwareheritage.org/source/swh-loader-git/), [loader-svn](https://forge.softwareheritage.org/source/swh-loader-svn/), etc...), you can define a remote storage with this snippet of yaml configuration. ``` storage: cls: remote url: http://localhost:5002/ ``` You could directly define a postgresql storage with the following snippet: ``` storage: cls: postgresql db: service=swh-dev objstorage: cls: pathslicing root: /home/storage/swh-storage/ slicing: 0:2/2:4/4:6 ``` ## Cassandra As an alternative to PostgreSQL, swh-storage can use Cassandra as a database backend. It can be used like this: ``` storage: cls: cassandra hosts: - localhost objstorage: cls: pathslicing root: /home/storage/swh-storage/ slicing: 0:2/2:4/4:6 ``` The Cassandra swh-storage implementation supports both Cassandra >= 4.0-alpha2 and ScyllaDB >= 4.4 (and possibly earlier versions, but this is untested). While the main code supports both transparently, running tests or configuring the schema requires specific code when using ScyllaDB, enabled by setting the `SWH_USE_SCYLLADB=1` environment variable. diff --git a/swh.storage.egg-info/SOURCES.txt b/swh.storage.egg-info/SOURCES.txt index 3f465599..bf07ad3b 100644 --- a/swh.storage.egg-info/SOURCES.txt +++ b/swh.storage.egg-info/SOURCES.txt @@ -1,337 +1,337 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile Makefile.local README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh-journal.txt requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini ./requirements-swh-journal.txt ./requirements-swh.txt ./requirements-test.txt ./requirements.txt bin/swh-storage-add-dir docs/.gitignore docs/Makefile docs/Makefile.local docs/archive-copies.rst docs/cli.rst docs/conf.py docs/extrinsic-metadata-specification.rst docs/index.rst docs/sql-storage.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/.gitignore docs/images/Makefile docs/images/swh-archive-copies.dia sql/.gitignore sql/Makefile sql/TODO sql/clusters.dot sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json sql/doc/json/.gitignore sql/doc/json/Makefile sql/doc/json/entity.lister_metadata.schema.json sql/doc/json/entity.metadata.schema.json sql/doc/json/entity_history.lister_metadata.schema.json sql/doc/json/entity_history.metadata.schema.json sql/doc/json/fetch_history.result.schema.json sql/doc/json/list_history.result.schema.json sql/doc/json/listable_entity.list_params.schema.json sql/doc/json/origin_visit.metadata.json sql/doc/json/tool.tool_configuration.schema.json sql/json/.gitignore sql/json/Makefile sql/json/entity.lister_metadata.schema.json sql/json/entity.metadata.schema.json sql/json/entity_history.lister_metadata.schema.json sql/json/entity_history.metadata.schema.json sql/json/fetch_history.result.schema.json sql/json/list_history.result.schema.json sql/json/listable_entity.list_params.schema.json sql/json/origin_visit.metadata.json sql/json/tool.tool_configuration.schema.json -sql/upgrades/015.sql -sql/upgrades/016.sql -sql/upgrades/017.sql -sql/upgrades/018.sql -sql/upgrades/019.sql -sql/upgrades/020.sql -sql/upgrades/021.sql -sql/upgrades/022.sql -sql/upgrades/023.sql -sql/upgrades/024.sql -sql/upgrades/025.sql -sql/upgrades/026.sql -sql/upgrades/027.sql -sql/upgrades/028.sql -sql/upgrades/029.sql -sql/upgrades/030.sql -sql/upgrades/032.sql -sql/upgrades/033.sql -sql/upgrades/034.sql -sql/upgrades/035.sql 
-sql/upgrades/036.sql -sql/upgrades/037.sql -sql/upgrades/038.sql -sql/upgrades/039.sql -sql/upgrades/040.sql -sql/upgrades/041.sql -sql/upgrades/042.sql -sql/upgrades/043.sql -sql/upgrades/044.sql -sql/upgrades/045.sql -sql/upgrades/046.sql -sql/upgrades/047.sql -sql/upgrades/048.sql -sql/upgrades/049.sql -sql/upgrades/050.sql -sql/upgrades/051.sql -sql/upgrades/052.sql -sql/upgrades/053.sql -sql/upgrades/054.sql -sql/upgrades/055.sql -sql/upgrades/056.sql -sql/upgrades/057.sql -sql/upgrades/058.sql -sql/upgrades/059.sql -sql/upgrades/060.sql -sql/upgrades/061.sql -sql/upgrades/062.sql -sql/upgrades/063.sql -sql/upgrades/064.sql -sql/upgrades/065.sql -sql/upgrades/066.sql -sql/upgrades/067.sql -sql/upgrades/068.sql -sql/upgrades/069.sql -sql/upgrades/070.sql -sql/upgrades/071.sql -sql/upgrades/072.sql -sql/upgrades/073.sql -sql/upgrades/074.sql -sql/upgrades/075.sql -sql/upgrades/076.sql -sql/upgrades/077.sql -sql/upgrades/078.sql -sql/upgrades/079.sql -sql/upgrades/080.sql -sql/upgrades/081.sql -sql/upgrades/082.sql -sql/upgrades/083.sql -sql/upgrades/084.sql -sql/upgrades/085.sql -sql/upgrades/086.sql -sql/upgrades/087.sql -sql/upgrades/088.sql -sql/upgrades/089.sql -sql/upgrades/090.sql -sql/upgrades/091.sql -sql/upgrades/092.sql -sql/upgrades/093.sql -sql/upgrades/094.sql -sql/upgrades/095.sql -sql/upgrades/096.sql -sql/upgrades/097.sql -sql/upgrades/098.sql -sql/upgrades/099.sql -sql/upgrades/100.sql -sql/upgrades/101.sql -sql/upgrades/102.sql -sql/upgrades/103.sql -sql/upgrades/104.sql -sql/upgrades/105.sql -sql/upgrades/106.sql -sql/upgrades/107.sql -sql/upgrades/108.sql -sql/upgrades/109.sql -sql/upgrades/110.sql -sql/upgrades/111.sql -sql/upgrades/112.sql -sql/upgrades/113.sql -sql/upgrades/114.sql -sql/upgrades/115.sql -sql/upgrades/116.sql -sql/upgrades/117.sql -sql/upgrades/118.sql -sql/upgrades/119.sql -sql/upgrades/120.sql -sql/upgrades/121.sql -sql/upgrades/122.sql -sql/upgrades/123.sql -sql/upgrades/124.sql -sql/upgrades/125.sql -sql/upgrades/126.sql -sql/upgrades/127.sql -sql/upgrades/128.sql -sql/upgrades/129.sql -sql/upgrades/130.sql -sql/upgrades/131.sql -sql/upgrades/132.sql -sql/upgrades/133.sql -sql/upgrades/134.sql -sql/upgrades/135.sql -sql/upgrades/136.sql -sql/upgrades/137.sql -sql/upgrades/138.sql -sql/upgrades/139.sql -sql/upgrades/140.sql -sql/upgrades/141.sql -sql/upgrades/142.sql -sql/upgrades/143.sql -sql/upgrades/144.sql -sql/upgrades/145.sql -sql/upgrades/146.sql -sql/upgrades/147.sql -sql/upgrades/148.sql -sql/upgrades/149.sql -sql/upgrades/150.sql -sql/upgrades/151.sql -sql/upgrades/152.sql -sql/upgrades/153.sql -sql/upgrades/154.sql -sql/upgrades/155.sql -sql/upgrades/156.sql -sql/upgrades/157.sql -sql/upgrades/158.sql -sql/upgrades/159.sql -sql/upgrades/160.sql -sql/upgrades/161.sql -sql/upgrades/162.sql -sql/upgrades/163.sql -sql/upgrades/164.sql -sql/upgrades/165.sql -sql/upgrades/166.sql -sql/upgrades/167.sql -sql/upgrades/168.sql -sql/upgrades/169.sql -sql/upgrades/170.sql -sql/upgrades/171.sql -sql/upgrades/172.sql -sql/upgrades/173.sql -sql/upgrades/174.sql -sql/upgrades/175.sql -sql/upgrades/176.sql -sql/upgrades/177.sql -sql/upgrades/178.sql -sql/upgrades/179.sql -sql/upgrades/180.sql -sql/upgrades/181.sql -sql/upgrades/182.sql swh/__init__.py swh.storage.egg-info/PKG-INFO swh.storage.egg-info/SOURCES.txt swh.storage.egg-info/dependency_links.txt swh.storage.egg-info/entry_points.txt swh.storage.egg-info/requires.txt swh.storage.egg-info/top_level.txt swh/storage/__init__.py swh/storage/backfill.py swh/storage/cli.py swh/storage/common.py 
swh/storage/exc.py swh/storage/fixer.py swh/storage/in_memory.py swh/storage/interface.py swh/storage/metrics.py swh/storage/migrate_extrinsic_metadata.py swh/storage/objstorage.py swh/storage/py.typed swh/storage/pytest_plugin.py swh/storage/replay.py swh/storage/utils.py swh/storage/writer.py swh/storage/algos/__init__.py swh/storage/algos/diff.py swh/storage/algos/dir_iterators.py swh/storage/algos/origin.py swh/storage/algos/revisions_walker.py swh/storage/algos/snapshot.py swh/storage/api/__init__.py swh/storage/api/client.py swh/storage/api/serializers.py swh/storage/api/server.py swh/storage/cassandra/__init__.py swh/storage/cassandra/common.py swh/storage/cassandra/converters.py swh/storage/cassandra/cql.py swh/storage/cassandra/model.py swh/storage/cassandra/schema.py swh/storage/cassandra/storage.py swh/storage/postgresql/__init__.py swh/storage/postgresql/converters.py swh/storage/postgresql/db.py swh/storage/postgresql/storage.py swh/storage/proxies/buffer.py swh/storage/proxies/counter.py swh/storage/proxies/filter.py swh/storage/proxies/retry.py swh/storage/proxies/tenacious.py swh/storage/proxies/validate.py swh/storage/sql/10-superuser-init.sql swh/storage/sql/15-flavor.sql swh/storage/sql/20-enums.sql swh/storage/sql/30-schema.sql swh/storage/sql/40-funcs.sql swh/storage/sql/60-indexes.sql swh/storage/sql/logical_replication/replication_source.sql +swh/storage/sql/upgrades/015.sql +swh/storage/sql/upgrades/016.sql +swh/storage/sql/upgrades/017.sql +swh/storage/sql/upgrades/018.sql +swh/storage/sql/upgrades/019.sql +swh/storage/sql/upgrades/020.sql +swh/storage/sql/upgrades/021.sql +swh/storage/sql/upgrades/022.sql +swh/storage/sql/upgrades/023.sql +swh/storage/sql/upgrades/024.sql +swh/storage/sql/upgrades/025.sql +swh/storage/sql/upgrades/026.sql +swh/storage/sql/upgrades/027.sql +swh/storage/sql/upgrades/028.sql +swh/storage/sql/upgrades/029.sql +swh/storage/sql/upgrades/030.sql +swh/storage/sql/upgrades/032.sql +swh/storage/sql/upgrades/033.sql +swh/storage/sql/upgrades/034.sql +swh/storage/sql/upgrades/035.sql +swh/storage/sql/upgrades/036.sql +swh/storage/sql/upgrades/037.sql +swh/storage/sql/upgrades/038.sql +swh/storage/sql/upgrades/039.sql +swh/storage/sql/upgrades/040.sql +swh/storage/sql/upgrades/041.sql +swh/storage/sql/upgrades/042.sql +swh/storage/sql/upgrades/043.sql +swh/storage/sql/upgrades/044.sql +swh/storage/sql/upgrades/045.sql +swh/storage/sql/upgrades/046.sql +swh/storage/sql/upgrades/047.sql +swh/storage/sql/upgrades/048.sql +swh/storage/sql/upgrades/049.sql +swh/storage/sql/upgrades/050.sql +swh/storage/sql/upgrades/051.sql +swh/storage/sql/upgrades/052.sql +swh/storage/sql/upgrades/053.sql +swh/storage/sql/upgrades/054.sql +swh/storage/sql/upgrades/055.sql +swh/storage/sql/upgrades/056.sql +swh/storage/sql/upgrades/057.sql +swh/storage/sql/upgrades/058.sql +swh/storage/sql/upgrades/059.sql +swh/storage/sql/upgrades/060.sql +swh/storage/sql/upgrades/061.sql +swh/storage/sql/upgrades/062.sql +swh/storage/sql/upgrades/063.sql +swh/storage/sql/upgrades/064.sql +swh/storage/sql/upgrades/065.sql +swh/storage/sql/upgrades/066.sql +swh/storage/sql/upgrades/067.sql +swh/storage/sql/upgrades/068.sql +swh/storage/sql/upgrades/069.sql +swh/storage/sql/upgrades/070.sql +swh/storage/sql/upgrades/071.sql +swh/storage/sql/upgrades/072.sql +swh/storage/sql/upgrades/073.sql +swh/storage/sql/upgrades/074.sql +swh/storage/sql/upgrades/075.sql +swh/storage/sql/upgrades/076.sql +swh/storage/sql/upgrades/077.sql +swh/storage/sql/upgrades/078.sql 
+swh/storage/sql/upgrades/079.sql +swh/storage/sql/upgrades/080.sql +swh/storage/sql/upgrades/081.sql +swh/storage/sql/upgrades/082.sql +swh/storage/sql/upgrades/083.sql +swh/storage/sql/upgrades/084.sql +swh/storage/sql/upgrades/085.sql +swh/storage/sql/upgrades/086.sql +swh/storage/sql/upgrades/087.sql +swh/storage/sql/upgrades/088.sql +swh/storage/sql/upgrades/089.sql +swh/storage/sql/upgrades/090.sql +swh/storage/sql/upgrades/091.sql +swh/storage/sql/upgrades/092.sql +swh/storage/sql/upgrades/093.sql +swh/storage/sql/upgrades/094.sql +swh/storage/sql/upgrades/095.sql +swh/storage/sql/upgrades/096.sql +swh/storage/sql/upgrades/097.sql +swh/storage/sql/upgrades/098.sql +swh/storage/sql/upgrades/099.sql +swh/storage/sql/upgrades/100.sql +swh/storage/sql/upgrades/101.sql +swh/storage/sql/upgrades/102.sql +swh/storage/sql/upgrades/103.sql +swh/storage/sql/upgrades/104.sql +swh/storage/sql/upgrades/105.sql +swh/storage/sql/upgrades/106.sql +swh/storage/sql/upgrades/107.sql +swh/storage/sql/upgrades/108.sql +swh/storage/sql/upgrades/109.sql +swh/storage/sql/upgrades/110.sql +swh/storage/sql/upgrades/111.sql +swh/storage/sql/upgrades/112.sql +swh/storage/sql/upgrades/113.sql +swh/storage/sql/upgrades/114.sql +swh/storage/sql/upgrades/115.sql +swh/storage/sql/upgrades/116.sql +swh/storage/sql/upgrades/117.sql +swh/storage/sql/upgrades/118.sql +swh/storage/sql/upgrades/119.sql +swh/storage/sql/upgrades/120.sql +swh/storage/sql/upgrades/121.sql +swh/storage/sql/upgrades/122.sql +swh/storage/sql/upgrades/123.sql +swh/storage/sql/upgrades/124.sql +swh/storage/sql/upgrades/125.sql +swh/storage/sql/upgrades/126.sql +swh/storage/sql/upgrades/127.sql +swh/storage/sql/upgrades/128.sql +swh/storage/sql/upgrades/129.sql +swh/storage/sql/upgrades/130.sql +swh/storage/sql/upgrades/131.sql +swh/storage/sql/upgrades/132.sql +swh/storage/sql/upgrades/133.sql +swh/storage/sql/upgrades/134.sql +swh/storage/sql/upgrades/135.sql +swh/storage/sql/upgrades/136.sql +swh/storage/sql/upgrades/137.sql +swh/storage/sql/upgrades/138.sql +swh/storage/sql/upgrades/139.sql +swh/storage/sql/upgrades/140.sql +swh/storage/sql/upgrades/141.sql +swh/storage/sql/upgrades/142.sql +swh/storage/sql/upgrades/143.sql +swh/storage/sql/upgrades/144.sql +swh/storage/sql/upgrades/145.sql +swh/storage/sql/upgrades/146.sql +swh/storage/sql/upgrades/147.sql +swh/storage/sql/upgrades/148.sql +swh/storage/sql/upgrades/149.sql +swh/storage/sql/upgrades/150.sql +swh/storage/sql/upgrades/151.sql +swh/storage/sql/upgrades/152.sql +swh/storage/sql/upgrades/153.sql +swh/storage/sql/upgrades/154.sql +swh/storage/sql/upgrades/155.sql +swh/storage/sql/upgrades/156.sql +swh/storage/sql/upgrades/157.sql +swh/storage/sql/upgrades/158.sql +swh/storage/sql/upgrades/159.sql +swh/storage/sql/upgrades/160.sql +swh/storage/sql/upgrades/161.sql +swh/storage/sql/upgrades/162.sql +swh/storage/sql/upgrades/163.sql +swh/storage/sql/upgrades/164.sql +swh/storage/sql/upgrades/165.sql +swh/storage/sql/upgrades/166.sql +swh/storage/sql/upgrades/167.sql +swh/storage/sql/upgrades/168.sql +swh/storage/sql/upgrades/169.sql +swh/storage/sql/upgrades/170.sql +swh/storage/sql/upgrades/171.sql +swh/storage/sql/upgrades/172.sql +swh/storage/sql/upgrades/173.sql +swh/storage/sql/upgrades/174.sql +swh/storage/sql/upgrades/175.sql +swh/storage/sql/upgrades/176.sql +swh/storage/sql/upgrades/177.sql +swh/storage/sql/upgrades/178.sql +swh/storage/sql/upgrades/179.sql +swh/storage/sql/upgrades/180.sql +swh/storage/sql/upgrades/181.sql +swh/storage/sql/upgrades/182.sql 
swh/storage/tests/__init__.py swh/storage/tests/conftest.py swh/storage/tests/storage_data.py swh/storage/tests/storage_tests.py swh/storage/tests/test_api_client.py swh/storage/tests/test_backfill.py swh/storage/tests/test_buffer.py swh/storage/tests/test_cassandra.py swh/storage/tests/test_cassandra_converters.py swh/storage/tests/test_cassandra_migration.py swh/storage/tests/test_cli.py swh/storage/tests/test_counter.py swh/storage/tests/test_exception.py swh/storage/tests/test_filter.py swh/storage/tests/test_in_memory.py swh/storage/tests/test_init.py swh/storage/tests/test_kafka_writer.py swh/storage/tests/test_metrics.py swh/storage/tests/test_postgresql.py swh/storage/tests/test_postgresql_converters.py swh/storage/tests/test_pytest_plugin.py swh/storage/tests/test_replay.py swh/storage/tests/test_retry.py swh/storage/tests/test_revision_bw_compat.py swh/storage/tests/test_serializers.py swh/storage/tests/test_server.py swh/storage/tests/test_storage_data.py swh/storage/tests/test_tenacious.py swh/storage/tests/test_utils.py swh/storage/tests/test_validate.py swh/storage/tests/algos/__init__.py swh/storage/tests/algos/test_diff.py swh/storage/tests/algos/test_dir_iterator.py swh/storage/tests/algos/test_origin.py swh/storage/tests/algos/test_revisions_walker.py swh/storage/tests/algos/test_snapshot.py swh/storage/tests/data/storage.yml swh/storage/tests/migrate_extrinsic_metadata/test_cran.py swh/storage/tests/migrate_extrinsic_metadata/test_debian.py swh/storage/tests/migrate_extrinsic_metadata/test_deposit.py swh/storage/tests/migrate_extrinsic_metadata/test_gnu.py swh/storage/tests/migrate_extrinsic_metadata/test_nixguix.py swh/storage/tests/migrate_extrinsic_metadata/test_npm.py swh/storage/tests/migrate_extrinsic_metadata/test_pypi.py \ No newline at end of file diff --git a/swh.storage.egg-info/entry_points.txt b/swh.storage.egg-info/entry_points.txt index d3369c8e..ae4d3eec 100644 --- a/swh.storage.egg-info/entry_points.txt +++ b/swh.storage.egg-info/entry_points.txt @@ -1,4 +1,2 @@ - - [swh.cli.subcommands] - storage=swh.storage.cli - \ No newline at end of file +[swh.cli.subcommands] +storage = swh.storage.cli diff --git a/swh.storage.egg-info/requires.txt b/swh.storage.egg-info/requires.txt index 75004d1a..262af7e1 100644 --- a/swh.storage.egg-info/requires.txt +++ b/swh.storage.egg-info/requires.txt @@ -1,33 +1,34 @@ aiohttp cassandra-driver!=3.21.0,>=3.19.0 click deprecated flask iso8601 mypy_extensions psycopg2 redis tenacity>=6.2 typing-extensions -swh.core[db,http]>=0.14.0 +swh.core[db,http]>=2 swh.counters>=v0.8.0 swh.model>=4.4.0 swh.objstorage>=0.2.2 [journal] swh.journal>=0.9 [testing] hypothesis>=3.11.0 pytest<7.0.0 pytest-mock swh.model[testing]>=0.0.50 pytz pytest-redis pytest-xdist types-python-dateutil types-pytz types-pyyaml types-redis types-requests +types-toml swh.journal>=0.9 diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py index cd342de8..cdbe7c49 100644 --- a/swh/storage/__init__.py +++ b/swh/storage/__init__.py @@ -1,122 +1,125 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import importlib from typing import TYPE_CHECKING, Any, Dict, List import warnings if TYPE_CHECKING: from .interface import StorageInterface STORAGE_IMPLEMENTATIONS = { "remote": 
".api.client.RemoteStorage", "memory": ".in_memory.InMemoryStorage", "cassandra": ".cassandra.CassandraStorage", "postgresql": ".postgresql.storage.Storage", # deprecated "local": ".postgresql.storage.Storage", # proxy storages "buffer": ".proxies.buffer.BufferingProxyStorage", "counter": ".proxies.counter.CountingProxyStorage", "filter": ".proxies.filter.FilteringProxyStorage", "retry": ".proxies.retry.RetryingProxyStorage", "tenacious": ".proxies.tenacious.TenaciousProxyStorage", "validate": ".proxies.validate.ValidatingProxyStorage", } def get_storage(cls: str, **kwargs) -> "StorageInterface": """Get a storage object of class `storage_class` with arguments `storage_args`. Args: cls (str): storage's class, can be: - ``local`` to use a postgresql database - ``cassandra`` to use a cassandra database - ``remote`` to connect to a swh-storage server - ``memory`` for an in-memory storage, useful for fast tests - ``filter``, ``buffer``, ... to use specific storage "proxies", see their respective documentations args (dict): dictionary with keys Returns: an instance of swh.storage.Storage or compatible class Raises: ValueError if passed an unknown storage class. """ if "args" in kwargs: warnings.warn( 'Explicit "args" key is deprecated, use keys directly instead.', DeprecationWarning, ) kwargs = kwargs["args"] if cls == "pipeline": return get_storage_pipeline(**kwargs) if cls == "local": warnings.warn( 'The "local" storage class is deprecated, use "postgresql" instead.', DeprecationWarning, ) class_path = STORAGE_IMPLEMENTATIONS.get(cls) if class_path is None: raise ValueError( "Unknown storage class `%s`. Supported: %s" % (cls, ", ".join(STORAGE_IMPLEMENTATIONS)) ) (module_path, class_name) = class_path.rsplit(".", 1) module = importlib.import_module(module_path, package=__package__) Storage = getattr(module, class_name) check_config = kwargs.pop("check_config", {}) storage = Storage(**kwargs) if check_config: if not storage.check_config(**check_config): raise EnvironmentError("storage check config failed") return storage +get_datastore = get_storage + + def get_storage_pipeline( steps: List[Dict[str, Any]], check_config=None ) -> "StorageInterface": """Recursively get a storage object that may use other storage objects as backends. Args: steps (List[dict]): List of dicts that may be used as kwargs for `get_storage`. Returns: an instance of swh.storage.Storage or compatible class Raises: ValueError if passed an unknown storage class. 
""" storage_config = None for step in reversed(steps): if "args" in step: warnings.warn( 'Explicit "args" key is deprecated, use keys directly ' "instead.", DeprecationWarning, ) step = { "cls": step["cls"], **step["args"], } if storage_config: step["storage"] = storage_config step["check_config"] = check_config storage_config = step if storage_config is None: raise ValueError("'pipeline' has no steps.") return get_storage(**storage_config) diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py index 40797931..25035684 100644 --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -1,1625 +1,1630 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from collections import defaultdict import contextlib from contextlib import contextmanager import datetime import itertools import operator from typing import Any, Counter, Dict, Iterable, List, Optional, Sequence, Tuple import attr import psycopg2 import psycopg2.errors import psycopg2.pool from swh.core.api.serializers import msgpack_dumps, msgpack_loads from swh.core.db.common import db_transaction, db_transaction_generator from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex from swh.model.model import ( SHA1_SIZE, Content, Directory, DirectoryEntry, ExtID, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, Sha1, Sha1Git, SkippedContent, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.exc import HashCollision, StorageArgumentException, StorageDBError from swh.storage.interface import ( VISIT_STATUSES, ListOrder, PagedResult, PartialBranches, ) from swh.storage.metrics import process_metrics, send_metric, timed from swh.storage.objstorage import ObjStorage from swh.storage.utils import ( extract_collision_hash, get_partition_bounds_bytes, map_optional, now, ) from swh.storage.writer import JournalWriter from . import converters from .db import Db # Max block size of contents to return BULK_BLOCK_CONTENT_LEN_MAX = 10000 EMPTY_SNAPSHOT_ID = hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e") """Identifier for the empty snapshot""" VALIDATION_EXCEPTIONS = ( KeyError, TypeError, ValueError, psycopg2.errors.CheckViolation, psycopg2.errors.IntegrityError, psycopg2.errors.InvalidTextRepresentation, psycopg2.errors.NotNullViolation, psycopg2.errors.NumericValueOutOfRange, psycopg2.errors.UndefinedFunction, # (raised on wrong argument typs) ) """Exceptions raised by postgresql when validation of the arguments failed.""" @contextlib.contextmanager def convert_validation_exceptions(): """Catches postgresql errors related to invalid arguments, and re-raises a StorageArgumentException.""" try: yield except psycopg2.errors.UniqueViolation: # This only happens because of concurrent insertions, but it is # a subclass of IntegrityError; so we need to catch and reraise it # before the next clause converts it to StorageArgumentException. 
raise except VALIDATION_EXCEPTIONS as e: raise StorageArgumentException(str(e)) class Storage: """SWH storage proxy, encompassing DB and object storage """ def __init__( self, db, objstorage, min_pool_conns=1, max_pool_conns=10, journal_writer=None ): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection obj_root: path to the root of the object storage """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) # See comment below self._db.cursor().execute("SET TIME ZONE 'UTC'") else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) self.journal_writer = JournalWriter(journal_writer) self.objstorage = ObjStorage(objstorage) def get_db(self): if self._db: return self._db else: db = Db.from_pool(self._pool) # Workaround for psycopg2 < 2.9.0 not handling fractional timezones, # which may happen on old revision/release dates on systems configured # with non-UTC timezones. # https://www.psycopg.org/docs/usage.html#time-zones-handling db.cursor().execute("SET TIME ZONE 'UTC'") return db def put_db(self, db): if db is not self._db: db.put_conn() @contextmanager def db(self): db = None try: db = self.get_db() yield db finally: if db: self.put_db(db) @timed @db_transaction() def check_config(self, *, check_write: bool, db: Db, cur=None) -> bool: if not self.objstorage.check_config(check_write=check_write): return False if not db.check_dbversion(): return False # Check permissions on one of the tables if check_write: check = "INSERT" else: check = "SELECT" cur.execute("select has_table_privilege(current_user, 'content', %s)", (check,)) return cur.fetchone()[0] + @db_transaction() + def get_current_version(self, *, db: Db, cur=None): + """Returns the current code (expected) version""" + return db.current_version + def _content_unique_key(self, hash, db): """Given a hash (tuple or dict), return a unique key from the aggregation of keys. """ keys = db.content_hash_keys if isinstance(hash, tuple): return hash return tuple([hash[k] for k in keys]) def _content_add_metadata(self, db, cur, content): """Add content to the postgresql database but not the object storage. """ # create temporary table for metadata injection db.mktemp("content", cur) db.copy_to( (c.to_dict() for c in content), "tmp_content", db.content_add_keys, cur ) # move metadata in place try: db.content_add_from_temp(cur) except psycopg2.IntegrityError as e: if e.diag.sqlstate == "23505" and e.diag.table_name == "content": message_detail = e.diag.message_detail if message_detail: hash_name, hash_id = extract_collision_hash(message_detail) collision_contents_hashes = [ c.hashes() for c in content if c.get_hash(hash_name) == hash_id ] else: constraint_to_hash_name = { "content_pkey": "sha1", "content_sha1_git_idx": "sha1_git", "content_sha256_idx": "sha256", } hash_name = constraint_to_hash_name.get(e.diag.constraint_name) hash_id = None collision_contents_hashes = None raise HashCollision( hash_name, hash_id, collision_contents_hashes ) from None else: raise @timed @process_metrics def content_add(self, content: List[Content]) -> Dict[str, int]: ctime = now() contents = [attr.evolve(c, ctime=ctime) for c in content] # Must add to the objstorage before the DB and journal. Otherwise: # 1. in case of a crash the DB may "believe" we have the content, but # we didn't have time to write to the objstorage before the crash # 2. 
the objstorage mirroring, which reads from the journal, may attempt to # read from the objstorage before we finished writing it objstorage_summary = self.objstorage.content_add(contents) with self.db() as db: with db.transaction() as cur: missing = list( self.content_missing( map(Content.to_dict, contents), key_hash="sha1_git", db=db, cur=cur, ) ) contents = [c for c in contents if c.sha1_git in missing] self.journal_writer.content_add(contents) self._content_add_metadata(db, cur, contents) return { "content:add": len(contents), "content:add:bytes": objstorage_summary["content:add:bytes"], } @timed @db_transaction() def content_update( self, contents: List[Dict[str, Any]], keys: List[str] = [], *, db: Db, cur=None ) -> None: # TODO: Add a check on input keys. How to properly implement # this? We don't know yet the new columns. self.journal_writer.content_update(contents) db.mktemp("content", cur) select_keys = list(set(db.content_get_metadata_keys).union(set(keys))) with convert_validation_exceptions(): db.copy_to(contents, "tmp_content", select_keys, cur) db.content_update_from_temp(keys_to_update=keys, cur=cur) @timed @process_metrics @db_transaction() def content_add_metadata( self, content: List[Content], *, db: Db, cur=None ) -> Dict[str, int]: missing = self.content_missing( (c.to_dict() for c in content), key_hash="sha1_git", db=db, cur=cur, ) contents = [c for c in content if c.sha1_git in missing] self.journal_writer.content_add_metadata(contents) self._content_add_metadata(db, cur, contents) return { "content:add": len(contents), } @timed def content_get_data(self, content: Sha1) -> Optional[bytes]: # FIXME: Make this method support slicing the `data` return self.objstorage.content_get(content) @timed @db_transaction() def content_get_partition( self, partition_id: int, nb_partitions: int, page_token: Optional[str] = None, limit: int = 1000, *, db: Db, cur=None, ) -> PagedResult[Content]: if limit is None: raise StorageArgumentException("limit should not be None") (start, end) = get_partition_bounds_bytes( partition_id, nb_partitions, SHA1_SIZE ) if page_token: start = hash_to_bytes(page_token) if end is None: end = b"\xff" * SHA1_SIZE next_page_token: Optional[str] = None contents = [] for counter, row in enumerate(db.content_get_range(start, end, limit + 1, cur)): row_d = dict(zip(db.content_get_metadata_keys, row)) content = Content(**row_d) if counter >= limit: # take the last content for the next page starting from this next_page_token = hash_to_hex(content.sha1) break contents.append(content) assert len(contents) <= limit return PagedResult(results=contents, next_page_token=next_page_token) @timed @db_transaction(statement_timeout=500) def content_get( self, contents: List[bytes], algo: str = "sha1", *, db: Db, cur=None ) -> List[Optional[Content]]: contents_by_hash: Dict[bytes, Optional[Content]] = {} if algo not in DEFAULT_ALGORITHMS: raise StorageArgumentException( "algo should be one of {','.join(DEFAULT_ALGORITHMS)}" ) rows = db.content_get_metadata_from_hashes(contents, algo, cur) key = operator.attrgetter(algo) for row in rows: row_d = dict(zip(db.content_get_metadata_keys, row)) content = Content(**row_d) contents_by_hash[key(content)] = content return [contents_by_hash.get(sha1) for sha1 in contents] @timed @db_transaction_generator() def content_missing( self, contents: List[Dict[str, Any]], key_hash: str = "sha1", *, db: Db, cur=None, ) -> Iterable[bytes]: if key_hash not in DEFAULT_ALGORITHMS: raise StorageArgumentException( "key_hash should be one of 
{','.join(DEFAULT_ALGORITHMS)}" ) keys = db.content_hash_keys key_hash_idx = keys.index(key_hash) for obj in db.content_missing_from_list(contents, cur): yield obj[key_hash_idx] @timed @db_transaction_generator() def content_missing_per_sha1( self, contents: List[bytes], *, db: Db, cur=None ) -> Iterable[bytes]: for obj in db.content_missing_per_sha1(contents, cur): yield obj[0] @timed @db_transaction_generator() def content_missing_per_sha1_git( self, contents: List[bytes], *, db: Db, cur=None ) -> Iterable[Sha1Git]: for obj in db.content_missing_per_sha1_git(contents, cur): yield obj[0] @timed @db_transaction() def content_find( self, content: Dict[str, Any], *, db: Db, cur=None ) -> List[Content]: if not set(content).intersection(DEFAULT_ALGORITHMS): raise StorageArgumentException( "content keys must contain at least one " f"of: {', '.join(sorted(DEFAULT_ALGORITHMS))}" ) rows = db.content_find( sha1=content.get("sha1"), sha1_git=content.get("sha1_git"), sha256=content.get("sha256"), blake2s256=content.get("blake2s256"), cur=cur, ) contents = [] for row in rows: row_d = dict(zip(db.content_find_cols, row)) contents.append(Content(**row_d)) return contents @timed @db_transaction() def content_get_random(self, *, db: Db, cur=None) -> Sha1Git: return db.content_get_random(cur) @staticmethod def _skipped_content_normalize(d): d = d.copy() if d.get("status") is None: d["status"] = "absent" if d.get("length") is None: d["length"] = -1 return d def _skipped_content_add_metadata(self, db, cur, content: List[SkippedContent]): origin_ids = db.origin_id_get_by_url([cont.origin for cont in content], cur=cur) content = [ attr.evolve(c, origin=origin_id) for (c, origin_id) in zip(content, origin_ids) ] db.mktemp("skipped_content", cur) db.copy_to( [c.to_dict() for c in content], "tmp_skipped_content", db.skipped_content_keys, cur, ) # move metadata in place db.skipped_content_add_from_temp(cur) @timed @process_metrics @db_transaction() def skipped_content_add( self, content: List[SkippedContent], *, db: Db, cur=None ) -> Dict[str, int]: ctime = now() content = [attr.evolve(c, ctime=ctime) for c in content] missing_contents = self.skipped_content_missing( (c.to_dict() for c in content), db=db, cur=cur, ) content = [ c for c in content if any( all( c.get_hash(algo) == missing_content.get(algo) for algo in DEFAULT_ALGORITHMS ) for missing_content in missing_contents ) ] self.journal_writer.skipped_content_add(content) self._skipped_content_add_metadata(db, cur, content) return { "skipped_content:add": len(content), } @timed @db_transaction_generator() def skipped_content_missing( self, contents: List[Dict[str, Any]], *, db: Db, cur=None ) -> Iterable[Dict[str, Any]]: contents = list(contents) for content in db.skipped_content_missing(contents, cur): yield dict(zip(db.content_hash_keys, content)) @timed @process_metrics @db_transaction() def directory_add( self, directories: List[Directory], *, db: Db, cur=None ) -> Dict[str, int]: summary = {"directory:add": 0} dirs = set() dir_entries: Dict[str, defaultdict] = { "file": defaultdict(list), "dir": defaultdict(list), "rev": defaultdict(list), } for cur_dir in directories: dir_id = cur_dir.id dirs.add(dir_id) for src_entry in cur_dir.entries: entry = src_entry.to_dict() entry["dir_id"] = dir_id dir_entries[entry["type"]][dir_id].append(entry) dirs_missing = set(self.directory_missing(dirs, db=db, cur=cur)) if not dirs_missing: return summary self.journal_writer.directory_add( dir_ for dir_ in directories if dir_.id in dirs_missing ) # Copy directory 
metadata dirs_missing_dict = ( {"id": dir_.id, "raw_manifest": dir_.raw_manifest} for dir_ in directories if dir_.id in dirs_missing ) db.mktemp("directory", cur) db.copy_to(dirs_missing_dict, "tmp_directory", ["id", "raw_manifest"], cur) # Copy entries for entry_type, entry_list in dir_entries.items(): entries = itertools.chain.from_iterable( entries_for_dir for dir_id, entries_for_dir in entry_list.items() if dir_id in dirs_missing ) db.mktemp_dir_entry(entry_type) db.copy_to( entries, "tmp_directory_entry_%s" % entry_type, ["target", "name", "perms", "dir_id"], cur, ) # Do the final copy db.directory_add_from_temp(cur) summary["directory:add"] = len(dirs_missing) return summary @timed @db_transaction_generator() def directory_missing( self, directories: List[Sha1Git], *, db: Db, cur=None ) -> Iterable[Sha1Git]: for obj in db.directory_missing_from_list(directories, cur): yield obj[0] @timed @db_transaction_generator(statement_timeout=20000) def directory_ls( self, directory: Sha1Git, recursive: bool = False, *, db: Db, cur=None ) -> Iterable[Dict[str, Any]]: if recursive: res_gen = db.directory_walk(directory, cur=cur) else: res_gen = db.directory_walk_one(directory, cur=cur) for line in res_gen: yield dict(zip(db.directory_ls_cols, line)) @timed @db_transaction(statement_timeout=2000) def directory_entry_get_by_path( self, directory: Sha1Git, paths: List[bytes], *, db: Db, cur=None ) -> Optional[Dict[str, Any]]: res = db.directory_entry_get_by_path(directory, paths, cur) return dict(zip(db.directory_ls_cols, res)) if res else None @timed @db_transaction() def directory_get_random(self, *, db: Db, cur=None) -> Sha1Git: return db.directory_get_random(cur) @db_transaction() def directory_get_entries( self, directory_id: Sha1Git, page_token: Optional[bytes] = None, limit: int = 1000, *, db: Db, cur=None, ) -> Optional[PagedResult[DirectoryEntry]]: if list(self.directory_missing([directory_id], db=db, cur=cur)): return None if page_token is not None: raise StorageArgumentException("Unsupported page token") # TODO: actually paginate rows = db.directory_get_entries(directory_id, cur=cur) return PagedResult( results=[ DirectoryEntry(**dict(zip(db.directory_get_entries_cols, row))) for row in rows ], next_page_token=None, ) @timed @db_transaction() def directory_get_raw_manifest( self, directory_ids: List[Sha1Git], *, db: Db, cur=None ) -> Dict[Sha1Git, Optional[bytes]]: return dict(db.directory_get_raw_manifest(directory_ids, cur=cur)) @timed @process_metrics @db_transaction() def revision_add( self, revisions: List[Revision], *, db: Db, cur=None ) -> Dict[str, int]: summary = {"revision:add": 0} revisions_missing = set( self.revision_missing( set(revision.id for revision in revisions), db=db, cur=cur ) ) if not revisions_missing: return summary db.mktemp_revision(cur) revisions_filtered = [ revision for revision in revisions if revision.id in revisions_missing ] self.journal_writer.revision_add(revisions_filtered) db_revisions_filtered = list(map(converters.revision_to_db, revisions_filtered)) parents_filtered: List[Dict[str, Any]] = [] with convert_validation_exceptions(): db.copy_to( db_revisions_filtered, "tmp_revision", db.revision_add_cols, cur, lambda rev: parents_filtered.extend(rev["parents"]), ) db.revision_add_from_temp(cur) db.copy_to( parents_filtered, "revision_history", ["id", "parent_id", "parent_rank"], cur, ) return {"revision:add": len(revisions_missing)} @timed @db_transaction_generator() def revision_missing( self, revisions: List[Sha1Git], *, db: Db, cur=None ) -> 
Iterable[Sha1Git]: if not revisions: return None for obj in db.revision_missing_from_list(revisions, cur): yield obj[0] @timed @db_transaction(statement_timeout=1000) def revision_get( self, revision_ids: List[Sha1Git], ignore_displayname: bool = False, *, db: Db, cur=None, ) -> List[Optional[Revision]]: revisions = [] for line in db.revision_get_from_list(revision_ids, ignore_displayname, cur): revision = converters.db_to_revision(dict(zip(db.revision_get_cols, line))) revisions.append(revision) return revisions @timed @db_transaction_generator(statement_timeout=2000) def revision_log( self, revisions: List[Sha1Git], ignore_displayname: bool = False, limit: Optional[int] = None, *, db: Db, cur=None, ) -> Iterable[Optional[Dict[str, Any]]]: for line in db.revision_log( revisions, ignore_displayname=ignore_displayname, limit=limit, cur=cur ): data = converters.db_to_revision(dict(zip(db.revision_get_cols, line))) if not data: yield None continue yield data.to_dict() @timed @db_transaction_generator(statement_timeout=2000) def revision_shortlog( self, revisions: List[Sha1Git], limit: Optional[int] = None, *, db: Db, cur=None ) -> Iterable[Optional[Tuple[Sha1Git, Tuple[Sha1Git, ...]]]]: yield from db.revision_shortlog(revisions, limit, cur) @timed @db_transaction() def revision_get_random(self, *, db: Db, cur=None) -> Sha1Git: return db.revision_get_random(cur) @timed @db_transaction() def extid_get_from_extid( self, id_type: str, ids: List[bytes], version: Optional[int] = None, *, db: Db, cur=None, ) -> List[ExtID]: extids = [] for row in db.extid_get_from_extid_list(id_type, ids, version=version, cur=cur): if row[0] is not None: extids.append(converters.db_to_extid(dict(zip(db.extid_cols, row)))) return extids @timed @db_transaction() def extid_get_from_target( self, target_type: ObjectType, ids: List[Sha1Git], extid_type: Optional[str] = None, extid_version: Optional[int] = None, *, db: Db, cur=None, ) -> List[ExtID]: extids = [] if (extid_version is not None and extid_type is None) or ( extid_version is None and extid_type is not None ): raise ValueError("You must provide both extid_type and extid_version") for row in db.extid_get_from_swhid_list( target_type.value, ids, extid_version=extid_version, extid_type=extid_type, cur=cur, ): if row[0] is not None: extids.append(converters.db_to_extid(dict(zip(db.extid_cols, row)))) return extids @timed @db_transaction() def extid_add(self, ids: List[ExtID], *, db: Db, cur=None) -> Dict[str, int]: extid = [ { "extid": extid.extid, "extid_type": extid.extid_type, "extid_version": getattr(extid, "extid_version", 0), "target": extid.target.object_id, "target_type": extid.target.object_type.name.lower(), # arghh } for extid in ids ] db.mktemp("extid", cur) self.journal_writer.extid_add(ids) db.copy_to(extid, "tmp_extid", db.extid_cols, cur) # move metadata in place db.extid_add_from_temp(cur) return {"extid:add": len(extid)} @timed @process_metrics @db_transaction() def release_add( self, releases: List[Release], *, db: Db, cur=None ) -> Dict[str, int]: summary = {"release:add": 0} release_ids = set(release.id for release in releases) releases_missing = set(self.release_missing(release_ids, db=db, cur=cur)) if not releases_missing: return summary db.mktemp_release(cur) releases_filtered = [ release for release in releases if release.id in releases_missing ] self.journal_writer.release_add(releases_filtered) db_releases_filtered = list(map(converters.release_to_db, releases_filtered)) with convert_validation_exceptions(): 
db.copy_to(db_releases_filtered, "tmp_release", db.release_add_cols, cur) db.release_add_from_temp(cur) return {"release:add": len(releases_missing)} @timed @db_transaction_generator() def release_missing( self, releases: List[Sha1Git], *, db: Db, cur=None ) -> Iterable[Sha1Git]: if not releases: return for obj in db.release_missing_from_list(releases, cur): yield obj[0] @timed @db_transaction(statement_timeout=500) def release_get( self, releases: List[Sha1Git], ignore_displayname: bool = False, *, db: Db, cur=None, ) -> List[Optional[Release]]: rels = [] for release in db.release_get_from_list(releases, ignore_displayname, cur): data = converters.db_to_release(dict(zip(db.release_get_cols, release))) rels.append(data if data else None) return rels @timed @db_transaction() def release_get_random(self, *, db: Db, cur=None) -> Sha1Git: return db.release_get_random(cur) @timed @process_metrics @db_transaction() def snapshot_add( self, snapshots: List[Snapshot], *, db: Db, cur=None ) -> Dict[str, int]: created_temp_table = False count = 0 for snapshot in snapshots: if not db.snapshot_exists(snapshot.id, cur): if not created_temp_table: db.mktemp_snapshot_branch(cur) created_temp_table = True with convert_validation_exceptions(): db.copy_to( ( { "name": name, "target": info.target if info else None, "target_type": ( info.target_type.value if info else None ), } for name, info in snapshot.branches.items() ), "tmp_snapshot_branch", ["name", "target", "target_type"], cur, ) self.journal_writer.snapshot_add([snapshot]) db.snapshot_add(snapshot.id, cur) count += 1 return {"snapshot:add": count} @timed @db_transaction_generator() def snapshot_missing( self, snapshots: List[Sha1Git], *, db: Db, cur=None ) -> Iterable[Sha1Git]: for obj in db.snapshot_missing_from_list(snapshots, cur): yield obj[0] @timed @db_transaction(statement_timeout=2000) def snapshot_get( self, snapshot_id: Sha1Git, *, db: Db, cur=None ) -> Optional[Dict[str, Any]]: d = self.snapshot_get_branches(snapshot_id) if d is None: return d return { "id": d["id"], "branches": { name: branch.to_dict() if branch else None for (name, branch) in d["branches"].items() }, "next_branch": d["next_branch"], } @timed @db_transaction(statement_timeout=2000) def snapshot_count_branches( self, snapshot_id: Sha1Git, branch_name_exclude_prefix: Optional[bytes] = None, *, db: Db, cur=None, ) -> Optional[Dict[Optional[str], int]]: return dict( [ bc for bc in db.snapshot_count_branches( snapshot_id, branch_name_exclude_prefix, cur, ) ] ) @timed @db_transaction(statement_timeout=2000) def snapshot_get_branches( self, snapshot_id: Sha1Git, branches_from: bytes = b"", branches_count: int = 1000, target_types: Optional[List[str]] = None, branch_name_include_substring: Optional[bytes] = None, branch_name_exclude_prefix: Optional[bytes] = None, *, db: Db, cur=None, ) -> Optional[PartialBranches]: if snapshot_id == EMPTY_SNAPSHOT_ID: return PartialBranches(id=snapshot_id, branches={}, next_branch=None,) if list(self.snapshot_missing([snapshot_id])): return None branches = {} next_branch = None fetched_branches = list( db.snapshot_get_by_id( snapshot_id, branches_from=branches_from, # the underlying SQL query can be quite expensive to execute for small # branches_count value, so we ensure a minimum branches limit of 10 for # optimal performances branches_count=max(branches_count + 1, 10), target_types=target_types, branch_name_include_substring=branch_name_include_substring, branch_name_exclude_prefix=branch_name_exclude_prefix, cur=cur, ) ) for row in 
fetched_branches[:branches_count]: branch_d = dict(zip(db.snapshot_get_cols, row)) del branch_d["snapshot_id"] name = branch_d.pop("name") if branch_d["target"] is None and branch_d["target_type"] is None: branch = None else: assert branch_d["target_type"] is not None branch = SnapshotBranch( target=branch_d["target"], target_type=TargetType(branch_d["target_type"]), ) branches[name] = branch if len(fetched_branches) > branches_count: next_branch = dict( zip(db.snapshot_get_cols, fetched_branches[branches_count]) )["name"] return PartialBranches( id=snapshot_id, branches=branches, next_branch=next_branch, ) @timed @db_transaction() def snapshot_get_random(self, *, db: Db, cur=None) -> Sha1Git: return db.snapshot_get_random(cur) @timed @db_transaction() def origin_visit_add( self, visits: List[OriginVisit], *, db: Db, cur=None ) -> Iterable[OriginVisit]: for visit in visits: origin = self.origin_get([visit.origin], db=db, cur=cur)[0] if not origin: # Cannot add a visit without an origin raise StorageArgumentException("Unknown origin %s", visit.origin) all_visits = [] nb_visits = 0 for visit in visits: nb_visits += 1 if not visit.visit: with convert_validation_exceptions(): visit_id = db.origin_visit_add( visit.origin, visit.date, visit.type, cur=cur ) visit = attr.evolve(visit, visit=visit_id) else: db.origin_visit_add_with_id(visit, cur=cur) assert visit.visit is not None all_visits.append(visit) # Forced to write after for the case when the visit has no id self.journal_writer.origin_visit_add([visit]) visit_status = OriginVisitStatus( origin=visit.origin, visit=visit.visit, date=visit.date, type=visit.type, status="created", snapshot=None, ) self._origin_visit_status_add(visit_status, db=db, cur=cur) send_metric("origin_visit:add", count=nb_visits, method_name="origin_visit") return all_visits def _origin_visit_status_add( self, visit_status: OriginVisitStatus, db, cur ) -> None: """Add an origin visit status""" self.journal_writer.origin_visit_status_add([visit_status]) db.origin_visit_status_add(visit_status, cur=cur) @timed @process_metrics @db_transaction() def origin_visit_status_add( self, visit_statuses: List[OriginVisitStatus], *, db: Db, cur=None, ) -> Dict[str, int]: visit_statuses_ = [] # First round to check existence (fail early if any is ko) for visit_status in visit_statuses: origin_url = self.origin_get([visit_status.origin], db=db, cur=cur)[0] if not origin_url: raise StorageArgumentException(f"Unknown origin {visit_status.origin}") if visit_status.type is None: origin_visit = self.origin_visit_get_by( visit_status.origin, visit_status.visit, db=db, cur=cur ) if origin_visit is None: raise StorageArgumentException( f"Unknown origin visit {visit_status.visit} " f"of origin {visit_status.origin}" ) origin_visit_status = attr.evolve(visit_status, type=origin_visit.type) else: origin_visit_status = visit_status visit_statuses_.append(origin_visit_status) for visit_status in visit_statuses_: self._origin_visit_status_add(visit_status, db, cur) return {"origin_visit_status:add": len(visit_statuses_)} @timed @db_transaction() def origin_visit_status_get_latest( self, origin_url: str, visit: int, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, *, db: Db, cur=None, ) -> Optional[OriginVisitStatus]: if allowed_statuses and not set(allowed_statuses).intersection(VISIT_STATUSES): raise StorageArgumentException( f"Unknown allowed statuses {','.join(allowed_statuses)}, only " f"{','.join(VISIT_STATUSES)} authorized" ) row_d = 
db.origin_visit_status_get_latest( origin_url, visit, allowed_statuses, require_snapshot, cur=cur ) if not row_d: return None return OriginVisitStatus(**row_d) @timed @db_transaction(statement_timeout=500) def origin_visit_get( self, origin: str, page_token: Optional[str] = None, order: ListOrder = ListOrder.ASC, limit: int = 10, *, db: Db, cur=None, ) -> PagedResult[OriginVisit]: page_token = page_token or "0" if not isinstance(order, ListOrder): raise StorageArgumentException("order must be a ListOrder value") if not isinstance(page_token, str): raise StorageArgumentException("page_token must be a string.") next_page_token = None visit_from = int(page_token) visits: List[OriginVisit] = [] extra_limit = limit + 1 for row in db.origin_visit_get_range( origin, visit_from=visit_from, order=order, limit=extra_limit, cur=cur ): row_d = dict(zip(db.origin_visit_cols, row)) visits.append( OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) ) assert len(visits) <= extra_limit if len(visits) == extra_limit: visits = visits[:limit] next_page_token = str(visits[-1].visit) return PagedResult(results=visits, next_page_token=next_page_token) @timed @db_transaction(statement_timeout=500) def origin_visit_find_by_date( self, origin: str, visit_date: datetime.datetime, *, db: Db, cur=None ) -> Optional[OriginVisit]: row_d = db.origin_visit_find_by_date(origin, visit_date, cur=cur) if not row_d: return None return OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) @timed @db_transaction(statement_timeout=500) def origin_visit_get_by( self, origin: str, visit: int, *, db: Db, cur=None ) -> Optional[OriginVisit]: row = db.origin_visit_get(origin, visit, cur) if row: row_d = dict(zip(db.origin_visit_get_cols, row)) return OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) return None @timed @db_transaction(statement_timeout=4000) def origin_visit_get_latest( self, origin: str, type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, *, db: Db, cur=None, ) -> Optional[OriginVisit]: if allowed_statuses and not set(allowed_statuses).intersection(VISIT_STATUSES): raise StorageArgumentException( f"Unknown allowed statuses {','.join(allowed_statuses)}, only " f"{','.join(VISIT_STATUSES)} authorized" ) row = db.origin_visit_get_latest( origin, type=type, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, cur=cur, ) if row: row_d = dict(zip(db.origin_visit_get_cols, row)) visit = OriginVisit( origin=row_d["origin"], visit=row_d["visit"], date=row_d["date"], type=row_d["type"], ) return visit return None @timed @db_transaction(statement_timeout=500) def origin_visit_status_get( self, origin: str, visit: int, page_token: Optional[str] = None, order: ListOrder = ListOrder.ASC, limit: int = 10, *, db: Db, cur=None, ) -> PagedResult[OriginVisitStatus]: next_page_token = None date_from = None if page_token is not None: date_from = datetime.datetime.fromisoformat(page_token) visit_statuses: List[OriginVisitStatus] = [] # Take one more visit status so we can reuse it as the next page token if any for row in db.origin_visit_status_get_range( origin, visit, date_from=date_from, order=order, limit=limit + 1, cur=cur, ): row_d = dict(zip(db.origin_visit_status_cols, row)) visit_statuses.append(OriginVisitStatus(**row_d)) if len(visit_statuses) > limit: # last visit status date is the next page token next_page_token 
= str(visit_statuses[-1].date) # excluding that visit status from the result to respect the limit size visit_statuses = visit_statuses[:limit] return PagedResult(results=visit_statuses, next_page_token=next_page_token) @timed @db_transaction() def origin_visit_status_get_random( self, type: str, *, db: Db, cur=None ) -> Optional[OriginVisitStatus]: row = db.origin_visit_get_random(type, cur) if row is not None: row_d = dict(zip(db.origin_visit_status_cols, row)) return OriginVisitStatus(**row_d) return None @timed @db_transaction(statement_timeout=2000) def object_find_by_sha1_git( self, ids: List[Sha1Git], *, db: Db, cur=None ) -> Dict[Sha1Git, List[Dict]]: ret: Dict[Sha1Git, List[Dict]] = {id: [] for id in ids} for retval in db.object_find_by_sha1_git(ids, cur=cur): if retval[1]: ret[retval[0]].append( dict(zip(db.object_find_by_sha1_git_cols, retval)) ) return ret @timed @db_transaction(statement_timeout=500) def origin_get( self, origins: List[str], *, db: Db, cur=None ) -> Iterable[Optional[Origin]]: rows = db.origin_get_by_url(origins, cur) result: List[Optional[Origin]] = [] for row in rows: origin_d = dict(zip(db.origin_cols, row)) url = origin_d["url"] result.append(None if url is None else Origin(url=url)) return result @timed @db_transaction(statement_timeout=500) def origin_get_by_sha1( self, sha1s: List[bytes], *, db: Db, cur=None ) -> List[Optional[Dict[str, Any]]]: return [ dict(zip(db.origin_cols, row)) if row[0] else None for row in db.origin_get_by_sha1(sha1s, cur) ] @timed @db_transaction_generator() def origin_get_range(self, origin_from=1, origin_count=100, *, db: Db, cur=None): for origin in db.origin_get_range(origin_from, origin_count, cur): yield dict(zip(db.origin_get_range_cols, origin)) @timed @db_transaction() def origin_list( self, page_token: Optional[str] = None, limit: int = 100, *, db: Db, cur=None ) -> PagedResult[Origin]: page_token = page_token or "0" if not isinstance(page_token, str): raise StorageArgumentException("page_token must be a string.") origin_from = int(page_token) next_page_token = None origins: List[Origin] = [] # Take one more origin so we can reuse it as the next page token if any for row_d in self.origin_get_range(origin_from, limit + 1, db=db, cur=cur): origins.append(Origin(url=row_d["url"])) # keep the last_id for the pagination if needed last_id = row_d["id"] if len(origins) > limit: # data left for subsequent call # last origin id is the next page token next_page_token = str(last_id) # excluding that origin from the result to respect the limit size origins = origins[:limit] assert len(origins) <= limit return PagedResult(results=origins, next_page_token=next_page_token) @timed @db_transaction() def origin_search( self, url_pattern: str, page_token: Optional[str] = None, limit: int = 50, regexp: bool = False, with_visit: bool = False, visit_types: Optional[List[str]] = None, *, db: Db, cur=None, ) -> PagedResult[Origin]: next_page_token = None offset = int(page_token) if page_token else 0 origins = [] # Take one more origin so we can reuse it as the next page token if any for origin in db.origin_search( url_pattern, offset, limit + 1, regexp, with_visit, visit_types, cur ): row_d = dict(zip(db.origin_cols, origin)) origins.append(Origin(url=row_d["url"])) if len(origins) > limit: # next offset next_page_token = str(offset + limit) # excluding that origin from the result to respect the limit size origins = origins[:limit] assert len(origins) <= limit return PagedResult(results=origins, next_page_token=next_page_token) @timed 
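The paginated reads above (origin_visit_get, origin_visit_status_get, origin_list, origin_search) all follow the same cursor pattern: they return a PagedResult and encode the resume point in next_page_token. A minimal client-side sketch of that loop, assuming an in-memory backend and a hypothetical origin URL (neither is taken from this patch):

```
import datetime

from swh.model.model import Origin, OriginVisit
from swh.storage import get_storage

storage = get_storage("memory")  # assumption: stand-in for a configured backend
url = "https://example.org/repo"  # hypothetical origin URL
storage.origin_add([Origin(url=url)])
storage.origin_visit_add(
    [OriginVisit(origin=url, date=datetime.datetime.now(tz=datetime.timezone.utc), type="git")]
)

page_token = None
while True:
    page = storage.origin_visit_get(url, page_token=page_token, limit=10)
    for visit in page.results:
        print(visit.visit, visit.date, visit.type)
    page_token = page.next_page_token
    if page_token is None:  # no more pages left
        break
```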
@db_transaction() def origin_count( self, url_pattern: str, regexp: bool = False, with_visit: bool = False, *, db: Db, cur=None, ) -> int: return db.origin_count(url_pattern, regexp, with_visit, cur) @timed @db_transaction() def origin_snapshot_get_all( self, origin_url: str, *, db: Db, cur=None ) -> List[Sha1Git]: return list(db.origin_snapshot_get_all(origin_url, cur)) @timed @process_metrics @db_transaction() def origin_add(self, origins: List[Origin], *, db: Db, cur=None) -> Dict[str, int]: urls = [o.url for o in origins] known_origins = set(url for (url,) in db.origin_get_by_url(urls, cur)) # keep only one occurrence of each given origin while keeping the list # sorted as originally given to_add = sorted(set(urls) - known_origins, key=urls.index) self.journal_writer.origin_add([Origin(url=url) for url in to_add]) added = 0 for url in to_add: if db.origin_add(url, cur): added += 1 return {"origin:add": added} @db_transaction(statement_timeout=500) def stat_counters(self, *, db: Db, cur=None): return {k: v for (k, v) in db.stat_counters()} @db_transaction() def refresh_stat_counters(self, *, db: Db, cur=None): keys = [ "content", "directory", "directory_entry_dir", "directory_entry_file", "directory_entry_rev", "origin", "origin_visit", "person", "release", "revision", "revision_history", "skipped_content", "snapshot", ] for key in keys: cur.execute("select * from swh_update_counter(%s)", (key,)) @timed @process_metrics @db_transaction() def raw_extrinsic_metadata_add( self, metadata: List[RawExtrinsicMetadata], db, cur, ) -> Dict[str, int]: metadata = list(metadata) self.journal_writer.raw_extrinsic_metadata_add(metadata) counter = Counter[ExtendedObjectType]() for metadata_entry in metadata: authority_id = self._get_authority_id(metadata_entry.authority, db, cur) fetcher_id = self._get_fetcher_id(metadata_entry.fetcher, db, cur) db.raw_extrinsic_metadata_add( id=metadata_entry.id, type=metadata_entry.target.object_type.name.lower(), target=str(metadata_entry.target), discovery_date=metadata_entry.discovery_date, authority_id=authority_id, fetcher_id=fetcher_id, format=metadata_entry.format, metadata=metadata_entry.metadata, origin=metadata_entry.origin, visit=metadata_entry.visit, snapshot=map_optional(str, metadata_entry.snapshot), release=map_optional(str, metadata_entry.release), revision=map_optional(str, metadata_entry.revision), path=metadata_entry.path, directory=map_optional(str, metadata_entry.directory), cur=cur, ) counter[metadata_entry.target.object_type] += 1 return { f"{type.value}_metadata:add": count for (type, count) in counter.items() } @db_transaction() def raw_extrinsic_metadata_get( self, target: ExtendedSWHID, authority: MetadataAuthority, after: Optional[datetime.datetime] = None, page_token: Optional[bytes] = None, limit: int = 1000, *, db: Db, cur=None, ) -> PagedResult[RawExtrinsicMetadata]: if page_token: (after_time, after_fetcher) = msgpack_loads(base64.b64decode(page_token)) if after and after_time < after: raise StorageArgumentException( "page_token is inconsistent with the value of 'after'." 
) else: after_time = after after_fetcher = None authority_id = self._get_authority_id(authority, db, cur) if not authority_id: return PagedResult(next_page_token=None, results=[],) rows = db.raw_extrinsic_metadata_get( str(target), authority_id, after_time, after_fetcher, limit + 1, cur, ) rows = [dict(zip(db.raw_extrinsic_metadata_get_cols, row)) for row in rows] results = [] for row in rows: assert str(target) == row["raw_extrinsic_metadata.target"] results.append(converters.db_to_raw_extrinsic_metadata(row)) if len(results) > limit: results.pop() assert len(results) == limit last_returned_row = rows[-2] # rows[-1] corresponds to the popped result next_page_token: Optional[str] = base64.b64encode( msgpack_dumps( ( last_returned_row["discovery_date"], last_returned_row["metadata_fetcher.id"], ) ) ).decode() else: next_page_token = None return PagedResult(next_page_token=next_page_token, results=results,) @db_transaction() def raw_extrinsic_metadata_get_by_ids( self, ids: List[Sha1Git], *, db: Db, cur=None, ) -> List[RawExtrinsicMetadata]: return [ converters.db_to_raw_extrinsic_metadata( dict(zip(db.raw_extrinsic_metadata_get_cols, row)) ) for row in db.raw_extrinsic_metadata_get_by_ids(ids) ] @db_transaction() def raw_extrinsic_metadata_get_authorities( self, target: ExtendedSWHID, *, db: Db, cur=None, ) -> List[MetadataAuthority]: return [ MetadataAuthority( type=MetadataAuthorityType(authority_type), url=authority_url ) for ( authority_type, authority_url, ) in db.raw_extrinsic_metadata_get_authorities(str(target), cur) ] @timed @process_metrics @db_transaction() def metadata_fetcher_add( self, fetchers: List[MetadataFetcher], *, db: Db, cur=None ) -> Dict[str, int]: fetchers = list(fetchers) self.journal_writer.metadata_fetcher_add(fetchers) count = 0 for fetcher in fetchers: db.metadata_fetcher_add(fetcher.name, fetcher.version, cur=cur) count += 1 return {"metadata_fetcher:add": count} @timed @db_transaction(statement_timeout=500) def metadata_fetcher_get( self, name: str, version: str, *, db: Db, cur=None ) -> Optional[MetadataFetcher]: row = db.metadata_fetcher_get(name, version, cur=cur) if not row: return None return MetadataFetcher.from_dict(dict(zip(db.metadata_fetcher_cols, row))) @timed @process_metrics @db_transaction() def metadata_authority_add( self, authorities: List[MetadataAuthority], *, db: Db, cur=None ) -> Dict[str, int]: authorities = list(authorities) self.journal_writer.metadata_authority_add(authorities) count = 0 for authority in authorities: db.metadata_authority_add(authority.type.value, authority.url, cur=cur) count += 1 return {"metadata_authority:add": count} @timed @db_transaction() def metadata_authority_get( self, type: MetadataAuthorityType, url: str, *, db: Db, cur=None ) -> Optional[MetadataAuthority]: row = db.metadata_authority_get(type.value, url, cur=cur) if not row: return None return MetadataAuthority.from_dict(dict(zip(db.metadata_authority_cols, row))) def clear_buffers(self, object_types: Sequence[str] = ()) -> None: """Do nothing """ return None def flush(self, object_types: Sequence[str] = ()) -> Dict[str, int]: return {} def _get_authority_id(self, authority: MetadataAuthority, db, cur): authority_id = db.metadata_authority_get_id( authority.type.value, authority.url, cur ) if not authority_id: raise StorageArgumentException(f"Unknown authority {authority}") return authority_id def _get_fetcher_id(self, fetcher: MetadataFetcher, db, cur): fetcher_id = db.metadata_fetcher_get_id(fetcher.name, fetcher.version, cur) if not fetcher_id: raise 
StorageArgumentException(f"Unknown fetcher {fetcher}") return fetcher_id diff --git a/swh/storage/pytest_plugin.py b/swh/storage/pytest_plugin.py index 604a3d90..26d14ee1 100644 --- a/swh/storage/pytest_plugin.py +++ b/swh/storage/pytest_plugin.py @@ -1,54 +1,64 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from os import environ, path +from functools import partial +from os import environ import pytest +from pytest_postgresql import factories -from swh.core.db.pytest_plugin import postgresql_fact -import swh.storage +from swh.core.db.pytest_plugin import initialize_database_for_module, postgresql_fact from swh.storage import get_storage +from swh.storage.postgresql.db import Db as StorageDb from swh.storage.tests.storage_data import StorageData -SQL_DIR = path.join(path.dirname(swh.storage.__file__), "sql") - environ["LC_ALL"] = "C.UTF-8" -swh_storage_postgresql = postgresql_fact( - "postgresql_proc", dbname="storage", dump_files=path.join(SQL_DIR, "*.sql") +swh_storage_postgresql_proc = factories.postgresql_proc( + dbname="storage", + load=[ + partial( + initialize_database_for_module, + modname="storage", + version=StorageDb.current_version, + ) + ], ) +swh_storage_postgresql = postgresql_fact("swh_storage_postgresql_proc") + + @pytest.fixture def swh_storage_backend_config(swh_storage_postgresql): """Basic pg storage configuration with no journal collaborator (to avoid pulling optional dependency on clients of this fixture) """ yield { "cls": "postgresql", "db": swh_storage_postgresql.dsn, "objstorage": {"cls": "memory"}, "check_config": {"check_write": True}, } @pytest.fixture def swh_storage(swh_storage_backend_config): return get_storage(**swh_storage_backend_config) @pytest.fixture def sample_data() -> StorageData: """Pre-defined sample storage object data to manipulate Returns: StorageData whose attribute keys are data model objects. Either multiple objects: contents, directories, revisions, releases, ... or simple ones: content, directory, revision, release, ... """ return StorageData() diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql index 3708a248..063f9dd1 100644 --- a/swh/storage/sql/30-schema.sql +++ b/swh/storage/sql/30-schema.sql @@ -1,532 +1,517 @@ --- --- SQL implementation of the Software Heritage data model --- --- schema versions -create table dbversion -( - version int primary key, - release timestamptz, - description text -); - -comment on table dbversion is 'Details of current db version'; -comment on column dbversion.version is 'SQL schema version'; -comment on column dbversion.release is 'Version deployment timestamp'; -comment on column dbversion.description is 'Release description'; - --- latest schema version -insert into dbversion(version, release, description) - values(182, now(), 'Work In Progress'); +-- schema versions table (dbversion) is now created by swh.core.db directly -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a Git-style salted SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); -- a SHA256 checksum create domain sha256 as bytea check (length(value) = 32); -- a blake2 checksum create domain blake2s256 as bytea check (length(value) = 32); -- UNIX path (absolute, relative, individual path component, etc.) 
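The reworked pytest plugin above now provisions the test database through pytest_postgresql factories and initialize_database_for_module instead of loading SQL dump files. A minimal sketch of how a dependent test module might consume the resulting fixtures; the pytest_plugins declaration and the test body are illustrative, not taken from the patch:

```
# conftest.py of a hypothetical test package
pytest_plugins = ["swh.storage.pytest_plugin"]


# test_origin.py (illustrative test body using the swh_storage and sample_data fixtures)
def test_origin_add(swh_storage, sample_data):
    origin = sample_data.origin
    assert swh_storage.origin_add([origin]) == {"origin:add": 1}
    assert swh_storage.origin_get([origin.url]) == [origin]
```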
create domain unix_path as bytea; -- a set of UNIX-like access permissions, as manipulated by, e.g., chmod create domain file_perms as int; -- an SWHID create domain swhid as text check (value ~ '^swh:[0-9]+:.*'); -- Checksums about actual file content. Note that the content itself is not -- stored in the DB, but on external (key-value) storage. A single checksum is -- used as key there, but the other can be used to verify that we do not inject -- content collisions not knowingly. create table content ( sha1 sha1 not null, sha1_git sha1_git not null, sha256 sha256 not null, blake2s256 blake2s256 not null, length bigint not null, ctime timestamptz not null default now(), -- creation time, i.e. time of (first) injection into the storage status content_status not null default 'visible', object_id bigserial ); comment on table content is 'Checksums of file content which is actually stored externally'; comment on column content.sha1 is 'Content sha1 hash'; comment on column content.sha1_git is 'Git object sha1 hash'; comment on column content.sha256 is 'Content Sha256 hash'; comment on column content.blake2s256 is 'Content blake2s hash'; comment on column content.length is 'Content length'; comment on column content.ctime is 'First seen time'; comment on column content.status is 'Content status (absent, visible, hidden)'; comment on column content.object_id is 'Content identifier'; -- An origin is a place, identified by an URL, where software source code -- artifacts can be found. We support different kinds of origins, e.g., git and -- other VCS repositories, web pages that list tarballs URLs (e.g., -- http://www.kernel.org), indirect tarball URLs (e.g., -- http://www.example.org/latest.tar.gz), etc. The key feature of an origin is -- that it can be *fetched* from (wget, git clone, svn checkout, etc.) to -- retrieve all the contained software. create table origin ( id bigserial not null, url text not null ); comment on column origin.id is 'Artifact origin id'; comment on column origin.url is 'URL of origin'; -- Content blobs observed somewhere, but not ingested into the archive for -- whatever reason. This table is separate from the content table as we might -- not have the sha1 checksum of skipped contents (for instance when we inject -- git repositories, objects that are too big will be skipped here, and we will -- only know their sha1_git). 'reason' contains the reason the content was -- skipped. origin is a nullable column allowing to find out which origin -- contains that skipped content. 
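As the comment on the content table notes, only the checksums and length live in this database; the bytes themselves sit in an external objstorage keyed by one of the hashes. A minimal sketch of that split, assuming an in-memory backend and an illustrative blob:

```
from swh.model.model import Content
from swh.storage import get_storage

storage = get_storage("memory")  # assumption: stand-in for a configured backend

content = Content.from_data(b"#!/bin/sh\necho hello\n")  # illustrative blob
print(storage.content_add([content]))        # e.g. {'content:add': 1, 'content:add:bytes': 21}
print(storage.content_get([content.sha1]))   # checksums, length and status only, no raw data
print(storage.content_find({"sha1_git": content.sha1_git}))  # lookup by any known hash
```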
create table skipped_content ( sha1 sha1, sha1_git sha1_git, sha256 sha256, blake2s256 blake2s256, length bigint not null, ctime timestamptz not null default now(), status content_status not null default 'absent', reason text not null, origin bigint, object_id bigserial ); comment on table skipped_content is 'Content blobs observed, but not ingested in the archive'; comment on column skipped_content.sha1 is 'Skipped content sha1 hash'; comment on column skipped_content.sha1_git is 'Git object sha1 hash'; comment on column skipped_content.sha256 is 'Skipped content sha256 hash'; comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash'; comment on column skipped_content.length is 'Skipped content length'; comment on column skipped_content.ctime is 'First seen time'; comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)'; comment on column skipped_content.reason is 'Reason for skipping'; comment on column skipped_content.origin is 'Origin table identifier'; comment on column skipped_content.object_id is 'Skipped content identifier'; -- A file-system directory. A directory is a list of directory entries (see -- tables: directory_entry_{dir,file}). -- -- To list the contents of a directory: -- 1. list the contained directory_entry_dir using array dir_entries -- 2. list the contained directory_entry_file using array file_entries -- 3. list the contained directory_entry_rev using array rev_entries -- 4. UNION -- -- Synonyms/mappings: -- * git: tree create table directory ( id sha1_git not null, dir_entries bigint[], -- sub-directories, reference directory_entry_dir file_entries bigint[], -- contained files, reference directory_entry_file rev_entries bigint[], -- mounted revisions, reference directory_entry_rev object_id bigserial, -- short object identifier raw_manifest bytea -- git manifest of the object, if it cannot be represented using only the other fields ); comment on table directory is 'Contents of a directory, synonymous to tree (git)'; comment on column directory.id is 'Git object sha1 hash'; comment on column directory.dir_entries is 'Sub-directories, reference directory_entry_dir'; comment on column directory.file_entries is 'Contained files, reference directory_entry_file'; comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev'; comment on column directory.object_id is 'Short object identifier'; comment on column directory.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; -- A directory entry pointing to a (sub-)directory. create table directory_entry_dir ( id bigserial, target sha1_git not null, -- id of target directory name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_dir is 'Directory entry for directory'; comment on column directory_entry_dir.id is 'Directory identifier'; comment on column directory_entry_dir.target is 'Target directory identifier'; comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; comment on column directory_entry_dir.perms is 'Unix-like permissions'; -- A directory entry pointing to a file content. 
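The four-step listing recipe in the directory comment above (walk dir_entries, file_entries and rev_entries, then take their UNION) is essentially what the directory_ls API exposes. A minimal sketch, assuming an in-memory backend and an illustrative one-file tree:

```
from swh.model.model import Content, Directory, DirectoryEntry
from swh.storage import get_storage

storage = get_storage("memory")  # assumption: stand-in for a configured backend

content = Content.from_data(b"hello world\n")  # illustrative file content
storage.content_add([content])

directory = Directory(
    entries=(
        DirectoryEntry(
            name=b"hello.txt", type="file", target=content.sha1_git, perms=0o100644
        ),
    )
)
storage.directory_add([directory])

for entry in storage.directory_ls(directory.id):
    print(entry["name"], entry["type"], entry["perms"])
```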
create table directory_entry_file ( id bigserial, target sha1_git not null, -- id of target file name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_file is 'Directory entry for file'; comment on column directory_entry_file.id is 'File identifier'; comment on column directory_entry_file.target is 'Target file identifier'; comment on column directory_entry_file.name is 'Path name, relative to containing directory'; comment on column directory_entry_file.perms is 'Unix-like permissions'; -- A directory entry pointing to a revision. create table directory_entry_rev ( id bigserial, target sha1_git not null, -- id of target revision name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_rev is 'Directory entry for revision'; comment on column directory_entry_dir.id is 'Revision identifier'; comment on column directory_entry_dir.target is 'Target revision in identifier'; comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; comment on column directory_entry_dir.perms is 'Unix-like permissions'; -- A person referenced by some source code artifacts, e.g., a VCS revision or -- release metadata. create table person ( id bigserial, name bytea, email bytea, fullname bytea not null, displayname bytea ); comment on table person is 'Person, referenced in Revision author/committer or Release author'; comment on column person.id is 'Internal id'; comment on column person.name is 'Name (advisory, only present if parsed from fullname)'; comment on column person.email is 'Email (advisory, only present if parsed from fullname)'; comment on column person.fullname is 'Full name, usually of the form `Name `, ' 'used in integrity computations'; comment on column person.displayname is 'Full name, usually of the form `Name `, ' 'used for display queries'; -- The state of a source code tree at a specific point in time. -- -- Synonyms/mappings: -- * git / subversion / etc: commit -- * tarball: a specific tarball -- -- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in -- case of merges) parent revisions. Each revision points to a directory, i.e., -- a file-system tree containing files and directories. create table revision ( id sha1_git not null, date timestamptz, date_offset smallint, committer_date timestamptz, committer_date_offset smallint, type revision_type not null, directory sha1_git, -- source code 'root' directory message bytea, author bigint, committer bigint, synthetic boolean not null default false, -- true iff revision has been created by Software Heritage metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...) 
object_id bigserial, date_neg_utc_offset boolean, committer_date_neg_utc_offset boolean, extra_headers bytea[][] not null, -- extra headers (used in hash computation) date_offset_bytes bytea, committer_date_offset_bytes bytea, raw_manifest bytea -- git manifest of the object, if it cannot be represented using only the other fields ); comment on table revision is 'A revision represents the state of a source code tree at a specific point in time'; comment on column revision.id is 'Git-style SHA1 commit identifier'; comment on column revision.date is 'Author timestamp as UNIX epoch'; comment on column revision.date_offset is 'Author timestamp timezone, as minute offsets from UTC'; comment on column revision.date_neg_utc_offset is 'True indicates a -0 UTC offset on author timestamp'; comment on column revision.committer_date is 'Committer timestamp as UNIX epoch'; comment on column revision.committer_date_offset is 'Committer timestamp timezone, as minute offsets from UTC'; comment on column revision.committer_date_neg_utc_offset is 'True indicates a -0 UTC offset on committer timestamp'; comment on column revision.type is 'Type of revision'; comment on column revision.directory is 'Directory identifier'; comment on column revision.message is 'Commit message'; comment on column revision.author is 'Author identity'; comment on column revision.committer is 'Committer identity'; comment on column revision.synthetic is 'True iff revision has been synthesized by Software Heritage'; comment on column revision.metadata is 'Extra revision metadata'; comment on column revision.object_id is 'Non-intrinsic, sequential object identifier'; comment on column revision.extra_headers is 'Extra revision headers; used in revision hash computation'; comment on column revision.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``'; comment on column revision.committer_date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``'; comment on column revision.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; -- either this table or the sha1_git[] column on the revision table create table revision_history ( id sha1_git not null, parent_id sha1_git not null, parent_rank int not null default 0 -- parent position in merge commits, 0-based ); comment on table revision_history is 'Sequence of revision history with parent and position in history'; comment on column revision_history.id is 'Revision history git object sha1 checksum'; comment on column revision_history.parent_id is 'Parent revision git object identifier'; comment on column revision_history.parent_rank is 'Parent position in merge commits, 0-based'; -- Crawling history of software origins visited by Software Heritage. Each -- visit is a 3-way mapping between a software origin, a timestamp, and a -- snapshot object capturing the full-state of the origin at visit time. create table origin_visit ( origin bigint not null, visit bigint not null, date timestamptz not null, type text not null ); comment on column origin_visit.origin is 'Visited origin'; comment on column origin_visit.visit is 'Sequential visit number for the origin'; comment on column origin_visit.date is 'Visit timestamp'; comment on column origin_visit.type is 'Type of loader that did the visit (hg, git, ...)'; -- Crawling history of software origin visits by Software Heritage. 
Each -- visit see its history change through new origin visit status updates create table origin_visit_status ( origin bigint not null, visit bigint not null, date timestamptz not null, type text not null, status origin_visit_state not null, metadata jsonb, snapshot sha1_git ); comment on column origin_visit_status.origin is 'Origin concerned by the visit update'; comment on column origin_visit_status.visit is 'Visit concerned by the visit update'; comment on column origin_visit_status.date is 'Visit update timestamp'; comment on column origin_visit_status.type is 'Type of loader that did the visit (hg, git, ...)'; comment on column origin_visit_status.status is 'Visit status (ongoing, failed, full)'; comment on column origin_visit_status.metadata is 'Optional origin visit metadata'; comment on column origin_visit_status.snapshot is 'Optional, possibly partial, snapshot of the origin visit. It can be partial.'; -- A snapshot represents the entire state of a software origin as crawled by -- Software Heritage. This table is a simple mapping between (public) intrinsic -- snapshot identifiers and (private) numeric sequential identifiers. create table snapshot ( object_id bigserial not null, -- PK internal object identifier id sha1_git not null -- snapshot intrinsic identifier ); comment on table snapshot is 'State of a software origin as crawled by Software Heritage'; comment on column snapshot.object_id is 'Internal object identifier'; comment on column snapshot.id is 'Intrinsic snapshot identifier'; -- Each snapshot associate "branch" names to other objects in the Software -- Heritage Merkle DAG. This table describes branches as mappings between names -- and target typed objects. create table snapshot_branch ( object_id bigserial not null, -- PK internal object identifier name bytea not null, -- branch name, e.g., "master" or "feature/drag-n-drop" target bytea, -- target object identifier, e.g., a revision identifier target_type snapshot_target -- target object type, e.g., "revision" ); comment on table snapshot_branch is 'Associates branches with objects in Heritage Merkle DAG'; comment on column snapshot_branch.object_id is 'Internal object identifier'; comment on column snapshot_branch.name is 'Branch name'; comment on column snapshot_branch.target is 'Target object identifier'; comment on column snapshot_branch.target_type is 'Target object type'; -- Mapping between snapshots and their branches. create table snapshot_branches ( snapshot_id bigint not null, -- snapshot identifier, ref. snapshot.object_id branch_id bigint not null -- branch identifier, ref. snapshot_branch.object_id ); comment on table snapshot_branches is 'Mapping between snapshot and their branches'; comment on column snapshot_branches.snapshot_id is 'Snapshot identifier'; comment on column snapshot_branches.branch_id is 'Branch identifier'; -- A "memorable" point in time in the development history of a software -- project. 
-- -- Synonyms/mappings: -- * git: tag (of the annotated kind, otherwise they are just references) -- * tarball: the release version number create table release ( id sha1_git not null, target sha1_git, date timestamptz, date_offset smallint, name bytea, comment bytea, author bigint, synthetic boolean not null default false, -- true iff release has been created by Software Heritage object_id bigserial, target_type object_type not null, date_neg_utc_offset boolean, date_offset_bytes bytea, raw_manifest bytea ); comment on table release is 'Details of a software release, synonymous with a tag (git) or version number (tarball)'; comment on column release.id is 'Release git identifier'; comment on column release.target is 'Target git identifier'; comment on column release.date is 'Release timestamp'; comment on column release.date_offset is 'Timestamp offset from UTC'; comment on column release.name is 'Name'; comment on column release.comment is 'Comment'; comment on column release.author is 'Author'; comment on column release.synthetic is 'Indicates if created by Software Heritage'; comment on column release.object_id is 'Object identifier'; comment on column release.target_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp'; comment on column release.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``'; comment on column release.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; -- Tools create table metadata_fetcher ( id serial not null, name text not null, version text not null ); comment on table metadata_fetcher is 'Tools used to retrieve metadata'; comment on column metadata_fetcher.id is 'Internal identifier of the fetcher'; comment on column metadata_fetcher.name is 'Fetcher name'; comment on column metadata_fetcher.version is 'Fetcher version'; create table metadata_authority ( id serial not null, type text not null, url text not null ); comment on table metadata_authority is 'Metadata authority information'; comment on column metadata_authority.id is 'Internal identifier of the authority'; comment on column metadata_authority.type is 'Type of authority (deposit_client/forge/registry)'; comment on column metadata_authority.url is 'Authority''s uri'; -- Extrinsic metadata on a DAG objects and origins. 
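The snapshot / snapshot_branch / snapshot_branches triplet above stores each branch name together with a typed target, and snapshot_add (earlier in this patch) only copies branches for snapshot ids that are not already known. A minimal sketch of the corresponding API usage, assuming an in-memory backend; the revision id below is a placeholder value:

```
from swh.model.model import Snapshot, SnapshotBranch, TargetType
from swh.storage import get_storage

storage = get_storage("memory")  # assumption: stand-in for a configured backend

snapshot = Snapshot(
    branches={
        b"refs/heads/master": SnapshotBranch(
            target=bytes.fromhex("aafb16d69fd30ff58afdd69036a26047f3aebdc6"),  # placeholder id
            target_type=TargetType.REVISION,
        ),
        # an alias branch targets another branch name rather than an object id
        b"HEAD": SnapshotBranch(target=b"refs/heads/master", target_type=TargetType.ALIAS),
    }
)
print(storage.snapshot_add([snapshot]))   # e.g. {'snapshot:add': 1}
print(sorted(storage.snapshot_get_branches(snapshot.id)["branches"]))
```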
create table raw_extrinsic_metadata ( id sha1_git not null, type text not null, target text not null, -- metadata source authority_id bigint not null, fetcher_id bigint not null, discovery_date timestamptz not null, -- metadata itself format text not null, metadata bytea not null, -- context origin text, visit bigint, snapshot swhid, release swhid, revision swhid, path bytea, directory swhid ); comment on table raw_extrinsic_metadata is 'keeps all metadata found concerning an object'; comment on column raw_extrinsic_metadata.type is 'the type of object (content/directory/revision/release/snapshot/origin) the metadata is on'; comment on column raw_extrinsic_metadata.target is 'the SWHID or origin URL for which the metadata was found'; comment on column raw_extrinsic_metadata.discovery_date is 'the date of retrieval'; comment on column raw_extrinsic_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.'; comment on column raw_extrinsic_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.'; comment on column raw_extrinsic_metadata.format is 'name of the format of metadata, used by readers to interpret it.'; comment on column raw_extrinsic_metadata.metadata is 'original metadata in opaque format'; -- Keep a cache of object counts create table object_counts ( object_type text, -- table for which we're counting objects (PK) value bigint, -- count of objects in the table last_update timestamptz, -- last update for the object count in this table single_update boolean -- whether we update this table standalone (true) or through bucketed counts (false) ); comment on table object_counts is 'Cache of object counts'; comment on column object_counts.object_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column object_counts.value is 'Count of objects in the table'; comment on column object_counts.last_update is 'Last update for object count'; comment on column object_counts.single_update is 'standalone (true) or bucketed counts (false)'; create table object_counts_bucketed ( line serial not null, -- PK object_type text not null, -- table for which we're counting objects identifier text not null, -- identifier across which we're bucketing objects bucket_start bytea, -- lower bound (inclusive) for the bucket bucket_end bytea, -- upper bound (exclusive) for the bucket value bigint, -- count of objects in the bucket last_update timestamptz -- last update for the object count in this bucket ); comment on table object_counts_bucketed is 'Bucketed count for objects ordered by type'; comment on column object_counts_bucketed.line is 'Auto incremented idenitfier value'; comment on column object_counts_bucketed.object_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column object_counts_bucketed.identifier is 'Common identifier for bucketed objects'; comment on column object_counts_bucketed.bucket_start is 'Lower bound (inclusive) for the bucket'; comment on column object_counts_bucketed.bucket_end is 'Upper bound (exclusive) for the bucket'; comment on column object_counts_bucketed.value is 'Count of objects in the bucket'; comment on column object_counts_bucketed.last_update is 'Last update for the object count in this bucket'; -- The ExtID (typ. 
original VCS) <-> swhid relation table create table extid ( extid_type text not null, extid bytea not null, target_type object_type not null, target sha1_git not null, extid_version bigint not null default 0 ); comment on table extid is 'Correspondance SWH object (SWHID) <-> original revision id (vcs id)'; comment on column extid.extid_type is 'ExtID type'; comment on column extid.extid is 'Intrinsic identifier of the object (e.g. hg revision)'; comment on column extid.target_type is 'Type of SWHID of the referenced SWH object'; comment on column extid.target is 'Value (hash) of SWHID of the refenced SWH object'; comment on column extid.extid_version is 'Version of the extid type for the given original object'; diff --git a/sql/upgrades/015.sql b/swh/storage/sql/upgrades/015.sql similarity index 100% rename from sql/upgrades/015.sql rename to swh/storage/sql/upgrades/015.sql diff --git a/sql/upgrades/016.sql b/swh/storage/sql/upgrades/016.sql similarity index 100% rename from sql/upgrades/016.sql rename to swh/storage/sql/upgrades/016.sql diff --git a/sql/upgrades/017.sql b/swh/storage/sql/upgrades/017.sql similarity index 100% rename from sql/upgrades/017.sql rename to swh/storage/sql/upgrades/017.sql diff --git a/sql/upgrades/018.sql b/swh/storage/sql/upgrades/018.sql similarity index 100% rename from sql/upgrades/018.sql rename to swh/storage/sql/upgrades/018.sql diff --git a/sql/upgrades/019.sql b/swh/storage/sql/upgrades/019.sql similarity index 100% rename from sql/upgrades/019.sql rename to swh/storage/sql/upgrades/019.sql diff --git a/sql/upgrades/020.sql b/swh/storage/sql/upgrades/020.sql similarity index 100% rename from sql/upgrades/020.sql rename to swh/storage/sql/upgrades/020.sql diff --git a/sql/upgrades/021.sql b/swh/storage/sql/upgrades/021.sql similarity index 100% rename from sql/upgrades/021.sql rename to swh/storage/sql/upgrades/021.sql diff --git a/sql/upgrades/022.sql b/swh/storage/sql/upgrades/022.sql similarity index 100% rename from sql/upgrades/022.sql rename to swh/storage/sql/upgrades/022.sql diff --git a/sql/upgrades/023.sql b/swh/storage/sql/upgrades/023.sql similarity index 100% rename from sql/upgrades/023.sql rename to swh/storage/sql/upgrades/023.sql diff --git a/sql/upgrades/024.sql b/swh/storage/sql/upgrades/024.sql similarity index 100% rename from sql/upgrades/024.sql rename to swh/storage/sql/upgrades/024.sql diff --git a/sql/upgrades/025.sql b/swh/storage/sql/upgrades/025.sql similarity index 100% rename from sql/upgrades/025.sql rename to swh/storage/sql/upgrades/025.sql diff --git a/sql/upgrades/026.sql b/swh/storage/sql/upgrades/026.sql similarity index 100% rename from sql/upgrades/026.sql rename to swh/storage/sql/upgrades/026.sql diff --git a/sql/upgrades/027.sql b/swh/storage/sql/upgrades/027.sql similarity index 100% rename from sql/upgrades/027.sql rename to swh/storage/sql/upgrades/027.sql diff --git a/sql/upgrades/028.sql b/swh/storage/sql/upgrades/028.sql similarity index 100% rename from sql/upgrades/028.sql rename to swh/storage/sql/upgrades/028.sql diff --git a/sql/upgrades/029.sql b/swh/storage/sql/upgrades/029.sql similarity index 100% rename from sql/upgrades/029.sql rename to swh/storage/sql/upgrades/029.sql diff --git a/sql/upgrades/030.sql b/swh/storage/sql/upgrades/030.sql similarity index 100% rename from sql/upgrades/030.sql rename to swh/storage/sql/upgrades/030.sql diff --git a/sql/upgrades/032.sql b/swh/storage/sql/upgrades/032.sql similarity index 100% rename from sql/upgrades/032.sql rename to 
swh/storage/sql/upgrades/032.sql diff --git a/sql/upgrades/033.sql b/swh/storage/sql/upgrades/033.sql similarity index 100% rename from sql/upgrades/033.sql rename to swh/storage/sql/upgrades/033.sql diff --git a/sql/upgrades/034.sql b/swh/storage/sql/upgrades/034.sql similarity index 100% rename from sql/upgrades/034.sql rename to swh/storage/sql/upgrades/034.sql diff --git a/sql/upgrades/035.sql b/swh/storage/sql/upgrades/035.sql similarity index 100% rename from sql/upgrades/035.sql rename to swh/storage/sql/upgrades/035.sql diff --git a/sql/upgrades/036.sql b/swh/storage/sql/upgrades/036.sql similarity index 100% rename from sql/upgrades/036.sql rename to swh/storage/sql/upgrades/036.sql diff --git a/sql/upgrades/037.sql b/swh/storage/sql/upgrades/037.sql similarity index 100% rename from sql/upgrades/037.sql rename to swh/storage/sql/upgrades/037.sql diff --git a/sql/upgrades/038.sql b/swh/storage/sql/upgrades/038.sql similarity index 100% rename from sql/upgrades/038.sql rename to swh/storage/sql/upgrades/038.sql diff --git a/sql/upgrades/039.sql b/swh/storage/sql/upgrades/039.sql similarity index 100% rename from sql/upgrades/039.sql rename to swh/storage/sql/upgrades/039.sql diff --git a/sql/upgrades/040.sql b/swh/storage/sql/upgrades/040.sql similarity index 100% rename from sql/upgrades/040.sql rename to swh/storage/sql/upgrades/040.sql diff --git a/sql/upgrades/041.sql b/swh/storage/sql/upgrades/041.sql similarity index 100% rename from sql/upgrades/041.sql rename to swh/storage/sql/upgrades/041.sql diff --git a/sql/upgrades/042.sql b/swh/storage/sql/upgrades/042.sql similarity index 100% rename from sql/upgrades/042.sql rename to swh/storage/sql/upgrades/042.sql diff --git a/sql/upgrades/043.sql b/swh/storage/sql/upgrades/043.sql similarity index 100% rename from sql/upgrades/043.sql rename to swh/storage/sql/upgrades/043.sql diff --git a/sql/upgrades/044.sql b/swh/storage/sql/upgrades/044.sql similarity index 100% rename from sql/upgrades/044.sql rename to swh/storage/sql/upgrades/044.sql diff --git a/sql/upgrades/045.sql b/swh/storage/sql/upgrades/045.sql similarity index 100% rename from sql/upgrades/045.sql rename to swh/storage/sql/upgrades/045.sql diff --git a/sql/upgrades/046.sql b/swh/storage/sql/upgrades/046.sql similarity index 100% rename from sql/upgrades/046.sql rename to swh/storage/sql/upgrades/046.sql diff --git a/sql/upgrades/047.sql b/swh/storage/sql/upgrades/047.sql similarity index 100% rename from sql/upgrades/047.sql rename to swh/storage/sql/upgrades/047.sql diff --git a/sql/upgrades/048.sql b/swh/storage/sql/upgrades/048.sql similarity index 100% rename from sql/upgrades/048.sql rename to swh/storage/sql/upgrades/048.sql diff --git a/sql/upgrades/049.sql b/swh/storage/sql/upgrades/049.sql similarity index 100% rename from sql/upgrades/049.sql rename to swh/storage/sql/upgrades/049.sql diff --git a/sql/upgrades/050.sql b/swh/storage/sql/upgrades/050.sql similarity index 100% rename from sql/upgrades/050.sql rename to swh/storage/sql/upgrades/050.sql diff --git a/sql/upgrades/051.sql b/swh/storage/sql/upgrades/051.sql similarity index 100% rename from sql/upgrades/051.sql rename to swh/storage/sql/upgrades/051.sql diff --git a/sql/upgrades/052.sql b/swh/storage/sql/upgrades/052.sql similarity index 100% rename from sql/upgrades/052.sql rename to swh/storage/sql/upgrades/052.sql diff --git a/sql/upgrades/053.sql b/swh/storage/sql/upgrades/053.sql similarity index 100% rename from sql/upgrades/053.sql rename to swh/storage/sql/upgrades/053.sql diff --git 
a/sql/upgrades/054.sql b/swh/storage/sql/upgrades/054.sql similarity index 100% rename from sql/upgrades/054.sql rename to swh/storage/sql/upgrades/054.sql diff --git a/sql/upgrades/055.sql b/swh/storage/sql/upgrades/055.sql similarity index 100% rename from sql/upgrades/055.sql rename to swh/storage/sql/upgrades/055.sql diff --git a/sql/upgrades/056.sql b/swh/storage/sql/upgrades/056.sql similarity index 100% rename from sql/upgrades/056.sql rename to swh/storage/sql/upgrades/056.sql diff --git a/sql/upgrades/057.sql b/swh/storage/sql/upgrades/057.sql similarity index 100% rename from sql/upgrades/057.sql rename to swh/storage/sql/upgrades/057.sql diff --git a/sql/upgrades/058.sql b/swh/storage/sql/upgrades/058.sql similarity index 100% rename from sql/upgrades/058.sql rename to swh/storage/sql/upgrades/058.sql diff --git a/sql/upgrades/059.sql b/swh/storage/sql/upgrades/059.sql similarity index 100% rename from sql/upgrades/059.sql rename to swh/storage/sql/upgrades/059.sql diff --git a/sql/upgrades/060.sql b/swh/storage/sql/upgrades/060.sql similarity index 100% rename from sql/upgrades/060.sql rename to swh/storage/sql/upgrades/060.sql diff --git a/sql/upgrades/061.sql b/swh/storage/sql/upgrades/061.sql similarity index 100% rename from sql/upgrades/061.sql rename to swh/storage/sql/upgrades/061.sql diff --git a/sql/upgrades/062.sql b/swh/storage/sql/upgrades/062.sql similarity index 100% rename from sql/upgrades/062.sql rename to swh/storage/sql/upgrades/062.sql diff --git a/sql/upgrades/063.sql b/swh/storage/sql/upgrades/063.sql similarity index 100% rename from sql/upgrades/063.sql rename to swh/storage/sql/upgrades/063.sql diff --git a/sql/upgrades/064.sql b/swh/storage/sql/upgrades/064.sql similarity index 100% rename from sql/upgrades/064.sql rename to swh/storage/sql/upgrades/064.sql diff --git a/sql/upgrades/065.sql b/swh/storage/sql/upgrades/065.sql similarity index 100% rename from sql/upgrades/065.sql rename to swh/storage/sql/upgrades/065.sql diff --git a/sql/upgrades/066.sql b/swh/storage/sql/upgrades/066.sql similarity index 100% rename from sql/upgrades/066.sql rename to swh/storage/sql/upgrades/066.sql diff --git a/sql/upgrades/067.sql b/swh/storage/sql/upgrades/067.sql similarity index 100% rename from sql/upgrades/067.sql rename to swh/storage/sql/upgrades/067.sql diff --git a/sql/upgrades/068.sql b/swh/storage/sql/upgrades/068.sql similarity index 100% rename from sql/upgrades/068.sql rename to swh/storage/sql/upgrades/068.sql diff --git a/sql/upgrades/069.sql b/swh/storage/sql/upgrades/069.sql similarity index 100% rename from sql/upgrades/069.sql rename to swh/storage/sql/upgrades/069.sql diff --git a/sql/upgrades/070.sql b/swh/storage/sql/upgrades/070.sql similarity index 100% rename from sql/upgrades/070.sql rename to swh/storage/sql/upgrades/070.sql diff --git a/sql/upgrades/071.sql b/swh/storage/sql/upgrades/071.sql similarity index 100% rename from sql/upgrades/071.sql rename to swh/storage/sql/upgrades/071.sql diff --git a/sql/upgrades/072.sql b/swh/storage/sql/upgrades/072.sql similarity index 100% rename from sql/upgrades/072.sql rename to swh/storage/sql/upgrades/072.sql diff --git a/sql/upgrades/073.sql b/swh/storage/sql/upgrades/073.sql similarity index 100% rename from sql/upgrades/073.sql rename to swh/storage/sql/upgrades/073.sql diff --git a/sql/upgrades/074.sql b/swh/storage/sql/upgrades/074.sql similarity index 100% rename from sql/upgrades/074.sql rename to swh/storage/sql/upgrades/074.sql diff --git a/sql/upgrades/075.sql 
b/swh/storage/sql/upgrades/075.sql
similarity index 100%
rename from sql/upgrades/075.sql
rename to swh/storage/sql/upgrades/075.sql
diff --git a/sql/upgrades/076.sql b/swh/storage/sql/upgrades/076.sql
similarity index 100%
rename from sql/upgrades/076.sql
rename to swh/storage/sql/upgrades/076.sql
diff --git a/sql/upgrades/077.sql b/swh/storage/sql/upgrades/077.sql
similarity index 100%
rename from sql/upgrades/077.sql
rename to swh/storage/sql/upgrades/077.sql
diff --git a/sql/upgrades/078.sql b/swh/storage/sql/upgrades/078.sql
similarity index 100%
rename from sql/upgrades/078.sql
rename to swh/storage/sql/upgrades/078.sql
diff --git a/sql/upgrades/079.sql b/swh/storage/sql/upgrades/079.sql
similarity index 100%
rename from sql/upgrades/079.sql
rename to swh/storage/sql/upgrades/079.sql
diff --git a/sql/upgrades/080.sql b/swh/storage/sql/upgrades/080.sql
similarity index 100%
rename from sql/upgrades/080.sql
rename to swh/storage/sql/upgrades/080.sql
diff --git a/sql/upgrades/081.sql b/swh/storage/sql/upgrades/081.sql
similarity index 100%
rename from sql/upgrades/081.sql
rename to swh/storage/sql/upgrades/081.sql
diff --git a/sql/upgrades/082.sql b/swh/storage/sql/upgrades/082.sql
similarity index 100%
rename from sql/upgrades/082.sql
rename to swh/storage/sql/upgrades/082.sql
diff --git a/sql/upgrades/083.sql b/swh/storage/sql/upgrades/083.sql
similarity index 100%
rename from sql/upgrades/083.sql
rename to swh/storage/sql/upgrades/083.sql
diff --git a/sql/upgrades/084.sql b/swh/storage/sql/upgrades/084.sql
similarity index 100%
rename from sql/upgrades/084.sql
rename to swh/storage/sql/upgrades/084.sql
diff --git a/sql/upgrades/085.sql b/swh/storage/sql/upgrades/085.sql
similarity index 100%
rename from sql/upgrades/085.sql
rename to swh/storage/sql/upgrades/085.sql
diff --git a/sql/upgrades/086.sql b/swh/storage/sql/upgrades/086.sql
similarity index 100%
rename from sql/upgrades/086.sql
rename to swh/storage/sql/upgrades/086.sql
diff --git a/sql/upgrades/087.sql b/swh/storage/sql/upgrades/087.sql
similarity index 100%
rename from sql/upgrades/087.sql
rename to swh/storage/sql/upgrades/087.sql
diff --git a/sql/upgrades/088.sql b/swh/storage/sql/upgrades/088.sql
similarity index 100%
rename from sql/upgrades/088.sql
rename to swh/storage/sql/upgrades/088.sql
diff --git a/sql/upgrades/089.sql b/swh/storage/sql/upgrades/089.sql
similarity index 100%
rename from sql/upgrades/089.sql
rename to swh/storage/sql/upgrades/089.sql
diff --git a/sql/upgrades/090.sql b/swh/storage/sql/upgrades/090.sql
similarity index 100%
rename from sql/upgrades/090.sql
rename to swh/storage/sql/upgrades/090.sql
diff --git a/sql/upgrades/091.sql b/swh/storage/sql/upgrades/091.sql
similarity index 100%
rename from sql/upgrades/091.sql
rename to swh/storage/sql/upgrades/091.sql
diff --git a/sql/upgrades/092.sql b/swh/storage/sql/upgrades/092.sql
similarity index 100%
rename from sql/upgrades/092.sql
rename to swh/storage/sql/upgrades/092.sql
diff --git a/sql/upgrades/093.sql b/swh/storage/sql/upgrades/093.sql
similarity index 100%
rename from sql/upgrades/093.sql
rename to swh/storage/sql/upgrades/093.sql
diff --git a/sql/upgrades/094.sql b/swh/storage/sql/upgrades/094.sql
similarity index 100%
rename from sql/upgrades/094.sql
rename to swh/storage/sql/upgrades/094.sql
diff --git a/sql/upgrades/095.sql b/swh/storage/sql/upgrades/095.sql
similarity index 100%
rename from sql/upgrades/095.sql
rename to swh/storage/sql/upgrades/095.sql
diff --git a/sql/upgrades/096.sql b/swh/storage/sql/upgrades/096.sql
similarity index 100%
rename from sql/upgrades/096.sql
rename to swh/storage/sql/upgrades/096.sql
diff --git a/sql/upgrades/097.sql b/swh/storage/sql/upgrades/097.sql
similarity index 100%
rename from sql/upgrades/097.sql
rename to swh/storage/sql/upgrades/097.sql
diff --git a/sql/upgrades/098.sql b/swh/storage/sql/upgrades/098.sql
similarity index 100%
rename from sql/upgrades/098.sql
rename to swh/storage/sql/upgrades/098.sql
diff --git a/sql/upgrades/099.sql b/swh/storage/sql/upgrades/099.sql
similarity index 100%
rename from sql/upgrades/099.sql
rename to swh/storage/sql/upgrades/099.sql
diff --git a/sql/upgrades/100.sql b/swh/storage/sql/upgrades/100.sql
similarity index 100%
rename from sql/upgrades/100.sql
rename to swh/storage/sql/upgrades/100.sql
diff --git a/sql/upgrades/101.sql b/swh/storage/sql/upgrades/101.sql
similarity index 100%
rename from sql/upgrades/101.sql
rename to swh/storage/sql/upgrades/101.sql
diff --git a/sql/upgrades/102.sql b/swh/storage/sql/upgrades/102.sql
similarity index 100%
rename from sql/upgrades/102.sql
rename to swh/storage/sql/upgrades/102.sql
diff --git a/sql/upgrades/103.sql b/swh/storage/sql/upgrades/103.sql
similarity index 100%
rename from sql/upgrades/103.sql
rename to swh/storage/sql/upgrades/103.sql
diff --git a/sql/upgrades/104.sql b/swh/storage/sql/upgrades/104.sql
similarity index 100%
rename from sql/upgrades/104.sql
rename to swh/storage/sql/upgrades/104.sql
diff --git a/sql/upgrades/105.sql b/swh/storage/sql/upgrades/105.sql
similarity index 100%
rename from sql/upgrades/105.sql
rename to swh/storage/sql/upgrades/105.sql
diff --git a/sql/upgrades/106.sql b/swh/storage/sql/upgrades/106.sql
similarity index 100%
rename from sql/upgrades/106.sql
rename to swh/storage/sql/upgrades/106.sql
diff --git a/sql/upgrades/107.sql b/swh/storage/sql/upgrades/107.sql
similarity index 100%
rename from sql/upgrades/107.sql
rename to swh/storage/sql/upgrades/107.sql
diff --git a/sql/upgrades/108.sql b/swh/storage/sql/upgrades/108.sql
similarity index 100%
rename from sql/upgrades/108.sql
rename to swh/storage/sql/upgrades/108.sql
diff --git a/sql/upgrades/109.sql b/swh/storage/sql/upgrades/109.sql
similarity index 100%
rename from sql/upgrades/109.sql
rename to swh/storage/sql/upgrades/109.sql
diff --git a/sql/upgrades/110.sql b/swh/storage/sql/upgrades/110.sql
similarity index 100%
rename from sql/upgrades/110.sql
rename to swh/storage/sql/upgrades/110.sql
diff --git a/sql/upgrades/111.sql b/swh/storage/sql/upgrades/111.sql
similarity index 100%
rename from sql/upgrades/111.sql
rename to swh/storage/sql/upgrades/111.sql
diff --git a/sql/upgrades/112.sql b/swh/storage/sql/upgrades/112.sql
similarity index 100%
rename from sql/upgrades/112.sql
rename to swh/storage/sql/upgrades/112.sql
diff --git a/sql/upgrades/113.sql b/swh/storage/sql/upgrades/113.sql
similarity index 100%
rename from sql/upgrades/113.sql
rename to swh/storage/sql/upgrades/113.sql
diff --git a/sql/upgrades/114.sql b/swh/storage/sql/upgrades/114.sql
similarity index 100%
rename from sql/upgrades/114.sql
rename to swh/storage/sql/upgrades/114.sql
diff --git a/sql/upgrades/115.sql b/swh/storage/sql/upgrades/115.sql
similarity index 100%
rename from sql/upgrades/115.sql
rename to swh/storage/sql/upgrades/115.sql
diff --git a/sql/upgrades/116.sql b/swh/storage/sql/upgrades/116.sql
similarity index 100%
rename from sql/upgrades/116.sql
rename to swh/storage/sql/upgrades/116.sql
diff --git a/sql/upgrades/117.sql b/swh/storage/sql/upgrades/117.sql
similarity index 100%
rename from sql/upgrades/117.sql
rename to swh/storage/sql/upgrades/117.sql
diff --git a/sql/upgrades/118.sql b/swh/storage/sql/upgrades/118.sql
similarity index 100%
rename from sql/upgrades/118.sql
rename to swh/storage/sql/upgrades/118.sql
diff --git a/sql/upgrades/119.sql b/swh/storage/sql/upgrades/119.sql
similarity index 100%
rename from sql/upgrades/119.sql
rename to swh/storage/sql/upgrades/119.sql
diff --git a/sql/upgrades/120.sql b/swh/storage/sql/upgrades/120.sql
similarity index 100%
rename from sql/upgrades/120.sql
rename to swh/storage/sql/upgrades/120.sql
diff --git a/sql/upgrades/121.sql b/swh/storage/sql/upgrades/121.sql
similarity index 100%
rename from sql/upgrades/121.sql
rename to swh/storage/sql/upgrades/121.sql
diff --git a/sql/upgrades/122.sql b/swh/storage/sql/upgrades/122.sql
similarity index 100%
rename from sql/upgrades/122.sql
rename to swh/storage/sql/upgrades/122.sql
diff --git a/sql/upgrades/123.sql b/swh/storage/sql/upgrades/123.sql
similarity index 100%
rename from sql/upgrades/123.sql
rename to swh/storage/sql/upgrades/123.sql
diff --git a/sql/upgrades/124.sql b/swh/storage/sql/upgrades/124.sql
similarity index 100%
rename from sql/upgrades/124.sql
rename to swh/storage/sql/upgrades/124.sql
diff --git a/sql/upgrades/125.sql b/swh/storage/sql/upgrades/125.sql
similarity index 100%
rename from sql/upgrades/125.sql
rename to swh/storage/sql/upgrades/125.sql
diff --git a/sql/upgrades/126.sql b/swh/storage/sql/upgrades/126.sql
similarity index 100%
rename from sql/upgrades/126.sql
rename to swh/storage/sql/upgrades/126.sql
diff --git a/sql/upgrades/127.sql b/swh/storage/sql/upgrades/127.sql
similarity index 100%
rename from sql/upgrades/127.sql
rename to swh/storage/sql/upgrades/127.sql
diff --git a/sql/upgrades/128.sql b/swh/storage/sql/upgrades/128.sql
similarity index 100%
rename from sql/upgrades/128.sql
rename to swh/storage/sql/upgrades/128.sql
diff --git a/sql/upgrades/129.sql b/swh/storage/sql/upgrades/129.sql
similarity index 100%
rename from sql/upgrades/129.sql
rename to swh/storage/sql/upgrades/129.sql
diff --git a/sql/upgrades/130.sql b/swh/storage/sql/upgrades/130.sql
similarity index 100%
rename from sql/upgrades/130.sql
rename to swh/storage/sql/upgrades/130.sql
diff --git a/sql/upgrades/131.sql b/swh/storage/sql/upgrades/131.sql
similarity index 100%
rename from sql/upgrades/131.sql
rename to swh/storage/sql/upgrades/131.sql
diff --git a/sql/upgrades/132.sql b/swh/storage/sql/upgrades/132.sql
similarity index 100%
rename from sql/upgrades/132.sql
rename to swh/storage/sql/upgrades/132.sql
diff --git a/sql/upgrades/133.sql b/swh/storage/sql/upgrades/133.sql
similarity index 100%
rename from sql/upgrades/133.sql
rename to swh/storage/sql/upgrades/133.sql
diff --git a/sql/upgrades/134.sql b/swh/storage/sql/upgrades/134.sql
similarity index 100%
rename from sql/upgrades/134.sql
rename to swh/storage/sql/upgrades/134.sql
diff --git a/sql/upgrades/135.sql b/swh/storage/sql/upgrades/135.sql
similarity index 100%
rename from sql/upgrades/135.sql
rename to swh/storage/sql/upgrades/135.sql
diff --git a/sql/upgrades/136.sql b/swh/storage/sql/upgrades/136.sql
similarity index 100%
rename from sql/upgrades/136.sql
rename to swh/storage/sql/upgrades/136.sql
diff --git a/sql/upgrades/137.sql b/swh/storage/sql/upgrades/137.sql
similarity index 100%
rename from sql/upgrades/137.sql
rename to swh/storage/sql/upgrades/137.sql
diff --git a/sql/upgrades/138.sql b/swh/storage/sql/upgrades/138.sql
similarity index 100%
rename from sql/upgrades/138.sql
rename to swh/storage/sql/upgrades/138.sql
diff --git a/sql/upgrades/139.sql b/swh/storage/sql/upgrades/139.sql
similarity index 100%
rename from sql/upgrades/139.sql
rename to swh/storage/sql/upgrades/139.sql
diff --git a/sql/upgrades/140.sql b/swh/storage/sql/upgrades/140.sql
similarity index 100%
rename from sql/upgrades/140.sql
rename to swh/storage/sql/upgrades/140.sql
diff --git a/sql/upgrades/141.sql b/swh/storage/sql/upgrades/141.sql
similarity index 100%
rename from sql/upgrades/141.sql
rename to swh/storage/sql/upgrades/141.sql
diff --git a/sql/upgrades/142.sql b/swh/storage/sql/upgrades/142.sql
similarity index 100%
rename from sql/upgrades/142.sql
rename to swh/storage/sql/upgrades/142.sql
diff --git a/sql/upgrades/143.sql b/swh/storage/sql/upgrades/143.sql
similarity index 100%
rename from sql/upgrades/143.sql
rename to swh/storage/sql/upgrades/143.sql
diff --git a/sql/upgrades/144.sql b/swh/storage/sql/upgrades/144.sql
similarity index 100%
rename from sql/upgrades/144.sql
rename to swh/storage/sql/upgrades/144.sql
diff --git a/sql/upgrades/145.sql b/swh/storage/sql/upgrades/145.sql
similarity index 100%
rename from sql/upgrades/145.sql
rename to swh/storage/sql/upgrades/145.sql
diff --git a/sql/upgrades/146.sql b/swh/storage/sql/upgrades/146.sql
similarity index 100%
rename from sql/upgrades/146.sql
rename to swh/storage/sql/upgrades/146.sql
diff --git a/sql/upgrades/147.sql b/swh/storage/sql/upgrades/147.sql
similarity index 100%
rename from sql/upgrades/147.sql
rename to swh/storage/sql/upgrades/147.sql
diff --git a/sql/upgrades/148.sql b/swh/storage/sql/upgrades/148.sql
similarity index 100%
rename from sql/upgrades/148.sql
rename to swh/storage/sql/upgrades/148.sql
diff --git a/sql/upgrades/149.sql b/swh/storage/sql/upgrades/149.sql
similarity index 100%
rename from sql/upgrades/149.sql
rename to swh/storage/sql/upgrades/149.sql
diff --git a/sql/upgrades/150.sql b/swh/storage/sql/upgrades/150.sql
similarity index 100%
rename from sql/upgrades/150.sql
rename to swh/storage/sql/upgrades/150.sql
diff --git a/sql/upgrades/151.sql b/swh/storage/sql/upgrades/151.sql
similarity index 100%
rename from sql/upgrades/151.sql
rename to swh/storage/sql/upgrades/151.sql
diff --git a/sql/upgrades/152.sql b/swh/storage/sql/upgrades/152.sql
similarity index 100%
rename from sql/upgrades/152.sql
rename to swh/storage/sql/upgrades/152.sql
diff --git a/sql/upgrades/153.sql b/swh/storage/sql/upgrades/153.sql
similarity index 100%
rename from sql/upgrades/153.sql
rename to swh/storage/sql/upgrades/153.sql
diff --git a/sql/upgrades/154.sql b/swh/storage/sql/upgrades/154.sql
similarity index 100%
rename from sql/upgrades/154.sql
rename to swh/storage/sql/upgrades/154.sql
diff --git a/sql/upgrades/155.sql b/swh/storage/sql/upgrades/155.sql
similarity index 100%
rename from sql/upgrades/155.sql
rename to swh/storage/sql/upgrades/155.sql
diff --git a/sql/upgrades/156.sql b/swh/storage/sql/upgrades/156.sql
similarity index 100%
rename from sql/upgrades/156.sql
rename to swh/storage/sql/upgrades/156.sql
diff --git a/sql/upgrades/157.sql b/swh/storage/sql/upgrades/157.sql
similarity index 100%
rename from sql/upgrades/157.sql
rename to swh/storage/sql/upgrades/157.sql
diff --git a/sql/upgrades/158.sql b/swh/storage/sql/upgrades/158.sql
similarity index 100%
rename from sql/upgrades/158.sql
rename to swh/storage/sql/upgrades/158.sql
diff --git a/sql/upgrades/159.sql b/swh/storage/sql/upgrades/159.sql
similarity index 100%
rename from sql/upgrades/159.sql
rename to swh/storage/sql/upgrades/159.sql
diff --git a/sql/upgrades/160.sql b/swh/storage/sql/upgrades/160.sql
similarity index 100%
rename from sql/upgrades/160.sql
rename to swh/storage/sql/upgrades/160.sql
diff --git a/sql/upgrades/161.sql b/swh/storage/sql/upgrades/161.sql
similarity index 100%
rename from sql/upgrades/161.sql
rename to swh/storage/sql/upgrades/161.sql
diff --git a/sql/upgrades/162.sql b/swh/storage/sql/upgrades/162.sql
similarity index 100%
rename from sql/upgrades/162.sql
rename to swh/storage/sql/upgrades/162.sql
diff --git a/sql/upgrades/163.sql b/swh/storage/sql/upgrades/163.sql
similarity index 100%
rename from sql/upgrades/163.sql
rename to swh/storage/sql/upgrades/163.sql
diff --git a/sql/upgrades/164.sql b/swh/storage/sql/upgrades/164.sql
similarity index 100%
rename from sql/upgrades/164.sql
rename to swh/storage/sql/upgrades/164.sql
diff --git a/sql/upgrades/165.sql b/swh/storage/sql/upgrades/165.sql
similarity index 100%
rename from sql/upgrades/165.sql
rename to swh/storage/sql/upgrades/165.sql
diff --git a/sql/upgrades/166.sql b/swh/storage/sql/upgrades/166.sql
similarity index 100%
rename from sql/upgrades/166.sql
rename to swh/storage/sql/upgrades/166.sql
diff --git a/sql/upgrades/167.sql b/swh/storage/sql/upgrades/167.sql
similarity index 100%
rename from sql/upgrades/167.sql
rename to swh/storage/sql/upgrades/167.sql
diff --git a/sql/upgrades/168.sql b/swh/storage/sql/upgrades/168.sql
similarity index 100%
rename from sql/upgrades/168.sql
rename to swh/storage/sql/upgrades/168.sql
diff --git a/sql/upgrades/169.sql b/swh/storage/sql/upgrades/169.sql
similarity index 100%
rename from sql/upgrades/169.sql
rename to swh/storage/sql/upgrades/169.sql
diff --git a/sql/upgrades/170.sql b/swh/storage/sql/upgrades/170.sql
similarity index 100%
rename from sql/upgrades/170.sql
rename to swh/storage/sql/upgrades/170.sql
diff --git a/sql/upgrades/171.sql b/swh/storage/sql/upgrades/171.sql
similarity index 100%
rename from sql/upgrades/171.sql
rename to swh/storage/sql/upgrades/171.sql
diff --git a/sql/upgrades/172.sql b/swh/storage/sql/upgrades/172.sql
similarity index 100%
rename from sql/upgrades/172.sql
rename to swh/storage/sql/upgrades/172.sql
diff --git a/sql/upgrades/173.sql b/swh/storage/sql/upgrades/173.sql
similarity index 100%
rename from sql/upgrades/173.sql
rename to swh/storage/sql/upgrades/173.sql
diff --git a/sql/upgrades/174.sql b/swh/storage/sql/upgrades/174.sql
similarity index 100%
rename from sql/upgrades/174.sql
rename to swh/storage/sql/upgrades/174.sql
diff --git a/sql/upgrades/175.sql b/swh/storage/sql/upgrades/175.sql
similarity index 100%
rename from sql/upgrades/175.sql
rename to swh/storage/sql/upgrades/175.sql
diff --git a/sql/upgrades/176.sql b/swh/storage/sql/upgrades/176.sql
similarity index 100%
rename from sql/upgrades/176.sql
rename to swh/storage/sql/upgrades/176.sql
diff --git a/sql/upgrades/177.sql b/swh/storage/sql/upgrades/177.sql
similarity index 100%
rename from sql/upgrades/177.sql
rename to swh/storage/sql/upgrades/177.sql
diff --git a/sql/upgrades/178.sql b/swh/storage/sql/upgrades/178.sql
similarity index 100%
rename from sql/upgrades/178.sql
rename to swh/storage/sql/upgrades/178.sql
diff --git a/sql/upgrades/179.sql b/swh/storage/sql/upgrades/179.sql
similarity index 100%
rename from sql/upgrades/179.sql
rename to swh/storage/sql/upgrades/179.sql
diff --git a/sql/upgrades/180.sql b/swh/storage/sql/upgrades/180.sql
similarity index 100%
rename from sql/upgrades/180.sql
rename to swh/storage/sql/upgrades/180.sql
diff --git a/sql/upgrades/181.sql b/swh/storage/sql/upgrades/181.sql
similarity index 100%
rename from sql/upgrades/181.sql
rename to swh/storage/sql/upgrades/181.sql
diff --git a/sql/upgrades/182.sql b/swh/storage/sql/upgrades/182.sql
similarity index 100%
rename from sql/upgrades/182.sql
rename to swh/storage/sql/upgrades/182.sql