diff --git a/docs/index.rst b/docs/index.rst index 17037bbd..e320b186 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,46 +1,100 @@ .. _swh-storage: Software Heritage - Storage =========================== Abstraction layer over the archive, allowing to access all stored source code artifacts as well as their metadata The Software Heritage storage consist of a high-level storage layer (:mod:`swh.storage`) that exposes a client/server API (:mod:`swh.storage.api`). The API is exposed by a server (:mod:`swh.storage.api.server`) and accessible via a client (:mod:`swh.storage.api.client`). The low-level implementation of the storage is split between an object storage (:ref:`swh.objstorage `), which stores all "blobs" (i.e., the leaves of the :ref:`data-model`) and a SQL representation of the rest of the graph (:mod:`swh.storage.storage`). +Using ``swh-storage`` +--------------------- + +First, note that ``swh-storage`` is an internal API of Software Heritage, that +is only available to software running on the SWH infrastructure and developers +:ref:`running their own Software Heritage `. +If you want to access the Software Heritage archive without running your own, +you should use the `Web API`_ instead. + +As ``swh-storage`` has multiple backends, it is instantiated via the +:py:func:`swh.storage.get_storage` function, which takes as argument the backend type +(usually ``remote``, if you already have access to a running swh-storage). + +It returns an instance of a class implementing +:py:class:`swh.storage.interface.StorageInterface`, which is mostly a set of key-value +stores, one for each object type. + +Many of the arguments and return types are "model objects", i.e., immutable objects +that are instances of the classes defined in :py:mod:`swh.model.model`. + +Methods returning long lists of results are paginated: they return both a list +of results and an opaque token to get the next page of results. 
+For example, to list all the visits of an origin using ``origin_visit_get`` +ten visits at a time, you can do: + +.. code-block:: + + storage = get_storage("remote", url="http://localhost:5002") + page_token = None + while True: + page = storage.origin_visit_get( + origin="https://github.com/torvalds/linux", + page_token=page_token, + limit=10, + ) + for visit in page.results: + print(visit) + page_token = page.next_page_token + if page_token is None: + break + +Or, using :py:func:`swh.core.api.classes.stream_results` for convenience: + +.. code-block:: + + storage = get_storage("remote", url="http://localhost:5002") + visits = stream_results( + storage.origin_visit_get, origin="https://github.com/torvalds/linux" + ) + for visit in visits: + print(visit) + +.. _Web API: https://archive.softwareheritage.org/api/ + Database schema --------------- * :ref:`sql-storage` Archive copies -------------- * :ref:`archive-copies` Specifications -------------- * :ref:`extrinsic-metadata-specification` Reference Documentation ----------------------- .. toctree:: :maxdepth: 2 cli /apidoc/swh.storage diff --git a/swh/storage/__init__.py b/swh/storage/__init__.py index d13eb462..37d1bebe 100644 --- a/swh/storage/__init__.py +++ b/swh/storage/__init__.py @@ -1,106 +1,110 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import importlib from typing import TYPE_CHECKING, Any, Dict, List import warnings if TYPE_CHECKING: from .interface import StorageInterface STORAGE_IMPLEMENTATIONS = { "local": ".postgresql.storage.Storage", "remote": ".api.client.RemoteStorage", "memory": ".in_memory.InMemoryStorage", "filter": ".filter.FilteringProxyStorage", "buffer": ".buffer.BufferingProxyStorage", "retry": ".retry.RetryingProxyStorage", "cassandra": ".cassandra.CassandraStorage", "validate": ".validate.ValidatingProxyStorage", } def get_storage(cls: str, **kwargs) -> 
"StorageInterface": """Get a storage object of class `storage_class` with arguments `storage_args`. Args: - storage (dict): dictionary with keys: - - cls (str): storage's class, either local, remote, memory, filter, - buffer - - args (dict): dictionary with keys + cls (str): storage's class, can be: + - ``local`` to use a PostgreSQL database + - ``cassandra`` to use a Cassandra database + - ``remote`` to connect to a swh-storage server + - ``memory`` for an in-memory storage, useful for fast tests + - ``filter``, ``buffer``, ... to use specific storage "proxies", see their + respective documentations + kwargs: keyword arguments passed to the storage class constructor Returns: an instance of swh.storage.Storage or compatible class Raises: ValueError if passed an unknown storage class. """ if "args" in kwargs: warnings.warn( 'Explicit "args" key is deprecated, use keys directly instead.', DeprecationWarning, ) kwargs = kwargs["args"] if cls == "pipeline": return get_storage_pipeline(**kwargs) class_path = STORAGE_IMPLEMENTATIONS.get(cls) if class_path is None: raise ValueError( "Unknown storage class `%s`. Supported: %s" % (cls, ", ".join(STORAGE_IMPLEMENTATIONS)) ) (module_path, class_name) = class_path.rsplit(".", 1) module = importlib.import_module(module_path, package=__package__) Storage = getattr(module, class_name) check_config = kwargs.pop("check_config", {}) storage = Storage(**kwargs) if check_config: if not storage.check_config(**check_config): raise EnvironmentError("storage check config failed") return storage def get_storage_pipeline( steps: List[Dict[str, Any]], check_config=None ) -> "StorageInterface": """Recursively get a storage object that may use other storage objects as backends. Args: steps (List[dict]): List of dicts that may be used as kwargs for `get_storage`. Returns: an instance of swh.storage.Storage or compatible class Raises: ValueError if passed an unknown storage class. 
""" storage_config = None for step in reversed(steps): if "args" in step: warnings.warn( 'Explicit "args" key is deprecated, use keys directly ' "instead.", DeprecationWarning, ) step = { "cls": step["cls"], **step["args"], } if storage_config: step["storage"] = storage_config step["check_config"] = check_config storage_config = step if storage_config is None: raise ValueError("'pipeline' has no steps.") return get_storage(**storage_config)