diff --git a/requirements.txt b/requirements.txt index 38d44f7..908774f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,14 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html sphinx < 4.4.0 # version 4.4.0 adds this kind of warning: "WARNING: hardcoded link 'https://archive.softwareheritage.org/api/1/vault/directory/doc/' could be replaced by an extlink (try using ':swh_web_api:`vault/directory/doc/`' instead)", which breaks the CI since we fail on warnings sphinxcontrib-httpdomain >= 1.8.0 sphinxcontrib-images sphinxcontrib-programoutput sphinx-tabs +sphinx-panels sphinx-reredirects sphinx_rtd_theme sphinx-click myst-parser sphinx-celery diff --git a/swh/docs/sphinx/conf.py b/swh/docs/sphinx/conf.py index 8a1f874..b18e527 100755 --- a/swh/docs/sphinx/conf.py +++ b/swh/docs/sphinx/conf.py @@ -1,286 +1,287 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # import logging import os from typing import Dict from sphinx.ext import autodoc from swh.docs.django_settings import force_django_settings # General information about the project. project = "Software Heritage - Development Documentation" copyright = "2015-2021 The Software Heritage developers" author = "The Software Heritage developers" # -- General configuration ------------------------------------------------ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.napoleon", "sphinx.ext.intersphinx", "sphinxcontrib.httpdomain", "sphinx.ext.extlinks", "sphinxcontrib.images", "sphinxcontrib.programoutput", "sphinx.ext.viewcode", "sphinx_tabs.tabs", "sphinx_rtd_theme", "sphinx.ext.graphviz", "sphinx_click.ext", "myst_parser", "sphinx.ext.todo", "sphinx_reredirects", "swh.docs.sphinx.view_in_phabricator", # swh.scheduler inherits some attribute descriptions from celery that use # custom crossrefs (eg. :setting:`task_ignore_result`) "sphinx_celery.setting_crossref", + "sphinx_panels", ] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ".rst" # The master toctree document. master_doc = "index" # A string of reStructuredText that will be included at the beginning of every # source file that is read. # A bit hackish but should work both for each swh package and the whole swh-doc rst_prolog = """ .. include:: /../../swh-docs/docs/swh_substitutions """ # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = "" # The full version, including alpha/beta/rc tags. release = "" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path exclude_patterns = [ "_build", "swh-icinga-plugins/index.rst", "swh.loader.cvs.rcsparse.setup.rst", "apidoc/swh.loader.cvs.rcsparse.setup.rst", ] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme" html_favicon = "_static/favicon.ico" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = { "collapse_navigation": True, "sticky_navigation": True, } html_logo = "_static/software-heritage-logo-title-motto-vertical-white.png" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] # make logo actually appear, avoiding gotcha due to alabaster default conf. # https://github.com/bitprophet/alabaster/issues/97#issuecomment-303722935 html_sidebars = { "**": [ "about.html", "globaltoc.html", "relations.html", "sourcelink.html", "searchbox.html", ] } # If not None, a 'Last updated on:' timestamp is inserted at every page # bottom, using the given strftime format. # The empty string is equivalent to '%b %d, %Y'. html_last_updated_fmt = "%Y-%m-%d %H:%M:%S %Z" # refer to the Python standard library. intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "swh-devel": ("https://docs.softwareheritage.org/devel", None), "swh-sysadm": ("https://docs.softwareheritage.org/sysadm", None), } # Redirects for pages that were moved, so we don't break external links. # Uses sphinx-reredirects redirects = { "swh-deposit/spec-api": "api/api-documentation.html", "swh-deposit/metadata": "api/metadata.html", "swh-deposit/specs/blueprint": "../api/use-cases.html", "swh-deposit/user-manual": "api/user-manual.html", "infrastructure/index.html": "../../sysadm/network-architecture/index.html", "infrastructure/network.html": "../../sysadm/network-architecture/index.html", "infrastructure/service-urls.html": "../../sysadm/network-architecture/service-urls.html", # noqa "architecture": "architecture/overview.html", "architecture/mirror": "../../sysadm/mirror-operations/index.html", "keycloak": "../../sysadm/user-management/keycloak/index.html", "mirror": "architecture/mirror.html", "users": "user", } # -- autodoc configuration ---------------------------------------------- autodoc_default_flags = [ "members", "undoc-members", "private-members", "special-members", ] autodoc_member_order = "bysource" autodoc_mock_imports = [ "rados", ] autoclass_content = "both" modindex_common_prefix = ["swh."] # For the todo extension. 
Todo and todolist produce output only if this is True todo_include_todos = True _swh_web_base_url = "https://archive.softwareheritage.org" # for the extlinks extension, sub-projects should fill that dict extlinks: Dict = { "swh_web": (f"{_swh_web_base_url}/%s", None), "swh_web_api": (f"{_swh_web_base_url}/api/1/%s", None), "swh_web_browse": (f"{_swh_web_base_url}/browse/%s", None), } # SWH_PACKAGE_DOC_TOX_BUILD environment variable is set in a tox environment # named sphinx for each swh package (except the swh-docs package itself). swh_package_doc_tox_build = os.environ.get("SWH_PACKAGE_DOC_TOX_BUILD", False) # override some configuration when building a swh package # documentation with tox to remove warnings and suppress # those related to unresolved references if swh_package_doc_tox_build: swh_substitutions = os.path.join( os.path.dirname(__file__), "../../../docs/swh_substitutions" ) rst_prolog = f".. include:: /{swh_substitutions}" suppress_warnings = ["ref.ref"] html_favicon = "" html_logo = "" class SimpleDocumenter(autodoc.FunctionDocumenter): """ Custom autodoc directive to inline the docstring of a function in a document without the signature header and with no indentation. Example of use:: .. autosimple:: swh.web.api.views.directory.api_directory """ objtype = "simple" # ensure the priority is lesser than the base FunctionDocumenter # to avoid side effects with autodoc processing priority = -1 # do not indent the content content_indent = "" # do not add a header to the docstring def add_directive_header(self, sig): pass # sphinx event handler to set adequate django settings prior reading # apidoc generated rst files when building doc to avoid autodoc errors def set_django_settings(app, env, docname): if any([pattern in app.srcdir for pattern in ("swh-web-client", "DWCLI")]): # swh-web-client is detected as swh-web by the code below but # django is not installed when building standalone swh-web-client doc return package_settings = { "auth": "swh.auth.tests.django.app.apptest.settings", "deposit": "swh.deposit.settings.development", "web": "swh.web.settings.development", } for package, settings in package_settings.items(): if any( [pattern in docname for pattern in (f"swh.{package}", f"swh-{package}")] ): force_django_settings(settings) # when building local package documentation with tox, insert glossary # content at the end of the index file in order to resolve references # to the terms it contains def add_glossary_to_index(app, docname, source): if docname == "index": glossary_path = os.path.join( os.path.dirname(__file__), "../../../docs/glossary.rst" ) with open(glossary_path, "r") as glossary: source[0] += "\n" + glossary.read() def setup(app): # env-purge-doc event is fired before source-read app.connect("env-purge-doc", set_django_settings) # add autosimple directive (used in swh-web) app.add_autodocumenter(SimpleDocumenter) # set an environment variable indicating we are currently building # the documentation os.environ["SWH_DOC_BUILD"] = "1" logger = logging.getLogger("sphinx") if swh_package_doc_tox_build: # ensure glossary will be available in package doc scope app.connect("source-read", add_glossary_to_index) # suppress some httpdomain warnings in non web packages if not any([pattern in app.srcdir for pattern in ("swh-web", "DWAPPS")]): # filter out httpdomain unresolved reference warnings # to not consider them as errors when using -W option of sphinx-build class HttpDomainRefWarningFilter(logging.Filter): def filter(self, record: logging.LogRecord) -> bool: return 
not record.msg.startswith("Cannot resolve reference to") # insert a custom filter in the warning log handler of sphinx logger.handlers[1].filters.insert(0, HttpDomainRefWarningFilter())

diff --git a/sysadm/mirror-operations/index.rst b/sysadm/mirror-operations/index.rst
index 2baa555..7039fdf 100644
--- a/sysadm/mirror-operations/index.rst
+++ b/sysadm/mirror-operations/index.rst
@@ -1,132 +1,137 @@
.. _mirror_operations:

Mirror Operations
=================

.. _mirror:

Description
-----------

A mirror is a full copy of the |swh| archive, operated independently from the Software Heritage initiative.

A minimal mirror consists of two parts:

- the graph storage (typically an instance of :ref:`swh.storage `), which contains the Merkle DAG structure of the archive, *except* the actual content of source code files (AKA blobs),

- the object storage (typically an instance of :ref:`swh.objstorage `), which contains all the blobs corresponding to archived source code files.

However, a usable mirror also needs to be accessible by others. As such, a proper mirror should also make it possible to:

- navigate the archive copy using a Web browser and/or the Web API (typically using :ref:`the web application `),

- retrieve data from the copy of the archive (typically using :ref:`the vault service `)

A mirror is initially populated and kept up to date by consuming data from the |swh| Kafka-based :ref:`journal ` and retrieving the blob objects (file content) from the |swh| :ref:`object storage `.

.. note:: It is not required that a mirror be deployed using the |swh| software stack. Other technologies, including different storage methods, can be used. But this documentation focuses on the case of a mirror deployed using the |swh| software stack.

.. thumbnail:: ../images/mirror-architecture.svg

   General view of the |swh| mirroring architecture.

+See :ref:`planning-a-mirror` for a complete description of the requirements for hosting a mirror.
+
+
Mirroring the Graph Storage
~~~~~~~~~~~~~~~~~~~~~~~~~~~

The replication of the graph is based on a journal using Kafka_ as the event streaming platform.

On the Software Heritage side, every addition made to the archive consists of the addition of a :ref:`data-model` object. The new object is also serialized as a msgpack_ bytestring which is used as the value of a message added to a Kafka topic dedicated to the object type.

The main Kafka topics for the |swh| :ref:`data-model` are:

- `swh.journal.objects.content`
- `swh.journal.objects.directory`
- `swh.journal.objects.extid`
- `swh.journal.objects.metadata_authority`
- `swh.journal.objects.metadata_fetcher`
- `swh.journal.objects.origin_visit_status`
- `swh.journal.objects.origin_visit`
- `swh.journal.objects.origin`
- `swh.journal.objects.raw_extrinsic_metadata`
- `swh.journal.objects.release`
- `swh.journal.objects.revision`
- `swh.journal.objects.skipped_content`
- `swh.journal.objects.snapshot`

In order to set up a mirror of the graph, one needs to deploy a stack capable of retrieving all these topics and storing their content reliably. For example, a Kafka cluster configured as a replica of the main Kafka broker hosted by |swh| would do the job (albeit not in a very useful manner by itself).

A more useful mirror can be set up using the :ref:`storage ` component with the help of the special service named `replayer` provided by the :mod:`swh.storage.replay` module.

.. TODO: replace this previous link by a link to the 'swh storage replay' command once available, and ideally once https://github.com/sphinx-doc/sphinx/issues/880 is fixed
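To make the replication step more concrete, here is a minimal sketch of what a graph replayer does conceptually: it consumes the `swh.journal.objects.*` topics and msgpack-decodes every message before handing the object to the local storage. This illustration uses the generic `confluent-kafka` client rather than the actual :mod:`swh.storage.replay` implementation; the broker address and consumer group are placeholders, not part of the |swh| stack.

.. code-block:: python

   import msgpack
   from confluent_kafka import Consumer

   TOPICS = [
       "swh.journal.objects.origin",
       "swh.journal.objects.revision",
       "swh.journal.objects.snapshot",
       # ... one topic per object type listed above
   ]

   consumer = Consumer(
       {
           "bootstrap.servers": "broker.mirror.example.org:9092",  # placeholder
           "group.id": "mirror-graph-replayer",  # placeholder
           "auto.offset.reset": "earliest",
       }
   )
   consumer.subscribe(TOPICS)

   while True:
       message = consumer.poll(1.0)
       if message is None or message.error():
           continue
       # every message value is a msgpack-serialized data model object
       obj = msgpack.unpackb(message.value(), raw=False)
       # a real replayer would hand obj over to the local graph storage
       print(f"received {message.topic()}: {obj}")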
Mirroring the Object Storage
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

File contents (blobs) are *not* directly stored in messages of the `swh.journal.objects.content` Kafka topic, which only contains metadata about them, such as various kinds of cryptographic hashes.

A separate component is in charge of replicating blob objects from the archive and storing them in the local object storage instance. A separate `swh-journal` client should subscribe to the `swh.journal.objects.content` topic to get the stream of blob object identifiers, then retrieve the corresponding blobs from the main Software Heritage object storage, and store them in the local object storage.

A reference implementation for this component is available in :ref:`content replayer `.

Installation
------------

When using the |swh| software stack to deploy a mirror, a number of |swh| software components must be installed (cf. architecture diagram above).

A `docker-swarm `_ based deployment solution is provided as a working example of the mirror stack; see :ref:`mirror_deploy`. It is strongly recommended to start from there before planning a production-like deployment.

.. _Kafka: https://kafka.apache.org/
.. _msgpack: https://msgpack.org

-You may also want to read:
+You may want to read:

- :ref:`mirror_monitor` to learn how to monitor your mirror and how to report its health back to |swh|.
- :ref:`mirror_onboard` for the |swh|-side view of adding a new mirror.

.. toctree::
   :hidden:

+   planning
   deploy
   onboard
   monitor

diff --git a/sysadm/mirror-operations/planning.rst b/sysadm/mirror-operations/planning.rst
new file mode 100644
index 0000000..70013b6
--- /dev/null
+++ b/sysadm/mirror-operations/planning.rst
@@ -0,0 +1,187 @@
+.. _planning-a-mirror:
+
+Hosting a mirror
+================
+
+This section presents and discusses the technical requirements for hosting a |SWH| mirror.
+
+There are many different options for hosting a mirror, but a common set of overall requirements needs to be fulfilled in every case.
+
+Namely, hosting a mirror requires:
+
+- a dedicated infrastructure with enough computing power and storage
+- enough network bandwidth (both ingress and egress)
+- good IT tooling (supervision, alerting).
+
+The mirror operator is not required to run the Software Heritage `full software stack `_, although it is possible to do so.
+
+.. Warning::
+
+   Volumes given in this section are estimates based on numbers from **January 2022**.
+
+
+The global raw hardware requirements are:
+
+- a database system for the main storage of the archive (the graph structure); the current volume is about 17TB, with an increase rate of about 280GB/month,
+- an object storage system for the objects (archived software source code files); the current volume is about 800TB, with an increase rate of about 21TB/month,
+- an elasticsearch engine; the current main index is about 180M entries (origins) for an index size of 360GB; the increase rate is about 2M entries/month,
+- a web/application server for the main web application and public API,
+- a few compute nodes for the application services.
+
+
+A mirror operator should provision machines or cloud-based resources with these numbers in mind. This should include the usual robustness margins (RAID-like storage, replication, backups, etc.).
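As a rough illustration of how these figures translate into a provisioning target, the short computation below extrapolates the January 2022 volumes linearly over a three-year horizon. Both the linear-growth assumption and the horizon are arbitrary editorial choices, not recommendations from the Software Heritage project.

.. code-block:: python

   # January 2022 figures quoted above, extrapolated linearly (an assumption).
   MONTHS = 36  # arbitrary three-year provisioning horizon

   db_tb = 17 + 0.280 * MONTHS          # graph storage database, in TB
   objstorage_tb = 800 + 21 * MONTHS    # object storage (blobs), in TB
   search_entries_m = 180 + 2 * MONTHS  # search index entries, in millions

   print(f"database:     ~{db_tb:.0f} TB")                    # ~27 TB
   print(f"objstorage:   ~{objstorage_tb:.0f} TB")            # ~1556 TB, i.e. ~1.5 PB
   print(f"search index: ~{search_entries_m:.0f}M origins")   # ~252M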
+
+General hardware requirements
+-----------------------------
+
+When deploying a mirror based on the Software Heritage software stack, one will need:
+
+
+Core services
+^^^^^^^^^^^^^
+
+- a database for the storage; this can be either a `Postgresql `_ database (single machine) or a `Cassandra `_ cluster (at least 3 nodes),
+- an object storage system; this can be any :py:mod:`supported backend ` -- a public cloud-based object storage (e.g. S3), any supported private object storage, an ad-hoc filesystem storage system, etc.,
+- an `elasticsearch `_ instance,
+- a few nodes for backend applications (:py:mod:`swh-storage `, :py:mod:`swh-objstorage `),
+- the web frontend (:py:mod:`swh-web `) serving the main web app and the `public API `_.
+
+
+Replaying services
+^^^^^^^^^^^^^^^^^^
+
+- `graph replayers `_ as mirroring workers (increase parallelism to increase speed),
+- `content replayers `_ as mirroring workers (likewise).
+
+
+Vault service
+^^^^^^^^^^^^^
+
+- a node for the :ref:`swh-vault ` backend service,
+- a node for the :ref:`swh-vault ` worker service
+
+
+Sizing a mirror infrastructure
+------------------------------
+
+.. Note:: Solutions with a star (*) in the tables below are still under test or validation.
+
+Common components
+^^^^^^^^^^^^^^^^^
+
+================ ====================== ========= ===== ============== ==============
+SWH Service      Tool                   Instances RAM   Storage Type   Storage Volume
+================ ====================== ========= ===== ============== ==============
+storage          swh-storage            16        16GB  regular        10GB
+search           elasticsearch          3         32GB  fast / zfs     6TB
+web              swh-web                1         32GB  regular        100GB
+---------------- ---------------------- --------- ----- -------------- --------------
+graph replayer   swh-storage            32        4GB   regular        10GB
+content replayer swh-obstorage-replayer 32        4GB   regular        10GB
+replayer         redis                  1         8GB   regular        100GB
+---------------- ---------------------- --------- ----- -------------- --------------
+vault            swh-vault              1         4GB   regular        10GB
+vault worker     swh-vault              1         16GB  fast           1TB
+vault            rabbitmq               1         8GB   regular        10GB
+================ ====================== ========= ===== ============== ==============
+
+
+Storage backend
+^^^^^^^^^^^^^^^
+
+.. tabbed:: Postgresql
+
+   ================ ====================== ========= ===== ============== ==============
+   SWH Service      Tool                   Instances RAM   Storage Type   Storage Volume
+   ================ ====================== ========= ===== ============== ==============
+   storage          postgresql             1         512GB fast+zfs (lz4) 40TB
+   ================ ====================== ========= ===== ============== ==============
+
+.. tabbed:: Cassandra (min.)*
+
+   ================ ====================== ========= ===== ============== ==============
+   SWH Service      Tool                   Instances RAM   Storage Type   Storage Volume
+   ================ ====================== ========= ===== ============== ==============
+   storage          cassandra              3         32GB  fast           30TB
+   ================ ====================== ========= ===== ============== ==============
+
+.. tabbed:: Cassandra (typ.)*
+
+   ================ ====================== ========= ===== ============== ==============
+   SWH Service      Tool                   Instances RAM   Storage Type   Storage Volume
+   ================ ====================== ========= ===== ============== ==============
+   storage          cassandra              6+        32GB  fast           20TB
+   ================ ====================== ========= ===== ============== ==============
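Whichever backend is chosen above, the choice surfaces in the configuration of the swh-storage instances. The fragment below is only a hypothetical sketch of that configuration through the swh.storage factory; the ``cls`` values, connection string and parameter names are assumptions to double-check against the swh.storage documentation, not a verified configuration.

.. code-block:: python

   from swh.storage import get_storage

   # Hypothetical sketch only: parameter names and ``cls`` values should be
   # checked against the swh.storage factory documentation for your release.
   storage = get_storage(
       cls="postgresql",  # or "cassandra" for a Cassandra cluster
       db="host=db.mirror.example.org dbname=swh user=swh",  # placeholder DSN
       objstorage={"cls": "memory"},  # stand-in; a mirror points this at its
                                      # real object storage backend
   )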
+Objstorage backend
+^^^^^^^^^^^^^^^^^^
+
+.. tabbed:: FS
+
+   ================ ====================== ========= ===== ============== ==============
+   SWH Service      Tool                   Instances RAM   Storage Type   Storage Volume
+   ================ ====================== ========= ===== ============== ==============
+   objstorage       swh-objstorage         1 [#f1]_  512GB zfs (with lz4) 1PB
+   ================ ====================== ========= ===== ============== ==============
+
+.. tabbed:: Winery - Ceph*
+
+   ================ ====================== ========= ===== ============== ==============
+   SWH Service      Tool                   Instances RAM   Storage Type   Storage Volume
+   ================ ====================== ========= ===== ============== ==============
+   objstorage       swh-objstorage         2 [#f2]_  32GB  standard       100GB
+   winery-db        postgresql             2 [#f2]_  512GB fast           10TB
+   ceph-mon         ceph                   3         4GB   fast           60GB
+   ceph-osd         ceph                   12+       4GB   mix fast+HDD   1PB (total)
+   ================ ====================== ========= ===== ============== ==============
+
+.. tabbed:: Seaweedfs*
+
+   ================ ====================== ========= ===== ============== ==============
+   SWH Service      Tool                   Instances RAM   Storage Type   Storage Volume
+   ================ ====================== ========= ===== ============== ==============
+   objstorage       swh-objstorage         3         32GB  standard       100GB
+   seaweed LB       nginx                  1         32GB  fast           100GB
+   seaweed-master   seaweedfs              3         8GB   standard       10GB
+   seaweed-filer    seaweedfs              3         32GB  fast           1TB
+   seaweed-volume   seaweedfs              3+        32GB  standard       1PB (total)
+   ================ ====================== ========= ===== ============== ==============
+
+.. rubric:: Notes
+
+.. [#f1] An swh-objstorage using the :py:mod:`simple filesystem ` backend can actually be split across several machines using the :py:mod:`swh.objstorage.multiplexer` backend.
+.. [#f2] The swh-objstorage RPC service and the index database can be hosted on the same machine.
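Footnote [#f1] mentions that the plain filesystem backend can be spread over several machines through the multiplexer backend. The snippet below sketches such a setup via the swh.objstorage factory; the paths, the slicing scheme and the exact configuration keys (``objstorages`` in particular) are assumptions to verify against the swh.objstorage documentation.

.. code-block:: python

   from swh.objstorage.factory import get_objstorage

   # Hypothetical sketch: a multiplexer in front of two filesystem (pathslicing)
   # object storages hosted on different mounts/machines. Key names, paths and
   # slicing values are placeholders to verify against swh.objstorage.
   objstorage = get_objstorage(
       cls="multiplexer",
       objstorages=[
           {"cls": "pathslicing", "root": "/srv/objects-a", "slicing": "0:2/2:4"},
           {"cls": "pathslicing", "root": "/srv/objects-b", "slicing": "0:2/2:4"},
       ],
   )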