diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2665767..a7dadbc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,49 +1,49 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.4.0 + rev: v4.3.0 hooks: - id: trailing-whitespace - id: check-json - id: check-yaml -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 +- repo: https://github.com/pycqa/flake8 + rev: 5.0.4 hooks: - id: flake8 - repo: https://github.com/codespell-project/codespell - rev: v1.16.0 + rev: v2.2.2 hooks: - id: codespell args: [-L mor] - repo: local hooks: - id: mypy name: mypy entry: mypy args: [swh] pass_filenames: false language: system types: [python] # unfortunately, we are far from being able to enable this... # - repo: https://github.com/PyCQA/pydocstyle.git # rev: 4.0.0 # hooks: # - id: pydocstyle # name: pydocstyle # description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. # entry: pydocstyle --convention=google # language: python # types: [python] - repo: https://github.com/PyCQA/isort - rev: 5.5.2 + rev: 5.10.1 hooks: - id: isort - repo: https://github.com/python/black - rev: 22.3.0 + rev: 22.10.0 hooks: - id: black diff --git a/PKG-INFO b/PKG-INFO index 33a2bcf..2f9d457 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,65 +1,65 @@ Metadata-Version: 2.1 Name: swh.scrubber -Version: 0.1.1 +Version: 0.1.2 Summary: Software Heritage Datastore Scrubber Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scrubber Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scrubber/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Datastore Scrubber ====================================== Tools to periodically checks data integrity in swh-storage and swh-objstorage, reports errors, and (try to) fix them. This is a work in progress; some of the components described below do not exist yet (cassandra storage checker, objstorage checker, recovery, and reinjection) The Scrubber package is made of the following parts: Checking -------- Highly parallel processes continuously read objects from a data store, compute checksums, and write any failure in a database, along with the data of the corrupt object. There is one "checker" for each datastore package: storage (postgresql and cassandra), journal (kafka), and objstorage. The journal is "crawled" using its native streaming; others are crawled by range, reusing swh-storage's backfiller utilities, and checkpointed from time to time to the scrubber's database (in the ``checked_range`` table). 
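As a rough sketch of the checkpointed range-checking loop described above (simplified for illustration; ``fetch_objects`` and ``serialize`` are hypothetical helpers, while the ``ScrubberDb`` methods shown are the ones used by ``storage_checker.py`` later in this diff)::

    import datetime

    def scrub_ranges(scrubber_db, datastore, ranges):
        # One checker pass: skip ranges already recorded in checked_range,
        # recompute checksums, record corrupt objects, checkpoint progress.
        for range_start_swhid, range_end_swhid in ranges:
            started_at = datetime.datetime.now(tz=datetime.timezone.utc)
            if scrubber_db.checked_range_get_last_date(
                datastore, range_start_swhid, range_end_swhid
            ):
                continue  # this range was already checked at some point; skip it
            for obj in fetch_objects(range_start_swhid, range_end_swhid):  # hypothetical helper
                if obj.compute_hash() != obj.id:  # checksum mismatch => corrupt object
                    scrubber_db.corrupt_object_add(
                        obj.swhid(), datastore, serialize(obj)  # serialize() is hypothetical
                    )
            # checkpoint the range so a later run does not re-check it
            scrubber_db.checked_range_upsert(
                datastore, range_start_swhid, range_end_swhid, started_at
            )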
Recovery -------- Then, from time to time, jobs go through the list of known corrupt objects, and try to recover the original objects, through various means: * Brute-forcing variations until they match their checksum * Recovering from another data store * As a last resort, recovering from known origins, if any Reinjection ----------- Finally, when an original object is recovered, it is reinjected in the original data store, replacing the corrupt one. diff --git a/debian/changelog b/debian/changelog index d23593f..60eccd9 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,94 +1,100 @@ -swh-scrubber (0.1.1-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-scrubber (0.1.2-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 0.1.2 - (tagged by Valentin Lorentz + on 2022-12-20 10:42:49 +0100) + * Upstream changes: - v0.1.2 - * docs: Include module indices + only when building standalone package doc - * sql: Fix typos + detected by codespell - * pre-commit, tox: Bump pre-commit, + codespell, black and flake8 - * storage_checker: Retry on + postgresql errors from swh-storage - -- Software Heritage autobuilder (on jenkins-debian1) Mon, 17 Oct 2022 13:12:46 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Tue, 20 Dec 2022 09:46:50 +0000 swh-scrubber (0.1.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.1 - (tagged by Valentin Lorentz on 2022-10-17 15:05:50 +0200) * Upstream changes: - v0.1.1 - * Add checkpointing on storage_checker to avoid rechecking objects at - the beginning of ranges again and again - This release contains a database migration, that must be applied before - restarting workers. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 17 Oct 2022 13:10:49 +0000 swh-scrubber (0.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.0 - (tagged by Nicolas Dandrimont on 2022-08-18 17:01:12 +0200) * Upstream changes: - Release swh.scrubber 0.1.0 - Add missing_object-related tables - Add metrics and reduce logging for production purposes -- Software Heritage autobuilder (on jenkins-debian1) Thu, 18 Aug 2022 15:05:14 +0000 swh-scrubber (0.0.6-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.6 - (tagged by Antoine R. Dumont (@ardumont) on 2022-05-31 11:15:58 +0200) * Upstream changes: - v0.0.6 - Wrap queries in transaction -- Software Heritage autobuilder (on jenkins-debian1) Tue, 31 May 2022 09:20:56 +0000 swh-scrubber (0.0.5-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.5 - (tagged by Antoine R. Dumont (@ardumont) on 2022-05-30 17:55:26 +0200) * Upstream changes: - v0.0.5 - Reference the scrubber db model upgrade (from version 1 to 2) -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 16:02:37 +0000 swh-scrubber (0.0.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.4 - (tagged by Antoine R. Dumont (@ardumont) on 2022-05-30 17:35:51 +0200) * Upstream changes: - v0.0.4 - Recursive include the swh/scrubber/sql folder -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 15:40:57 +0000 swh-scrubber (0.0.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.3 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-05-30 15:45:51 +0200) * Upstream changes: - v0.0.3 - Unify factory to use keyword 'postgresql' over deprecated 'local' - db: Bump to version 2 - requirements: Add missing dependency -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 13:49:46 +0000 swh-scrubber (0.0.2-1~swh2) unstable-swh; urgency=medium * Update dependencies and bump new release -- Antoine R. Dumont (@ardumont) Mon, 30 May 2022 14:25:52 +0200 swh-scrubber (0.0.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-05-30 10:15:15 +0200) * Upstream changes: - v0.0.2 - Add Fixer class, which re-loads corrupt objects from origins - Fix crash when using datastore_get_or_add for an existing datastore - Internals - -------- - requirements-test: Remove pytest pinning to < 7 - add strict asyncio_mode in pytest.ini - Bump mypy to v0.942 - Add .git-blame-ignore-revs file with automatic reformatting commits - python: Reformat code with black 22.3.0 - pre-commit, tox: Bump black from 19.10b0 to 22.3.0 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 08:19:10 +0000 swh-scrubber (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release -- Nicolas Dandrimont Thu, 31 Mar 2022 19:29:54 +0200 diff --git a/docs/index.rst b/docs/index.rst index f920c5a..1d6af76 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,15 +1,17 @@ .. _swh-scrubber: .. include:: README.rst .. toctree:: :maxdepth: 2 :caption: Contents: -Indices and tables ------------------- +.. only:: standalone_package_doc -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` + Indices and tables + ------------------ + + * :ref:`genindex` + * :ref:`modindex` + * :ref:`search` diff --git a/requirements.txt b/requirements.txt index e35fb35..a5f4288 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html dulwich +psycopg2 +tenacity diff --git a/swh.scrubber.egg-info/PKG-INFO b/swh.scrubber.egg-info/PKG-INFO index 33a2bcf..2f9d457 100644 --- a/swh.scrubber.egg-info/PKG-INFO +++ b/swh.scrubber.egg-info/PKG-INFO @@ -1,65 +1,65 @@ Metadata-Version: 2.1 Name: swh.scrubber -Version: 0.1.1 +Version: 0.1.2 Summary: Software Heritage Datastore Scrubber Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scrubber Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scrubber/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Datastore Scrubber ====================================== Tools to periodically checks data integrity in swh-storage and swh-objstorage, reports errors, and (try to) fix them. 
This is a work in progress; some of the components described below do not exist yet (cassandra storage checker, objstorage checker, recovery, and reinjection) The Scrubber package is made of the following parts: Checking -------- Highly parallel processes continuously read objects from a data store, compute checksums, and write any failure in a database, along with the data of the corrupt object. There is one "checker" for each datastore package: storage (postgresql and cassandra), journal (kafka), and objstorage. The journal is "crawled" using its native streaming; others are crawled by range, reusing swh-storage's backfiller utilities, and checkpointed from time to time to the scrubber's database (in the ``checked_range`` table). Recovery -------- Then, from time to time, jobs go through the list of known corrupt objects, and try to recover the original objects, through various means: * Brute-forcing variations until they match their checksum * Recovering from another data store * As a last resort, recovering from known origins, if any Reinjection ----------- Finally, when an original object is recovered, it is reinjected in the original data store, replacing the corrupt one. diff --git a/swh.scrubber.egg-info/requires.txt b/swh.scrubber.egg-info/requires.txt index a52c064..7698c3a 100644 --- a/swh.scrubber.egg-info/requires.txt +++ b/swh.scrubber.egg-info/requires.txt @@ -1,13 +1,15 @@ dulwich +psycopg2 +tenacity swh.core[http]>=0.3 swh.loader.git>=1.4.0 swh.model>=5.0.0 swh.storage>=1.1.0 swh.journal>=0.9.0 [testing] pytest pytest-mock pyyaml swh.graph types-pyyaml diff --git a/swh/scrubber/sql/30-schema.sql b/swh/scrubber/sql/30-schema.sql index b28ea3c..9ef813a 100644 --- a/swh/scrubber/sql/30-schema.sql +++ b/swh/scrubber/sql/30-schema.sql @@ -1,107 +1,107 @@ ------------------------------------- -- Shared definitions ------------------------------------- create domain swhid as text check (value ~ '^swh:[0-9]+:.*'); create table datastore ( id bigserial not null, package datastore_type not null, class text, instance text ); comment on table datastore is 'Each row identifies a data store being scrubbed'; comment on column datastore.id is 'Internal identifier of the datastore'; comment on column datastore.package is 'Name of the component using this datastore (storage/journal/objstorage)'; comment on column datastore.class is 'For datastores with multiple backends, name of the backend (postgresql/cassandra for storage, kafka for journal, pathslicer/azure/winery/... for objstorage)'; comment on column datastore.instance is 'Human-readable way to uniquely identify the datastore; eg. 
its URL or DSN.'; ------------------------------------- -- Checkpointing/progress tracking ------------------------------------- create table checked_range ( datastore int not null, range_start swhid not null, range_end swhid not null, last_date timestamptz not null ); -comment on table checked_range is 'Each row represents a range of objects in a datastore that were fetched, checksumed, and checked at some point in the past.'; +comment on table checked_range is 'Each row represents a range of objects in a datastore that were fetched, checksummed, and checked at some point in the past.'; comment on column checked_range.range_start is 'First SWHID of the range that was checked (inclusive, possibly non-existent).'; comment on column checked_range.range_end is 'Last SWHID of the range that was checked (inclusive, possibly non-existent).'; comment on column checked_range.last_date is 'Date the last scrub of that range *started*.'; ------------------------------------- -- Inventory of objects with issues ------------------------------------- create table corrupt_object ( id swhid not null, datastore int not null, object bytea not null, first_occurrence timestamptz not null default now() ); comment on table corrupt_object is 'Each row identifies an object that was found to be corrupt'; comment on column corrupt_object.datastore is 'Datastore the corrupt object was found in.'; comment on column corrupt_object.object is 'Corrupt object, as found in the datastore (possibly msgpack-encoded, using the journal''s serializer)'; comment on column corrupt_object.first_occurrence is 'Moment the object was found to be corrupt for the first time'; create table missing_object ( id swhid not null, datastore int not null, first_occurrence timestamptz not null default now() ); comment on table missing_object is 'Each row identifies an object that is missing but referenced by another object (aka "holes")'; comment on column missing_object.datastore is 'Datastore where the hole is.'; comment on column missing_object.first_occurrence is 'Moment the object was found to be missing for the first time'; create table missing_object_reference ( missing_id swhid not null, reference_id swhid not null, datastore int not null, first_occurrence timestamptz not null default now() ); comment on table missing_object_reference is 'Each row identifies an object that points to an object that does not exist (aka a "hole")'; comment on column missing_object_reference.missing_id is 'SWHID of the missing object.'; comment on column missing_object_reference.reference_id is 'SWHID of the object referencing the missing object.'; comment on column missing_object_reference.datastore is 'Datastore where the referencing object is.'; comment on column missing_object_reference.first_occurrence is 'Moment the object was found to reference a missing object'; ------------------------------------- -- Issue resolution ------------------------------------- create table object_origin ( object_id swhid not null, origin_url text not null, last_attempt timestamptz -- NULL if not tried yet ); comment on table object_origin is 'Maps objects to origins they might be found in.'; create table fixed_object ( id swhid not null, object bytea not null, method text, recovery_date timestamptz not null default now() ); comment on table fixed_object is 'Each row identifies an object that was found to be corrupt, along with the original version of the object'; comment on column fixed_object.object is 'The recovered object itself, as a msgpack-encoded dict'; comment
on column fixed_object.recovery_date is 'Moment the object was recovered.'; comment on column fixed_object.method is 'How the object was recovered. For example: "from_origin", "negative_utc", "capitalized_revision_parent".'; diff --git a/swh/scrubber/sql/upgrades/4.sql b/swh/scrubber/sql/upgrades/4.sql index 9dc7e2f..026ba49 100644 --- a/swh/scrubber/sql/upgrades/4.sql +++ b/swh/scrubber/sql/upgrades/4.sql @@ -1,21 +1,21 @@ -- SWH Scrubber DB schema upgrade -- from_version: 3 -- to_version: 4 -- description: Add checked_range create table checked_range ( datastore int not null, range_start swhid not null, range_end swhid not null, last_date timestamptz not null ); -comment on table checked_range is 'Each row represents a range of objects in a datastore that were fetched, checksumed, and checked at some point in the past.'; +comment on table checked_range is 'Each row represents a range of objects in a datastore that were fetched, checksummed, and checked at some point in the past.'; comment on column checked_range.range_start is 'First SWHID of the range that was checked (inclusive, possibly non-existent).'; comment on column checked_range.range_end is 'Last SWHID of the range that was checked (inclusive, possibly non-existent).'; comment on column checked_range.last_date is 'Date the last scrub of that range *started*.'; create unique index concurrently checked_range_pkey on checked_range(datastore, range_start, range_end); alter table checked_range add primary key using index checked_range_pkey; diff --git a/swh/scrubber/storage_checker.py b/swh/scrubber/storage_checker.py index c29a903..69ef8a3 100644 --- a/swh/scrubber/storage_checker.py +++ b/swh/scrubber/storage_checker.py @@ -1,351 +1,367 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Reads all objects in a swh-storage instance and recomputes their checksums.""" import collections import contextlib import dataclasses import datetime import logging from typing import Iterable, Optional, Tuple, Union +import psycopg2 +import tenacity + from swh.core.statsd import Statsd from swh.journal.serializers import value_to_kafka from swh.model import swhids from swh.model.model import ( Content, Directory, ObjectType, Release, Revision, Snapshot, TargetType, ) from swh.storage import backfill from swh.storage.interface import StorageInterface from swh.storage.postgresql.storage import Storage as PostgresqlStorage from .db import Datastore, ScrubberDb logger = logging.getLogger(__name__) ScrubbableObject = Union[Revision, Release, Snapshot, Directory, Content] @contextlib.contextmanager def storage_db(storage): db = storage.get_db() try: yield db finally: storage.put_db(db) def _get_inclusive_range_swhids( inclusive_range_start: Optional[bytes], exclusive_range_end: Optional[bytes], object_type: swhids.ObjectType, ) -> Tuple[swhids.CoreSWHID, swhids.CoreSWHID]: r""" Given a ``[range_start, range_end)`` right-open interval of id prefixes and an object type (as returned by :const:`swh.storage.backfill.RANGE_GENERATORS`), returns a ``[range_start_swhid, range_end_swhid]`` closed interval of SWHIDs suitable for the scrubber database.
>>> _get_inclusive_range_swhids(b"\x42", None, swhids.ObjectType.SNAPSHOT) (CoreSWHID.from_string('swh:1:snp:4200000000000000000000000000000000000000'), CoreSWHID.from_string('swh:1:snp:ffffffffffffffffffffffffffffffffffffffff')) >>> _get_inclusive_range_swhids(b"\x00", b"\x12\x34", swhids.ObjectType.REVISION) (CoreSWHID.from_string('swh:1:rev:0000000000000000000000000000000000000000'), CoreSWHID.from_string('swh:1:rev:1233ffffffffffffffffffffffffffffffffffff')) """ # noqa range_start_swhid = swhids.CoreSWHID( object_type=object_type, object_id=(inclusive_range_start or b"").ljust(20, b"\00"), ) if exclusive_range_end is None: inclusive_range_end = b"\xff" * 20 else: # convert "1230000000..." to "122fffffff..." inclusive_range_end = ( int.from_bytes(exclusive_range_end.ljust(20, b"\x00"), "big") - 1 ).to_bytes(20, "big") range_end_swhid = swhids.CoreSWHID( object_type=object_type, object_id=inclusive_range_end, ) return (range_start_swhid, range_end_swhid) @dataclasses.dataclass class StorageChecker: """Reads a chunk of a swh-storage database, recomputes checksums, and reports errors in a separate database.""" db: ScrubberDb storage: StorageInterface object_type: str """``directory``/``revision``/``release``/``snapshot``""" start_object: str """minimum value of the hexdigest of the object's sha1.""" end_object: str """maximum value of the hexdigest of the object's sha1.""" _datastore = None _statsd = None def datastore_info(self) -> Datastore: """Returns a :class:`Datastore` instance representing the swh-storage instance being checked.""" if self._datastore is None: if isinstance(self.storage, PostgresqlStorage): with storage_db(self.storage) as db: self._datastore = Datastore( package="storage", cls="postgresql", instance=db.conn.dsn, ) else: raise NotImplementedError( f"StorageChecker(storage={self.storage!r}).datastore()" ) return self._datastore def statsd(self) -> Statsd: if self._statsd is None: self._statsd = Statsd( namespace="swh_scrubber", constant_tags={"object_type": self.object_type}, ) return self._statsd def run(self): """Runs on all objects of ``object_type`` and with id between ``start_object`` and ``end_object``. """ if isinstance(self.storage, PostgresqlStorage): - with storage_db(self.storage) as db: - return self._check_postgresql(db) + return self._check_postgresql() else: raise NotImplementedError( f"StorageChecker(storage={self.storage!r}).check_storage()" ) - def _check_postgresql(self, db): + def _check_postgresql(self): object_type = getattr(swhids.ObjectType, self.object_type.upper()) for range_start, range_end in backfill.RANGE_GENERATORS[self.object_type]( self.start_object, self.end_object ): (range_start_swhid, range_end_swhid) = _get_inclusive_range_swhids( range_start, range_end, object_type ) start_time = datetime.datetime.now(tz=datetime.timezone.utc) # Currently, this matches range boundaries exactly, with no regard for # ranges that contain or are contained by it. last_check_time = self.db.checked_range_get_last_date( self.datastore_info(), range_start_swhid, range_end_swhid, ) if last_check_time is not None: # TODO: re-check if 'last_check_time' was long ago.
logger.debug( "Skipping processing of %s range %s to %s: already done at %s", self.object_type, backfill._format_range_bound(range_start), backfill._format_range_bound(range_end), last_check_time, ) continue logger.debug( "Processing %s range %s to %s", self.object_type, backfill._format_range_bound(range_start), backfill._format_range_bound(range_end), ) + self._check_postgresql_range(object_type, range_start, range_end) + + self.db.checked_range_upsert( + self.datastore_info(), + range_start_swhid, + range_end_swhid, + start_time, + ) + + @tenacity.retry( + retry=tenacity.retry_if_exception_type(psycopg2.OperationalError), + wait=tenacity.wait_random_exponential(min=10, max=180), + ) + def _check_postgresql_range( + self, object_type: swhids.ObjectType, range_start, range_end + ) -> None: + assert isinstance( + self.storage, PostgresqlStorage + ), f"_check_postgresql_range called with self.storage={self.storage!r}" + + with storage_db(self.storage) as db: objects = backfill.fetch( db, self.object_type, start=range_start, end=range_end ) objects = list(objects) with self.statsd().timed( "batch_duration_seconds", tags={"operation": "check_hashes"} ): self.check_object_hashes(objects) with self.statsd().timed( "batch_duration_seconds", tags={"operation": "check_references"} ): self.check_object_references(objects) - self.db.checked_range_upsert( - self.datastore_info(), - range_start_swhid, - range_end_swhid, - start_time, - ) - def check_object_hashes(self, objects: Iterable[ScrubbableObject]): """Recomputes hashes, and reports mismatches.""" count = 0 for object_ in objects: if isinstance(object_, Content): # TODO continue real_id = object_.compute_hash() count += 1 if object_.id != real_id: self.statsd().increment("hash_mismatch_total") self.db.corrupt_object_add( object_.swhid(), self.datastore_info(), value_to_kafka(object_.to_dict()), ) if count: self.statsd().increment("objects_hashed_total", count) def check_object_references(self, objects: Iterable[ScrubbableObject]): """Check that all objects referenced by these objects exist.""" cnt_references = collections.defaultdict(set) dir_references = collections.defaultdict(set) rev_references = collections.defaultdict(set) rel_references = collections.defaultdict(set) snp_references = collections.defaultdict(set) for object_ in objects: swhid = object_.swhid() if isinstance(object_, Content): pass elif isinstance(object_, Directory): for entry in object_.entries: if entry.type == "file": cnt_references[entry.target].add(swhid) elif entry.type == "dir": dir_references[entry.target].add(swhid) elif entry.type == "rev": # dir->rev holes are not considered a problem because they # happen whenever git submodules point to repositories that # were not loaded yet; ignore them pass else: assert False, entry elif isinstance(object_, Revision): dir_references[object_.directory].add(swhid) for parent in object_.parents: rev_references[parent].add(swhid) elif isinstance(object_, Release): if object_.target is None: pass elif object_.target_type == ObjectType.CONTENT: cnt_references[object_.target].add(swhid) elif object_.target_type == ObjectType.DIRECTORY: dir_references[object_.target].add(swhid) elif object_.target_type == ObjectType.REVISION: rev_references[object_.target].add(swhid) elif object_.target_type == ObjectType.RELEASE: rel_references[object_.target].add(swhid) else: assert False, object_ elif isinstance(object_, Snapshot): for branch in object_.branches.values(): if branch is None: pass elif branch.target_type == TargetType.ALIAS: pass elif
branch.target_type == TargetType.CONTENT: cnt_references[branch.target].add(swhid) elif branch.target_type == TargetType.DIRECTORY: dir_references[branch.target].add(swhid) elif branch.target_type == TargetType.REVISION: rev_references[branch.target].add(swhid) elif branch.target_type == TargetType.RELEASE: rel_references[branch.target].add(swhid) elif branch.target_type == TargetType.SNAPSHOT: snp_references[branch.target].add(swhid) else: assert False, (str(object_.swhid()), branch) else: assert False, object_.swhid() missing_cnts = set( self.storage.content_missing_per_sha1_git(list(cnt_references)) ) missing_dirs = set(self.storage.directory_missing(list(dir_references))) missing_revs = set(self.storage.revision_missing(list(rev_references))) missing_rels = set(self.storage.release_missing(list(rel_references))) missing_snps = set(self.storage.snapshot_missing(list(snp_references))) self.statsd().increment( "missing_object_total", len(missing_cnts), tags={"target_object_type": "content"}, ) self.statsd().increment( "missing_object_total", len(missing_dirs), tags={"target_object_type": "directory"}, ) self.statsd().increment( "missing_object_total", len(missing_revs), tags={"target_object_type": "revision"}, ) self.statsd().increment( "missing_object_total", len(missing_rels), tags={"target_object_type": "release"}, ) self.statsd().increment( "missing_object_total", len(missing_snps), tags={"target_object_type": "snapshot"}, ) for missing_id in missing_cnts: missing_swhid = swhids.CoreSWHID( object_type=swhids.ObjectType.CONTENT, object_id=missing_id ) self.db.missing_object_add( missing_swhid, cnt_references[missing_id], self.datastore_info() ) for missing_id in missing_dirs: missing_swhid = swhids.CoreSWHID( object_type=swhids.ObjectType.DIRECTORY, object_id=missing_id ) self.db.missing_object_add( missing_swhid, dir_references[missing_id], self.datastore_info() ) for missing_id in missing_revs: missing_swhid = swhids.CoreSWHID( object_type=swhids.ObjectType.REVISION, object_id=missing_id ) self.db.missing_object_add( missing_swhid, rev_references[missing_id], self.datastore_info() ) for missing_id in missing_rels: missing_swhid = swhids.CoreSWHID( object_type=swhids.ObjectType.RELEASE, object_id=missing_id ) self.db.missing_object_add( missing_swhid, rel_references[missing_id], self.datastore_info() ) for missing_id in missing_snps: missing_swhid = swhids.CoreSWHID( object_type=swhids.ObjectType.SNAPSHOT, object_id=missing_id ) self.db.missing_object_add( missing_swhid, snp_references[missing_id], self.datastore_info() ) diff --git a/tox.ini b/tox.ini index f17af83..c132c8b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,74 +1,75 @@ [tox] envlist=black,flake8,mypy,py3 [testenv] extras = testing deps = pytest-cov commands = pytest --doctest-modules \ {envsitepackagesdir}/swh/scrubber \ --cov={envsitepackagesdir}/swh/scrubber \ --cov-branch {posargs} [testenv:black] skip_install = true deps = - black==22.3.0 + black==22.10.0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = - flake8==4.0.1 - flake8-bugbear==22.3.23 + flake8==5.0.4 + flake8-bugbear==22.9.23 + pycodestyle==2.9.1 commands = {envpython} -m flake8 [testenv:mypy] extras = testing deps = mypy==0.942 commands = mypy swh # build documentation outside swh-environment using the current # git HEAD of swh-docs, is executed on CI for each diff to prevent # breaking doc build [testenv:sphinx] whitelist_externals = make usedevelop = true extras = testing deps = # fetch and install swh-docs in develop 
mode -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors SPHINXOPTS = -W commands = make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs # build documentation only inside swh-environment using local state # of swh-docs package [testenv:sphinx-dev] whitelist_externals = make usedevelop = true extras = testing deps = # install swh-docs in develop mode -e ../swh-docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors SPHINXOPTS = -W commands = make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs
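For reference, the retry behaviour added to ``storage_checker.py`` in this release can be reproduced in isolation: ``tenacity`` re-invokes the decorated function whenever it raises ``psycopg2.OperationalError``, waiting a randomized, exponentially growing delay (bounded between 10 and 180 seconds) between attempts, and, since no ``stop=`` argument is given, retries until the call succeeds. A minimal sketch of the same pattern (the ``count_checked_ranges`` function and its DSN argument are made up for illustration)::

    import psycopg2
    import tenacity

    @tenacity.retry(
        retry=tenacity.retry_if_exception_type(psycopg2.OperationalError),
        wait=tenacity.wait_random_exponential(min=10, max=180),
    )
    def count_checked_ranges(dsn: str) -> int:
        # Any OperationalError raised here (dropped connection, server
        # restart, ...) makes tenacity wait and call the function again.
        with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
            cur.execute("select count(*) from checked_range")
            return cur.fetchone()[0]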