diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -18,6 +18,7 @@ from dulwich.objects import ShaFile from dulwich.pack import PackData, PackInflater +from swh.core.statsd import statsd from swh.loader.core.loader import DVCSLoader from swh.loader.exception import NotFound from swh.model import hashutil @@ -36,6 +37,8 @@ from . import converters, dumb, utils from .utils import HexBytes +STATSD_PREFIX = "swh.loader.git.loader" + logger = logging.getLogger(__name__) @@ -47,9 +50,11 @@ storage, base_snapshot: Optional[Snapshot] = None, incremental: bool = True, + statsd_incremental_mode: str = None, ): self.storage = storage self.incremental = incremental + self.statsd_incremental_mode = statsd_incremental_mode if base_snapshot and incremental: self.base_snapshot: Snapshot = base_snapshot @@ -95,6 +100,16 @@ logger.debug("remote_heads_count=%s", len(remote_heads)) wanted_refs = list(remote_heads - local_heads) logger.debug("wanted_refs_count=%s", len(wanted_refs)) + statsd.histogram( + f"{STATSD_PREFIX}.ratio_ignored_refs", + len(remote_heads - set(refs.values())) / len(refs), + tags={"incremental": self.statsd_incremental_mode}, + ) + statsd.histogram( + f"{STATSD_PREFIX}.ratio_known_refs", + len(local_heads & remote_heads) / len(remote_heads), + tags={"incremental": self.statsd_incremental_mode}, + ) return wanted_refs @@ -107,10 +122,34 @@ class GitLoader(DVCSLoader): - """A bulk loader for a git repository""" + f"""A bulk loader for a git repository + + Emits the following statsd stats: + + * increments ``{STATSD_PREFIX}.incremental.`` + * histogram ``{STATSD_PREFIX}.ratio_ignored_refs`` is the ratio of refs ignored + over all refs of the remote repository + * histogram ``{STATSD_PREFIX}.ratio_known_refs`` is the ratio of (non-ignored) + remote heads that are already local over all non-ignored remote heads + + The latter two are tagged with ``{{"incremental": ""}}`` where + ``incremental_mode`` is one of: + + * ``from_same_origin`` when the origin was already loaded + * ``from_parent_origin`` when the origin was not already loaded, + but it was detected as a forge-fork of an origin that was already loaded + * ``no_previous_snapshot`` when the origin was not already loaded, + and it was detected as a forge-fork of origins that were not already loaded either + * ``no_parent_origin`` when the origin was no already loaded, and it was not + detected as a forge-fork of any other origin + * ``disabled`` when incremental loading is disabled by configuration + """ visit_type = "git" + statsd_incremental_mode: str + """value of the 'incremental' tag set on statsd histograms for refs stats""" + def __init__( self, storage: StorageInterface, @@ -235,15 +274,25 @@ if self.incremental: prev_snapshot = self.get_full_snapshot(self.origin.url) - - if self.parent_origins is not None: + if prev_snapshot: + self.statsd_incremental_mode = "from_same_origin" + elif self.parent_origins is not None: # If this is the first time we load this origin and it is a forge # fork, load incrementally from one of the origins it was forked from, # closest parent first for parent_origin in self.parent_origins: + prev_snapshot = self.get_full_snapshot(parent_origin.url) if prev_snapshot is not None: + self.statsd_incremental_mode = "from_parent_origin" break - prev_snapshot = self.get_full_snapshot(parent_origin.url) + else: + self.statsd_incremental_mode = "no_previous_snapshot" + else: + self.statsd_incremental_mode = "no_parent_origin" + else: + self.statsd_incremental_mode = "disabled" + + statsd.increment(f"{STATSD_PREFIX}.incremental.{self.statsd_incremental_mode}") if prev_snapshot is not None: self.base_snapshot = prev_snapshot @@ -257,6 +306,7 @@ storage=self.storage, base_snapshot=self.base_snapshot, incremental=self.incremental, + statsd_incremental_mode=self.statsd_incremental_mode, ) def do_progress(msg: bytes) -> None: diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -1,12 +1,14 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime from functools import partial from http.server import HTTPServer, SimpleHTTPRequestHandler import os import subprocess +import sys from tempfile import SpooledTemporaryFile from threading import Thread from unittest.mock import MagicMock, call @@ -18,13 +20,13 @@ from swh.loader.git import dumb from swh.loader.git.loader import GitLoader -from swh.loader.git.tests.test_from_disk import FullGitLoaderTests +from swh.loader.git.tests.test_from_disk import SNAPSHOT1, FullGitLoaderTests from swh.loader.tests import ( assert_last_visit_matches, get_stats, prepare_repository_from_archive, ) -from swh.model.model import Origin +from swh.model.model import Origin, OriginVisit, OriginVisitStatus class CommonGitLoaderNotFound: @@ -151,7 +153,8 @@ ) self.repo = dulwich.repo.Repo(self.destination_path) - def test_load_incremental(self): + def test_no_previous_snapshot(self, mocker): + statsd_report = mocker.patch("swh.core.statsd.statsd._report") res = self.loader.load() assert res == {"status": "eventful"} @@ -180,9 +183,111 @@ ), ] + p = "swh.loader.git" + assert [c for c in statsd_report.mock_calls if c[1][0].startswith(p)] == [ + call(f"{p}.loader.incremental.no_previous_snapshot", "c", 1, None, 1), + call( + f"{p}.loader.ratio_ignored_refs", + "h", + 0.0, + {"incremental": "no_previous_snapshot"}, + 1, + ), + call( + f"{p}.loader.ratio_known_refs", + "h", + 0.0, + {"incremental": "no_previous_snapshot"}, + 1, + ), + ] + + def test_load_incremental(self, mocker): + statsd_report = mocker.patch("swh.core.statsd.statsd._report") + + snapshot_id = b"\x01" * 20 + now = datetime.datetime.now(tz=datetime.timezone.utc) + + def ovgl(origin_url, allowed_statuses, require_snapshot, type): + if origin_url == f"base://{self.repo_url}": + return OriginVisit(origin=origin_url, visit=42, date=now, type="git") + else: + return None + + self.loader.storage.origin_visit_get_latest.side_effect = ovgl + self.loader.storage.origin_visit_status_get_latest.return_value = ( + OriginVisitStatus( + origin=f"base://{self.repo_url}", + visit=42, + snapshot=snapshot_id, + date=now, + status="full", + ) + ) + self.loader.storage.snapshot_get_branches.return_value = { + "id": snapshot_id, + "branches": { + b"refs/heads/master": SNAPSHOT1.branches[b"refs/heads/master"] + }, + "next_branch": None, + } + + res = self.loader.load() + assert res == {"status": "eventful"} + + self.fetcher_cls.assert_called_once_with( + credentials={}, + lister_name="fake-lister", + lister_instance_name="", + origin=Origin(url=self.repo_url), + ) + self.fetcher.get_parent_origins.assert_called_once_with() + + # First tries the same origin + assert self.loader.storage.origin_visit_get_latest.mock_calls == [ + call( + self.repo_url, + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + # As it does not already have a snapshot, fall back to the parent origin + call( + f"base://{self.repo_url}", + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + ] + + p = "swh.loader.git" + assert [c for c in statsd_report.mock_calls if c[1][0].startswith(p)] == [ + call(f"{p}.loader.incremental.from_parent_origin", "c", 1, None, 1), + call( + f"{p}.loader.ratio_ignored_refs", + "h", + 0.0, + {"incremental": "from_parent_origin"}, + 1, + ), + call( + f"{p}.loader.ratio_known_refs", + "h", + 0.25, + {"incremental": "from_parent_origin"}, + 1, + ), + ] + self.fetcher.reset_mock() self.fetcher_cls.reset_mock() - self.loader.storage.reset_mock() + self.loader.storage.reset_mock(return_value=True, side_effect=True) + if sys.version_info < (3, 9, 0): + # Reimplement https://github.com/python/cpython/commit/aef7dc89879d099dc704bd8037b8a7686fb72838 # noqa + # for old Python versions: + for child in self.loader.storage._mock_children.values(): + child.reset_mock(return_value=True, side_effect=True) + statsd_report.reset_mock() # Load again res = self.loader.load() @@ -200,13 +305,32 @@ # Tries the same origin, and finds a snapshot call( self.repo_url, + type=None, allowed_statuses=None, require_snapshot=True, - type=None, ), # -> does not need to fall back to the parent ] + p = "swh.loader.git" + assert [c for c in statsd_report.mock_calls if c.args[0].startswith(p)] == [ + call(f"{p}.loader.incremental.from_same_origin", "c", 1, None, 1), + call( + f"{p}.loader.ratio_ignored_refs", + "h", + 0.0, + {"incremental": "from_same_origin"}, + 1, + ), + call( + f"{p}.loader.ratio_known_refs", + "h", + 1.0, + {"incremental": "from_same_origin"}, + 1, + ), + ] + class DumbGitLoaderTestBase(FullGitLoaderTests): """Prepare a git repository to be loaded using the HTTP dumb transfer protocol."""