diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -18,6 +18,7 @@ from dulwich.objects import ShaFile from dulwich.pack import PackData, PackInflater +from swh.core.statsd import Statsd from swh.loader.core.loader import DVCSLoader from swh.loader.exception import NotFound from swh.model import hashutil @@ -36,6 +37,8 @@ from . import converters, dumb, utils from .utils import HexBytes +STATSD_PREFIX = "swh_loader_git" + logger = logging.getLogger(__name__) @@ -47,9 +50,11 @@ storage, base_snapshot: Optional[Snapshot] = None, incremental: bool = True, + statsd: Statsd = None, ): self.storage = storage self.incremental = incremental + self.statsd = statsd if base_snapshot and incremental: self.base_snapshot: Snapshot = base_snapshot @@ -95,6 +100,17 @@ logger.debug("remote_heads_count=%s", len(remote_heads)) wanted_refs = list(remote_heads - local_heads) logger.debug("wanted_refs_count=%s", len(wanted_refs)) + if self.statsd is not None: + self.statsd.histogram( + "git_ignored_refs_percent", + len(remote_heads - set(refs.values())) / len(refs), + tags={}, + ) + self.statsd.histogram( + "git_known_refs_percent", + len(local_heads & remote_heads) / len(remote_heads), + tags={}, + ) return wanted_refs @@ -107,7 +123,28 @@ class GitLoader(DVCSLoader): - """A bulk loader for a git repository""" + """A bulk loader for a git repository + + Emits the following statsd stats: + + * increments ``swh_loader_git`` + * histogram ``swh_loader_git_ignored_refs_percent`` is the ratio of refs ignored + over all refs of the remote repository + * histogram ``swh_loader_git_known_refs_percent`` is the ratio of (non-ignored) + remote heads that are already local over all non-ignored remote heads + + All three are tagged with ``{{"incremental": ""}}`` where + ``incremental_mode`` is one of: + + * ``from_same_origin`` when the origin was already loaded + * ``from_parent_origin`` when the origin was not already loaded, + but it was detected as a forge-fork of an origin that was already loaded + * ``no_previous_snapshot`` when the origin was not already loaded, + and it was detected as a forge-fork of origins that were not already loaded either + * ``no_parent_origin`` when the origin was no already loaded, and it was not + detected as a forge-fork of any other origin + * ``disabled`` when incremental loading is disabled by configuration + """ visit_type = "git" @@ -235,15 +272,29 @@ if self.incremental: prev_snapshot = self.get_full_snapshot(self.origin.url) - - if self.parent_origins is not None: + if prev_snapshot: + statsd_incremental_mode = "from_same_origin" + elif self.parent_origins is not None: # If this is the first time we load this origin and it is a forge # fork, load incrementally from one of the origins it was forked from, # closest parent first for parent_origin in self.parent_origins: + prev_snapshot = self.get_full_snapshot(parent_origin.url) if prev_snapshot is not None: + statsd_incremental_mode = "from_parent_origin" break - prev_snapshot = self.get_full_snapshot(parent_origin.url) + else: + statsd_incremental_mode = "no_previous_snapshot" + else: + statsd_incremental_mode = "no_parent_origin" + else: + statsd_incremental_mode = "disabled" + + self.statsd.constant_tags["incremental"] = statsd_incremental_mode + + # Increments a metric with full name 'swh_loader_git'; which is useful to + # count how many runs of the loader are with each incremental mode + self.statsd.increment("git", tags={}) if prev_snapshot is not None: self.base_snapshot = prev_snapshot @@ -257,6 +308,7 @@ storage=self.storage, base_snapshot=self.base_snapshot, incremental=self.incremental, + statsd=self.statsd, ) def do_progress(msg: bytes) -> None: diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -1,12 +1,14 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime from functools import partial from http.server import HTTPServer, SimpleHTTPRequestHandler import os import subprocess +import sys from tempfile import SpooledTemporaryFile from threading import Thread from unittest.mock import MagicMock, call @@ -18,13 +20,13 @@ from swh.loader.git import dumb from swh.loader.git.loader import GitLoader -from swh.loader.git.tests.test_from_disk import FullGitLoaderTests +from swh.loader.git.tests.test_from_disk import SNAPSHOT1, FullGitLoaderTests from swh.loader.tests import ( assert_last_visit_matches, get_stats, prepare_repository_from_archive, ) -from swh.model.model import Origin +from swh.model.model import Origin, OriginVisit, OriginVisitStatus class CommonGitLoaderNotFound: @@ -151,7 +153,8 @@ ) self.repo = dulwich.repo.Repo(self.destination_path) - def test_load_incremental(self): + def test_no_previous_snapshot(self, mocker): + statsd_report = mocker.patch.object(self.loader.statsd, "_report") res = self.loader.load() assert res == {"status": "eventful"} @@ -180,9 +183,102 @@ ), ] + # TODO: assert "incremental" is added to constant tags before these + # metrics are sent + assert [c for c in statsd_report.mock_calls if c[1][0].startswith("git")] == [ + call("git", "c", 1, {}, 1), + call("git_ignored_refs_percent", "h", 0.0, {}, 1), + call("git_known_refs_percent", "h", 0.0, {}, 1), + ] + assert self.loader.statsd.constant_tags == { + "visit_type": "git", + "incremental": "no_previous_snapshot", + } + + def test_load_incremental(self, mocker): + statsd_report = mocker.patch.object(self.loader.statsd, "_report") + + snapshot_id = b"\x01" * 20 + now = datetime.datetime.now(tz=datetime.timezone.utc) + + def ovgl(origin_url, allowed_statuses, require_snapshot, type): + if origin_url == f"base://{self.repo_url}": + return OriginVisit(origin=origin_url, visit=42, date=now, type="git") + else: + return None + + self.loader.storage.origin_visit_get_latest.side_effect = ovgl + self.loader.storage.origin_visit_status_get_latest.return_value = ( + OriginVisitStatus( + origin=f"base://{self.repo_url}", + visit=42, + snapshot=snapshot_id, + date=now, + status="full", + ) + ) + self.loader.storage.snapshot_get_branches.return_value = { + "id": snapshot_id, + "branches": { + b"refs/heads/master": SNAPSHOT1.branches[b"refs/heads/master"] + }, + "next_branch": None, + } + + res = self.loader.load() + assert res == {"status": "eventful"} + + self.fetcher_cls.assert_called_once_with( + credentials={}, + lister_name="fake-lister", + lister_instance_name="", + origin=Origin(url=self.repo_url), + ) + self.fetcher.get_parent_origins.assert_called_once_with() + + # First tries the same origin + assert self.loader.storage.origin_visit_get_latest.mock_calls == [ + call( + self.repo_url, + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + # As it does not already have a snapshot, fall back to the parent origin + call( + f"base://{self.repo_url}", + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + ] + + # TODO: assert "incremental" is added to constant tags before these + # metrics are sent + assert [c for c in statsd_report.mock_calls if c[1][0].startswith("git")] == [ + call("git", "c", 1, {}, 1), + call("git_ignored_refs_percent", "h", 0.0, {}, 1), + call("git_known_refs_percent", "h", 0.25, {}, 1), + ] + assert self.loader.statsd.constant_tags == { + "visit_type": "git", + "incremental": "from_parent_origin", + } + self.fetcher.reset_mock() self.fetcher_cls.reset_mock() - self.loader.storage.reset_mock() + if sys.version_info >= (3, 9, 0): + self.loader.storage.reset_mock(return_value=True, side_effect=True) + else: + # Reimplement https://github.com/python/cpython/commit/aef7dc89879d099dc704bd8037b8a7686fb72838 # noqa + # for old Python versions: + def reset_mock(m): + m.reset_mock(return_value=True, side_effect=True) + for child in m._mock_children.values(): + reset_mock(child) + + reset_mock(self.loader.storage) + statsd_report.reset_mock() # Load again res = self.loader.load() @@ -200,13 +296,25 @@ # Tries the same origin, and finds a snapshot call( self.repo_url, + type=None, allowed_statuses=None, require_snapshot=True, - type=None, ), # -> does not need to fall back to the parent ] + # TODO: assert "incremental" is added to constant tags before these + # metrics are sent + assert [c for c in statsd_report.mock_calls if c[1][0].startswith("git")] == [ + call("git", "c", 1, {}, 1), + call("git_ignored_refs_percent", "h", 0.0, {}, 1), + call("git_known_refs_percent", "h", 1.0, {}, 1), + ] + assert self.loader.statsd.constant_tags == { + "visit_type": "git", + "incremental": "from_same_origin", + } + class DumbGitLoaderTestBase(FullGitLoaderTests): """Prepare a git repository to be loaded using the HTTP dumb transfer protocol."""