diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 -swh.loader.core >= 3.2.0 +swh.loader.core >= 3.4.0 swh.model >= 4.3.0 swh.scheduler >= 0.0.39 swh.storage >= 0.22.0 diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -18,6 +18,7 @@ from dulwich.objects import ShaFile from dulwich.pack import PackData, PackInflater +from swh.core.statsd import Statsd from swh.loader.core.loader import DVCSLoader from swh.loader.exception import NotFound from swh.model import hashutil @@ -47,9 +48,11 @@ storage, base_snapshot: Optional[Snapshot] = None, incremental: bool = True, + statsd: Statsd = None, ): self.storage = storage self.incremental = incremental + self.statsd = statsd if base_snapshot and incremental: self.base_snapshot: Snapshot = base_snapshot @@ -95,6 +98,17 @@ logger.debug("remote_heads_count=%s", len(remote_heads)) wanted_refs = list(remote_heads - local_heads) logger.debug("wanted_refs_count=%s", len(wanted_refs)) + if self.statsd is not None: + self.statsd.histogram( + "git_ignored_refs_percent", + len(remote_heads - set(refs.values())) / len(refs), + tags={}, + ) + self.statsd.histogram( + "git_known_refs_percent", + len(local_heads & remote_heads) / len(remote_heads), + tags={}, + ) return wanted_refs @@ -107,7 +121,28 @@ class GitLoader(DVCSLoader): - """A bulk loader for a git repository""" + """A bulk loader for a git repository + + Emits the following statsd stats: + + * increments ``swh_loader_git`` + * histogram ``swh_loader_git_ignored_refs_percent`` is the ratio of refs ignored + over all refs of the remote repository + * histogram ``swh_loader_git_known_refs_percent`` is the ratio of (non-ignored) + remote heads that are already local over all non-ignored remote heads + + All three are tagged with ``{{"incremental": ""}}`` where + ``incremental_mode`` is one of: + + * ``from_same_origin`` when the origin was already loaded + * ``from_parent_origin`` when the origin was not already loaded, + but it was detected as a forge-fork of an origin that was already loaded + * ``no_previous_snapshot`` when the origin was not already loaded, + and it was detected as a forge-fork of origins that were not already loaded either + * ``no_parent_origin`` when the origin was no already loaded, and it was not + detected as a forge-fork of any other origin + * ``disabled`` when incremental loading is disabled by configuration + """ visit_type = "git" @@ -233,17 +268,34 @@ prev_snapshot: Optional[Snapshot] = None + self.statsd.constant_tags["incremental_enabled"] = self.incremental + self.statsd.constant_tags["has_parent_origins"] = bool(self.parent_origins) if self.incremental: prev_snapshot = self.get_full_snapshot(self.origin.url) + if prev_snapshot: + incremental_snapshot_origin = "self" - if self.parent_origins is not None: + elif self.parent_origins is not None: # If this is the first time we load this origin and it is a forge # fork, load incrementally from one of the origins it was forked from, # closest parent first for parent_origin in self.parent_origins: + prev_snapshot = self.get_full_snapshot(parent_origin.url) if prev_snapshot is not None: + incremental_snapshot_origin = "parent" break - prev_snapshot = self.get_full_snapshot(parent_origin.url) + else: + incremental_snapshot_origin = "none" + else: + incremental_snapshot_origin = "none" + + self.statsd.constant_tags[ + "incremental_snapshot_origin" + ] = incremental_snapshot_origin + + # Increments a metric with full name 'swh_loader_git'; which is useful to + # count how many runs of the loader are with each incremental mode + self.statsd.increment("git", tags={}) if prev_snapshot is not None: self.base_snapshot = prev_snapshot @@ -257,6 +309,7 @@ storage=self.storage, base_snapshot=self.base_snapshot, incremental=self.incremental, + statsd=self.statsd, ) def do_progress(msg: bytes) -> None: diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -1,12 +1,14 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime from functools import partial from http.server import HTTPServer, SimpleHTTPRequestHandler import os import subprocess +import sys from tempfile import SpooledTemporaryFile from threading import Thread from unittest.mock import MagicMock, call @@ -18,13 +20,13 @@ from swh.loader.git import dumb from swh.loader.git.loader import GitLoader -from swh.loader.git.tests.test_from_disk import FullGitLoaderTests +from swh.loader.git.tests.test_from_disk import SNAPSHOT1, FullGitLoaderTests from swh.loader.tests import ( assert_last_visit_matches, get_stats, prepare_repository_from_archive, ) -from swh.model.model import Origin +from swh.model.model import Origin, OriginVisit, OriginVisitStatus class CommonGitLoaderNotFound: @@ -111,6 +113,25 @@ self.loader = GitLoader(swh_storage, self.repo_url) self.repo = dulwich.repo.Repo(self.destination_path) + def test_metrics(self, mocker): + statsd_report = mocker.patch.object(self.loader.statsd, "_report") + res = self.loader.load() + assert res == {"status": "eventful"} + + # TODO: assert "incremental" is added to constant tags before these + # metrics are sent + assert [c for c in statsd_report.mock_calls if c[1][0].startswith("git")] == [ + call("git", "c", 1, {}, 1), + call("git_ignored_refs_percent", "h", 0.0, {}, 1), + call("git_known_refs_percent", "h", 0.0, {}, 1), + ] + assert self.loader.statsd.constant_tags == { + "visit_type": "git", + "incremental_enabled": True, + "incremental_snapshot_origin": "none", + "has_parent_origins": False, + } + class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound): """Mostly the same loading scenario but with a ``parent_origin`` different from the @@ -151,7 +172,8 @@ ) self.repo = dulwich.repo.Repo(self.destination_path) - def test_load_incremental(self): + def test_no_previous_snapshot(self, mocker): + statsd_report = mocker.patch.object(self.loader.statsd, "_report") res = self.loader.load() assert res == {"status": "eventful"} @@ -180,9 +202,106 @@ ), ] + # TODO: assert "incremental" is added to constant tags before these + # metrics are sent + assert [c for c in statsd_report.mock_calls if c[1][0].startswith("git")] == [ + call("git", "c", 1, {}, 1), + call("git_ignored_refs_percent", "h", 0.0, {}, 1), + call("git_known_refs_percent", "h", 0.0, {}, 1), + ] + assert self.loader.statsd.constant_tags == { + "visit_type": "git", + "incremental_enabled": True, + "incremental_snapshot_origin": "none", + "has_parent_origins": True, + } + + def test_load_incremental(self, mocker): + statsd_report = mocker.patch.object(self.loader.statsd, "_report") + + snapshot_id = b"\x01" * 20 + now = datetime.datetime.now(tz=datetime.timezone.utc) + + def ovgl(origin_url, allowed_statuses, require_snapshot, type): + if origin_url == f"base://{self.repo_url}": + return OriginVisit(origin=origin_url, visit=42, date=now, type="git") + else: + return None + + self.loader.storage.origin_visit_get_latest.side_effect = ovgl + self.loader.storage.origin_visit_status_get_latest.return_value = ( + OriginVisitStatus( + origin=f"base://{self.repo_url}", + visit=42, + snapshot=snapshot_id, + date=now, + status="full", + ) + ) + self.loader.storage.snapshot_get_branches.return_value = { + "id": snapshot_id, + "branches": { + b"refs/heads/master": SNAPSHOT1.branches[b"refs/heads/master"] + }, + "next_branch": None, + } + + res = self.loader.load() + assert res == {"status": "eventful"} + + self.fetcher_cls.assert_called_once_with( + credentials={}, + lister_name="fake-lister", + lister_instance_name="", + origin=Origin(url=self.repo_url), + ) + self.fetcher.get_parent_origins.assert_called_once_with() + + # First tries the same origin + assert self.loader.storage.origin_visit_get_latest.mock_calls == [ + call( + self.repo_url, + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + # As it does not already have a snapshot, fall back to the parent origin + call( + f"base://{self.repo_url}", + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + ] + + # TODO: assert "incremental*" is added to constant tags before these + # metrics are sent + assert [c for c in statsd_report.mock_calls if c[1][0].startswith("git")] == [ + call("git", "c", 1, {}, 1), + call("git_ignored_refs_percent", "h", 0.0, {}, 1), + call("git_known_refs_percent", "h", 0.25, {}, 1), + ] + assert self.loader.statsd.constant_tags == { + "visit_type": "git", + "incremental_enabled": True, + "incremental_snapshot_origin": "parent", + "has_parent_origins": True, + } + self.fetcher.reset_mock() self.fetcher_cls.reset_mock() - self.loader.storage.reset_mock() + if sys.version_info >= (3, 9, 0): + self.loader.storage.reset_mock(return_value=True, side_effect=True) + else: + # Reimplement https://github.com/python/cpython/commit/aef7dc89879d099dc704bd8037b8a7686fb72838 # noqa + # for old Python versions: + def reset_mock(m): + m.reset_mock(return_value=True, side_effect=True) + for child in m._mock_children.values(): + reset_mock(child) + + reset_mock(self.loader.storage) + statsd_report.reset_mock() # Load again res = self.loader.load() @@ -200,13 +319,27 @@ # Tries the same origin, and finds a snapshot call( self.repo_url, + type=None, allowed_statuses=None, require_snapshot=True, - type=None, ), # -> does not need to fall back to the parent ] + # TODO: assert "incremental*" is added to constant tags before these + # metrics are sent + assert [c for c in statsd_report.mock_calls if c[1][0].startswith("git")] == [ + call("git", "c", 1, {}, 1), + call("git_ignored_refs_percent", "h", 0.0, {}, 1), + call("git_known_refs_percent", "h", 1.0, {}, 1), + ] + assert self.loader.statsd.constant_tags == { + "visit_type": "git", + "incremental_enabled": True, + "incremental_snapshot_origin": "self", + "has_parent_origins": True, + } + class DumbGitLoaderTestBase(FullGitLoaderTests): """Prepare a git repository to be loaded using the HTTP dumb transfer protocol."""