diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 swh.core >= 0.0.7
-swh.loader.core >= 3.4.0
+swh.loader.core >= 3.5.0
 swh.model >= 4.3.0
 swh.scheduler >= 0.0.39
 swh.storage >= 0.22.0
diff --git a/swh/loader/git/base.py b/swh/loader/git/base.py
--- a/swh/loader/git/base.py
+++ b/swh/loader/git/base.py
@@ -3,7 +3,8 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from typing import Iterable
+import collections
+from typing import Dict, Iterable
 
 from swh.loader.core.loader import BaseLoader
 from swh.model.model import (
@@ -72,24 +73,51 @@
         if self.save_data_path:
             self.save_data()
 
+        counts: Dict[str, int] = collections.defaultdict(int)
+        storage_summary: Dict[str, int] = collections.Counter()
+
         if self.has_contents():
             for obj in self.get_contents():
                 if isinstance(obj, Content):
-                    self.storage.content_add([obj])
+                    counts["content"] += 1
+                    storage_summary.update(self.storage.content_add([obj]))
                 elif isinstance(obj, SkippedContent):
-                    self.storage.skipped_content_add([obj])
+                    counts["skipped_content"] += 1
+                    storage_summary.update(self.storage.skipped_content_add([obj]))
                 else:
                     raise TypeError(f"Unexpected content type: {obj}")
+
         if self.has_directories():
             for directory in self.get_directories():
-                self.storage.directory_add([directory])
+                counts["directory"] += 1
+                storage_summary.update(self.storage.directory_add([directory]))
+
         if self.has_revisions():
             for revision in self.get_revisions():
-                self.storage.revision_add([revision])
+                counts["revision"] += 1
+                storage_summary.update(self.storage.revision_add([revision]))
+
         if self.has_releases():
             for release in self.get_releases():
-                self.storage.release_add([release])
+                counts["release"] += 1
+                storage_summary.update(self.storage.release_add([release]))
+
         snapshot = self.get_snapshot()
-        self.storage.snapshot_add([snapshot])
-        self.flush()
+        counts["snapshot"] += 1
+        storage_summary.update(self.storage.snapshot_add([snapshot]))
+
+        storage_summary.update(self.flush())
         self.loaded_snapshot_id = snapshot.id
+
+        for (object_type, total) in counts.items():
+            filtered = total - storage_summary[f"{object_type}:add"]
+            assert 0 <= filtered <= total, (filtered, total)
+
+            if total == 0:
+                # No need to send it
+                continue
+
+            # cannot use self.statsd_average, because this is a weighted average
+            tags = {"object_type": object_type}
+            self.statsd.increment("filtered_objects_percent_sum", filtered, tags=tags)
+            self.statsd.increment("filtered_objects_percent_count", total, tags=tags)
diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py
--- a/swh/loader/git/tests/test_loader.py
+++ b/swh/loader/git/tests/test_loader.py
@@ -18,7 +18,7 @@
 import dulwich.repo
 import pytest
 
-from swh.loader.git import dumb
+from swh.loader.git import converters, dumb
 from swh.loader.git.loader import GitLoader
 from swh.loader.git.tests.test_from_disk import SNAPSHOT1, FullGitLoaderTests
 from swh.loader.tests import (
@@ -120,11 +120,86 @@
 
         # TODO: assert "incremental" is added to constant tags before these
         # metrics are sent
-        assert [c for c in statsd_report.mock_calls if c[1][0].startswith("git_")] == [
+        statsd_calls = statsd_report.mock_calls
+        assert [c for c in statsd_calls if c[1][0].startswith("git_")] == [
+            call("git_total", "c", 1, {}, 1),
+            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
+            call("git_known_refs_percent", "h", 0.0, {}, 1),
+        ]
+        sum_name = "filtered_objects_percent_sum"
+        count_name = "filtered_objects_percent_count"
+        assert [c for c in statsd_calls if c[1][0].startswith("filtered_")] == [
+            call(sum_name, "c", 0, {"object_type": "content"}, 1),
+            call(count_name, "c", 4, {"object_type": "content"}, 1),
+            call(sum_name, "c", 0, {"object_type": "directory"}, 1),
+            call(count_name, "c", 7, {"object_type": "directory"}, 1),
+            call(sum_name, "c", 0, {"object_type": "revision"}, 1),
+            call(count_name, "c", 7, {"object_type": "revision"}, 1),
+            call(sum_name, "c", 0, {"object_type": "snapshot"}, 1),
+            call(count_name, "c", 1, {"object_type": "snapshot"}, 1),
+        ]
+        assert self.loader.statsd.constant_tags == {
+            "visit_type": "git",
+            "incremental_enabled": True,
+            "has_parent_snapshot": False,
+            "has_previous_snapshot": False,
+            "has_parent_origins": False,
+        }
+
+    def test_metrics_filtered(self, mocker):
+        """Tests that presence of some objects in the storage (but not referenced
+        by a snapshot) is reported"""
+
+        known_revs = [
+            converters.dulwich_commit_to_revision(self.repo[sha1])
+            for sha1 in [
+                b"b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a",
+                b"1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b",
+            ]
+        ]
+        known_dirs = [
+            converters.dulwich_tree_to_directory(self.repo[sha1])
+            for sha1 in [
+                b"fbf70528223d263661b5ad4b80f26caf3860eb8e",
+                b"9ca0c7d6ffa3f9f0de59fd7912e08f11308a1338",
+                b"5df34ec74d6f69072d9a0a6677d8efbed9b12e60",
+            ]
+        ]
+        known_cnts = [
+            converters.dulwich_blob_to_content(self.repo[sha1])
+            for sha1 in [
+                b"534d61ecee4f6da4d6ca6ddd8abf258208d2d1bc",
+            ]
+        ]
+        self.loader.storage.revision_add(known_revs)
+        self.loader.storage.directory_add(known_dirs)
+        self.loader.storage.content_add(known_cnts)
+        self.loader.storage.flush()
+
+        statsd_report = mocker.patch.object(self.loader.statsd, "_report")
+        res = self.loader.load()
+        assert res == {"status": "eventful"}
+
+        # TODO: assert "incremental" is added to constant tags before these
+        # metrics are sent
+        statsd_calls = statsd_report.mock_calls
+        assert [c for c in statsd_calls if c[1][0].startswith("git_")] == [
             call("git_total", "c", 1, {}, 1),
             call("git_ignored_refs_percent", "h", 0.0, {}, 1),
             call("git_known_refs_percent", "h", 0.0, {}, 1),
         ]
+        sum_name = "filtered_objects_percent_sum"
+        count_name = "filtered_objects_percent_count"
+        assert [c for c in statsd_calls if c[1][0].startswith("filtered_")] == [
+            call(sum_name, "c", len(known_cnts), {"object_type": "content"}, 1),
+            call(count_name, "c", 4, {"object_type": "content"}, 1),
+            call(sum_name, "c", len(known_dirs), {"object_type": "directory"}, 1),
+            call(count_name, "c", 7, {"object_type": "directory"}, 1),
+            call(sum_name, "c", len(known_revs), {"object_type": "revision"}, 1),
+            call(count_name, "c", 7, {"object_type": "revision"}, 1),
+            call(sum_name, "c", 0, {"object_type": "snapshot"}, 1),
+            call(count_name, "c", 1, {"object_type": "snapshot"}, 1),
+        ]
         assert self.loader.statsd.constant_tags == {
             "visit_type": "git",
             "incremental_enabled": True,