def process_origins(origins: Dict[bytes, bytes], counters: CountersInterface):
    """Count the number of different network locations in origin URLs.

    Each value in ``origins`` is a msgpack-serialized origin dict with at
    least a ``url`` key. Origins are bucketed by the netloc of their URL
    and one counter per netloc (``origin_netloc:<netloc>``) receives the
    set of distinct URLs seen for it.
    """
    urls_by_counter_key: Dict[str, set] = {}
    for serialized_origin in origins.values():
        origin = msgpack.loads(serialized_origin)
        url = origin["url"]
        netloc = urlparse(url).netloc
        if netloc.endswith("googlecode.com"):
            # special case for googlecode origins where URL netloc
            # has the form {account}.googlecode.com
            netloc = "googlecode.com"
        urls_by_counter_key.setdefault(f"origin_netloc:{netloc}", set()).add(url)

    for counter_key, urls in urls_by_counter_key.items():
        counters.add(counter_key, urls)
def test_journal_client_process_origins(local_redis_host, redisdb):
    """Property-based check that process_origins counts one entry per
    distinct origin URL, grouped under its netloc counter key."""
    # hypothesis does not play well with pytest function scoped fixtures
    # so we use an inner test function as workaround
    @given(lists(origins()))
    def inner(origins):
        origins_ = {
            msgpack.dumps(origin.to_dict()): msgpack.dumps(origin.to_dict())
            for origin in origins
        }

        # explicitly flush redis db content due to hypothesis use
        redisdb.flushall()
        redis = Redis(host=local_redis_host)

        process_origins(origins_, redis)

        def expected_key(url: str) -> bytes:
            # Mirror the netloc normalization performed by process_origins
            # ({account}.googlecode.com -> googlecode.com) so the test does
            # not flake if hypothesis generates a googlecode-hosted URL.
            netloc = urlparse(url).netloc
            if netloc.endswith("googlecode.com"):
                netloc = "googlecode.com"
            return f"origin_netloc:{netloc}".encode()

        expected_counts = Counter(
            expected_key(origin.url) for origin in set(origins)
        )

        assert redis.get_counts(redis.get_counters()) == expected_counts

    inner()


def test_journal_client_process_googlecode_origins(local_redis_host):
    """Origins hosted under {account}.googlecode.com must all be counted
    in the single origin_netloc:googlecode.com bucket."""
    origins = [
        Origin(url="https://foo.googlecode.com"),
        Origin(url="https://bar.googlecode.com"),
    ]
    origins_ = {
        msgpack.dumps(origin.to_dict()): msgpack.dumps(origin.to_dict())
        for origin in origins
    }

    redis = Redis(host=local_redis_host)

    process_origins(origins_, redis)

    assert redis.get_counts(redis.get_counters()) == {
        b"origin_netloc:googlecode.com": 2
    }