diff --git a/.gitignore b/.gitignore index f9ef9a5..94f24a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,14 @@ *.egg-info/ *.pyc *.sw? *~ .coverage .eggs/ .mypy_cache .tox __pycache__ build/ dist/ version.txt .mypy_cache/ +.hypothesis \ No newline at end of file diff --git a/PKG-INFO b/PKG-INFO index 1cf1e81..608d816 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,27 +1,31 @@ Metadata-Version: 2.1 Name: swh.counters -Version: 0.7.0 +Version: 0.8.0 Summary: Software Heritage Next gen counters Home-page: https://forge.softwareheritage.org/source/swh-counters Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-counters Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-counters/ -Description: swh-counters - ============ - - Service providing efficient estimates of the number of objects in the SWH archive, using - Redis's Hyperloglog - Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing +License-File: LICENSE +License-File: AUTHORS + +swh-counters +============ + +Service providing efficient estimates of the number of objects in the SWH archive, using +Redis's Hyperloglog + + diff --git a/requirements-test.txt b/requirements-test.txt index 804503e..01b2af6 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,12 @@ +confluent-kafka +hypothesis pytest pytest-mock -confluent-kafka pytest-redis +pyyaml requests_mock + +types-flask +types-pyyaml +types-redis +types-requests diff --git a/swh.counters.egg-info/PKG-INFO b/swh.counters.egg-info/PKG-INFO index 1cf1e81..608d816 100644 --- a/swh.counters.egg-info/PKG-INFO +++ b/swh.counters.egg-info/PKG-INFO @@ -1,27 +1,31 @@ Metadata-Version: 2.1 Name: swh.counters -Version: 0.7.0 +Version: 0.8.0 Summary: Software Heritage Next gen counters Home-page: https://forge.softwareheritage.org/source/swh-counters Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-counters Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-counters/ -Description: swh-counters - ============ - - Service providing efficient estimates of the number of objects in the SWH archive, using - Redis's Hyperloglog - Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing +License-File: LICENSE +License-File: AUTHORS + +swh-counters +============ + +Service providing efficient estimates of the number of objects in the SWH archive, using +Redis's Hyperloglog + + diff --git a/swh.counters.egg-info/requires.txt b/swh.counters.egg-info/requires.txt index 35a67fc..37aff98 100644 --- a/swh.counters.egg-info/requires.txt +++ b/swh.counters.egg-info/requires.txt @@ -1,11 +1,17 @@ Flask redis swh.core[http]>=0.3 swh.journal [testing] +confluent-kafka +hypothesis pytest pytest-mock -confluent-kafka pytest-redis +pyyaml requests_mock +types-flask +types-pyyaml +types-redis +types-requests diff --git a/swh/__init__.py b/swh/__init__.py index fa32dfd..2933e6c 100644 --- a/swh/__init__.py +++ b/swh/__init__.py @@ -1,9 +1,9 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pkgutil import extend_path -from typing import Iterable +from typing import List -__path__: Iterable[str] = extend_path(__path__, __name__) +__path__: List[str] = extend_path(__path__, __name__) diff --git a/swh/counters/journal_client.py b/swh/counters/journal_client.py index 07e0488..cf24ca4 100644 --- a/swh/counters/journal_client.py +++ b/swh/counters/journal_client.py @@ -1,52 +1,74 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from collections import defaultdict from typing import Dict +from urllib.parse import urlparse import msgpack -from swh.counters.redis import Redis +from swh.counters.interface import CountersInterface def process_journal_messages( - messages: Dict[str, Dict[bytes, bytes]], *, counters: Redis + messages: Dict[str, Dict[bytes, bytes]], *, counters: CountersInterface ) -> None: """Count the number of different values of an object's property. It allow for example to count the persons inside the Release (authors) and Revision (authors and committers) classes """ for key in messages.keys(): counters.add(key, messages[key]) + if "origin" in messages: + process_origins(messages["origin"], counters) + if "revision" in messages: process_revisions(messages["revision"], counters) if "release" in messages: process_releases(messages["release"], counters) -def process_revisions(revisions: Dict[bytes, bytes], counters: Redis): +def process_origins(origins: Dict[bytes, bytes], counters: CountersInterface): + """Count the number of different network locations in origin URLs.""" + origins_netloc = defaultdict(set) + for origin_bytes in origins.values(): + origin = msgpack.loads(origin_bytes) + parsed_url = urlparse(origin["url"]) + netloc = parsed_url.netloc + if netloc.endswith("googlecode.com"): + # special case for googlecode origins where URL netloc + # has the form {account}.googlecode.com + netloc = "googlecode.com" + origins_netloc[f"origin_netloc:{netloc}"].add(origin["url"]) + + for k, v in origins_netloc.items(): + counters.add(k, v) + + +def process_revisions(revisions: Dict[bytes, bytes], counters: CountersInterface): """Count the number of different authors and committers on the revisions (in the person collection)""" persons = set() for revision_bytes in revisions.values(): revision = msgpack.loads(revision_bytes) persons.add(revision["author"]["fullname"]) persons.add(revision["committer"]["fullname"]) counters.add("person", list(persons)) -def process_releases(releases: Dict[bytes, bytes], counters: Redis): +def process_releases(releases: Dict[bytes, bytes], counters: CountersInterface): """Count the number of different authors on the releases (in the person collection)""" persons = set() for release_bytes in releases.values(): release = msgpack.loads(release_bytes) author = release.get("author") if author and "fullname" in author: persons.add(author["fullname"]) counters.add("person", list(persons)) diff --git a/swh/counters/tests/test_journal_client.py b/swh/counters/tests/test_journal_client.py index 910d610..59bb14b 100644 --- a/swh/counters/tests/test_journal_client.py +++ b/swh/counters/tests/test_journal_client.py @@ -1,181 +1,201 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from collections import Counter from typing import Dict, Optional +from urllib.parse import urlparse +from hypothesis import given +from hypothesis.strategies import lists import msgpack from swh.counters.journal_client import ( process_journal_messages, + process_origins, process_releases, process_revisions, ) from swh.counters.redis import Redis from swh.model.hashutil import hash_to_bytes +from swh.model.hypothesis_strategies import origins from swh.model.model import ( ObjectType, + Origin, Person, Release, Revision, RevisionType, Timestamp, TimestampWithTimezone, ) -PROCESSING_METHODS = { - "release": "swh.counters.journal_client.process_releases", - "revision": "swh.counters.journal_client.process_revisions", -} - DATE = TimestampWithTimezone( timestamp=Timestamp(seconds=0, microseconds=0), offset=0, negative_utc=False ) -def _get_processing_method_mocks(mocker): - return { - message_type: mocker.patch(PROCESSING_METHODS[message_type]) - for message_type in PROCESSING_METHODS.keys() - } - - def _create_release(author_fullname: Optional[str]) -> Dict: """Use Release.to_dict to be sure the field's name used to retrieve the author is correct""" author = None if author_fullname: author = Person(fullname=bytes(author_fullname, "utf-8"), name=None, email=None) release = Release( name=b"Release", message=b"Message", target=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), target_type=ObjectType.CONTENT, synthetic=True, author=author, ) return release.to_dict() def _create_revision(author_fullname: str, committer_fullname: str) -> Dict: """Use Revision.to_dict to be sure the names of the fields used to retrieve the author and the committer are correct""" revision = Revision( committer_date=DATE, date=None, type=RevisionType.GIT, parents=(), directory=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), synthetic=True, message=None, author=Person(fullname=bytes(author_fullname, "utf-8"), name=None, email=None), committer=Person( fullname=bytes(committer_fullname, "utf-8"), name=None, email=None ), ) return revision.to_dict() RELEASES = { rel["id"]: msgpack.dumps(rel) for rel in [ _create_release(author_fullname="author 1"), _create_release(author_fullname="author 2"), _create_release(author_fullname=None), ] } RELEASES_AUTHOR_FULLNAMES = {b"author 1", b"author 2"} REVISIONS = { rev["id"]: msgpack.dumps(rev) for rev in [ _create_revision(author_fullname="author 1", committer_fullname="committer 1"), _create_revision(author_fullname="author 2", committer_fullname="committer 2"), _create_revision(author_fullname="author 2", committer_fullname="committer 1"), _create_revision(author_fullname="author 1", committer_fullname="committer 2"), ] } REVISIONS_AUTHOR_FULLNAMES = {b"author 1", b"author 2"} REVISIONS_COMMITTER_FULLNAMES = {b"committer 1", b"committer 2"} REVISIONS_PERSON_FULLNAMES = REVISIONS_AUTHOR_FULLNAMES | REVISIONS_COMMITTER_FULLNAMES -def test__journal_client__all_keys(mocker): - - mock = mocker.patch("swh.counters.redis.Redis.add") - - redis = Redis(host="localhost") +def test_journal_client_all_keys(local_redis_host): + redis = Redis(host=local_redis_host) keys = { "coll1": {b"key1": b"value1", b"key2": b"value2"}, "coll2": {b"key3": b"value3", b"key4": b"value4", b"key5": b"value5"}, } process_journal_messages(messages=keys, counters=redis) - assert mock.call_count == 2 - - first_call_args = mock.call_args_list[0] - assert first_call_args[0][0] == "coll1" - assert first_call_args[0][1] == keys["coll1"] + assert redis.get_counts(redis.get_counters()) == {b"coll1": 2, b"coll2": 3} - second_call_args = mock.call_args_list[1] - assert second_call_args[0][0] == "coll2" - assert second_call_args[0][1] == keys["coll2"] - -def test__journal_client_process_revisions(mocker): - mock = mocker.patch("swh.counters.redis.Redis.add") - - redis = Redis(host="localhost") +def test_journal_client_process_revisions(local_redis_host): + redis = Redis(host=local_redis_host) process_revisions(REVISIONS, redis) - assert mock.call_count == 1 - first_call_args = mock.call_args_list[0] - assert first_call_args[0][0] == "person" - assert sorted(first_call_args[0][1]) == sorted(REVISIONS_PERSON_FULLNAMES) - + assert redis.get_counts(redis.get_counters()) == { + b"person": len(REVISIONS_PERSON_FULLNAMES) + } -def test__journal_client_process_releases(mocker): - mock = mocker.patch("swh.counters.redis.Redis.add") - redis = Redis(host="localhost") +def test_journal_client_process_releases(local_redis_host): + redis = Redis(host=local_redis_host) process_releases(RELEASES, redis) - assert mock.call_count == 1 - first_call_args = mock.call_args_list[0] - assert first_call_args[0][0] == "person" - assert first_call_args[0][1] == list(RELEASES_AUTHOR_FULLNAMES) - + assert redis.get_counts(redis.get_counters()) == { + b"person": len(RELEASES_AUTHOR_FULLNAMES) + } -def test__journal_client_process_releases_without_authors(mocker): - mock = mocker.patch("swh.counters.redis.Redis.add") +def test_journal_client_process_releases_without_authors(local_redis_host): releases = { rel["id"]: msgpack.dumps(rel) for rel in [ _create_release(author_fullname=None), _create_release(author_fullname=None), ] } - redis = Redis(host="localhost") + redis = Redis(host=local_redis_host) process_releases(releases, redis) - assert mock.called == 1 - first_call_args = mock.call_args_list[0] - assert first_call_args[0][0] == "person" - assert first_call_args[0][1] == [] + assert redis.get_counts(redis.get_counters()) == {} + + +def test_journal_client_process_origins(local_redis_host, redisdb): + # hypothesis does not play well with pytest function scoped fixtures + # so we use an inner test function as workaround + @given(lists(origins())) + def inner(origins): + origins_ = { + msgpack.dumps(origin.to_dict()): msgpack.dumps(origin.to_dict()) + for origin in origins + } + + # explicitly flush redis db content due to hypothesis use + redisdb.flushall() + redis = Redis(host=local_redis_host) + + process_origins(origins_, redis) + + expected_counts = Counter( + [ + f"origin_netloc:{urlparse(origin.url).netloc}".encode() + for origin in set(origins) + ] + ) + + assert redis.get_counts(redis.get_counters()) == expected_counts + + inner() + + +def test_journal_client_process_googlecode_origins(local_redis_host): + origins = [ + Origin(url="https://foo.googlecode.com"), + Origin(url="https://bar.googlecode.com"), + ] + origins_ = { + msgpack.dumps(origin.to_dict()): msgpack.dumps(origin.to_dict()) + for origin in origins + } + + redis = Redis(host=local_redis_host) + + process_origins(origins_, redis) + + assert redis.get_counts(redis.get_counters()) == { + b"origin_netloc:googlecode.com": 2 + } diff --git a/tox.ini b/tox.ini index 1f15fb0..978f07f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,36 +1,74 @@ [tox] envlist=black,flake8,mypy,py3 [testenv] extras = testing deps = pytest-cov dev: pdbpp commands = pytest --doctest-modules \ {envsitepackagesdir}/swh/counters \ --cov={envsitepackagesdir}/swh/counters \ --cov-branch {posargs} [testenv:black] skip_install = true deps = black==19.10b0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = flake8 commands = {envpython} -m flake8 [testenv:mypy] extras = testing deps = mypy commands = mypy swh + +# build documentation outside swh-environment using the current +# git HEAD of swh-docs, is executed on CI for each diff to prevent +# breaking doc build +[testenv:sphinx] +whitelist_externals = make +usedevelop = true +extras = + testing +deps = + # fetch and install swh-docs in develop mode + -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs + +setenv = + SWH_PACKAGE_DOC_TOX_BUILD = 1 + # turn warnings into errors + SPHINXOPTS = -W +commands = + make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs + + +# build documentation only inside swh-environment using local state +# of swh-docs package +[testenv:sphinx-dev] +whitelist_externals = make +usedevelop = true +extras = + testing +deps = + # install swh-docs in develop mode + -e ../swh-docs + +setenv = + SWH_PACKAGE_DOC_TOX_BUILD = 1 + # turn warnings into errors + SPHINXOPTS = -W +commands = + make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs