diff --git a/.gitignore b/.gitignore index 94f24a8..f9ef9a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,13 @@ *.egg-info/ *.pyc *.sw? *~ .coverage .eggs/ .mypy_cache .tox __pycache__ build/ dist/ version.txt .mypy_cache/ -.hypothesis \ No newline at end of file diff --git a/PKG-INFO b/PKG-INFO index 608d816..077fb2a 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,31 +1,31 @@ Metadata-Version: 2.1 Name: swh.counters -Version: 0.8.0 +Version: 0.9.0 Summary: Software Heritage Next gen counters Home-page: https://forge.softwareheritage.org/source/swh-counters Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-counters Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-counters/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-counters ============ Service providing efficient estimates of the number of objects in the SWH archive, using Redis's Hyperloglog diff --git a/debian/changelog b/debian/changelog index 9ffe657..f868385 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,126 +1,129 @@ -swh.counters (0.8.0-1~swh1~bpo10+1) buster-swh; urgency=medium +swh.counters (0.9.0-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 0.9.0 - (tagged by Vincent SELLIER + on 2021-10-15 10:58:44 +0200) + * Upstream changes: - v0.9.0 - * T3659 Remove unused origin + location count - -- Software Heritage autobuilder (on jenkins-debian1) Tue, 22 Jun 2021 15:43:15 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Oct 2021 09:02:28 +0000 swh.counters (0.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.0 - (tagged by Antoine Lambert on 2021-06-22 17:34:45 +0200) * Upstream changes: - version 0.8.0 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 22 Jun 2021 15:41:57 +0000 swh.counters (0.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.0 - (tagged by Vincent SELLIER on 2021-04-23 10:21:10 +0200) * Upstream changes: - v0.7.0 - * Remove 'journal_type' argument from the CLI -- Software Heritage autobuilder (on jenkins-debian1) Fri, 23 Apr 2021 08:24:44 +0000 swh.counters (0.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.0 - (tagged by Vincent SELLIER on 2021-04-22 16:15:13 +0200) * Upstream changes: - v0.6.0 - extract the person count from releases and revisions -- Software Heritage autobuilder (on jenkins-debian1) Thu, 22 Apr 2021 14:17:56 +0000 swh.counters (0.5.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.1 - (tagged by Vincent SELLIER on 2021-04-14 11:24:46 +0200) * Upstream changes: - v0.5.1 - Add an in_memory implmentation for tests - fix documentation -- Software Heritage autobuilder (on jenkins-debian1) Wed, 14 Apr 2021 09:26:49 +0000 swh.counters (0.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.0 - (tagged by Vincent SELLIER on 2021-04-13 19:09:26 +0200) * Upstream changes: - v0.5.0 - Add bulk api to retrieve the counter values -- Software Heritage autobuilder (on jenkins-debian1) Tue, 13 Apr 2021 17:11:46 +0000 swh.counters (0.4.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.2 - (tagged by Vincent SELLIER on 2021-04-08 23:53:45 +0200) * Upstream changes: - v0.4.2 - * Let flask manage json response by itself - * Fix history endpoints path -- Software Heritage autobuilder (on jenkins-debian1) Thu, 08 Apr 2021 21:56:07 +0000 swh.counters (0.4.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.1 - (tagged by Vincent SELLIER on 2021-04-08 12:20:16 +0200) * Upstream changes: - v0.4.1 - * fix debian stable build -- Software Heritage autobuilder (on jenkins-debian1) Thu, 08 Apr 2021 10:22:40 +0000 swh.counters (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by Vincent SELLIER on 2021-04-07 17:09:57 +0200) * Upstream changes: - v0.4.0 - Add a GET endpoint to retrieve the historical data - Use a intermediate temporary file during the generation - of the historical data file -- Software Heritage autobuilder (on jenkins-debian1) Wed, 07 Apr 2021 15:14:08 +0000 swh.counters (0.3.0-1~swh2) unstable-swh; urgency=medium * Fix build dependency -- Vincent SELLIER Wed, 07 Apr 2021 12:37:36 +0200 swh.counters (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Vincent SELLIER on 2021-04-07 12:13:40 +0200) * Upstream changes: - v0.3.0 - improve the documentation - add management of the counters's historical data -- Software Heritage autobuilder (on jenkins-debian1) Wed, 07 Apr 2021 10:17:30 +0000 swh.counters (0.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.1 - (tagged by Valentin Lorentz on 2021-04-06 17:24:25 +0200) * Upstream changes: - v0.2.1 - Documentation improvements -- Software Heritage autobuilder (on jenkins-debian1) Tue, 06 Apr 2021 15:26:14 +0000 swh.counters (0.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.0 - (tagged by Vincent SELLIER on 2021-03-25 10:15:51 +0100) * Upstream changes: - v0.2.0 - Expose counters as prometheus metrics -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Mar 2021 09:18:18 +0000 swh.counters (0.1.0-1+swh3) unstable-swh; urgency=medium * Rebuild to remove the swh/__init__.py file from the package -- Vincent SELLIER Fri, 19 Mar 2021 17:25:15 +0100 swh.counters (0.1.0-1+swh2) unstable-swh; urgency=medium * Rebuild to unstuck the debian packaging. -- Vincent Sellier Wed, 17 Mar 2021 17:20:49 +0100 swh.counters (0.1.0-1+swh1) unstable-swh; urgency=medium * Initial release. -- Vincent Sellier Wed, 17 Mar 2021 17:20:49 +0100 diff --git a/requirements-test.txt b/requirements-test.txt index 01b2af6..b0b21a1 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,12 +1,11 @@ -confluent-kafka -hypothesis pytest pytest-mock +confluent-kafka pytest-redis pyyaml requests_mock types-flask types-pyyaml types-redis types-requests diff --git a/swh.counters.egg-info/PKG-INFO b/swh.counters.egg-info/PKG-INFO index 608d816..077fb2a 100644 --- a/swh.counters.egg-info/PKG-INFO +++ b/swh.counters.egg-info/PKG-INFO @@ -1,31 +1,31 @@ Metadata-Version: 2.1 Name: swh.counters -Version: 0.8.0 +Version: 0.9.0 Summary: Software Heritage Next gen counters Home-page: https://forge.softwareheritage.org/source/swh-counters Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-counters Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-counters/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-counters ============ Service providing efficient estimates of the number of objects in the SWH archive, using Redis's Hyperloglog diff --git a/swh.counters.egg-info/requires.txt b/swh.counters.egg-info/requires.txt index 37aff98..9616d60 100644 --- a/swh.counters.egg-info/requires.txt +++ b/swh.counters.egg-info/requires.txt @@ -1,17 +1,16 @@ Flask redis swh.core[http]>=0.3 swh.journal [testing] -confluent-kafka -hypothesis pytest pytest-mock +confluent-kafka pytest-redis pyyaml requests_mock types-flask types-pyyaml types-redis types-requests diff --git a/swh/counters/journal_client.py b/swh/counters/journal_client.py index cf24ca4..66942b4 100644 --- a/swh/counters/journal_client.py +++ b/swh/counters/journal_client.py @@ -1,74 +1,52 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from collections import defaultdict from typing import Dict -from urllib.parse import urlparse import msgpack from swh.counters.interface import CountersInterface def process_journal_messages( messages: Dict[str, Dict[bytes, bytes]], *, counters: CountersInterface ) -> None: """Count the number of different values of an object's property. It allow for example to count the persons inside the Release (authors) and Revision (authors and committers) classes """ for key in messages.keys(): counters.add(key, messages[key]) - if "origin" in messages: - process_origins(messages["origin"], counters) - if "revision" in messages: process_revisions(messages["revision"], counters) if "release" in messages: process_releases(messages["release"], counters) -def process_origins(origins: Dict[bytes, bytes], counters: CountersInterface): - """Count the number of different network locations in origin URLs.""" - origins_netloc = defaultdict(set) - for origin_bytes in origins.values(): - origin = msgpack.loads(origin_bytes) - parsed_url = urlparse(origin["url"]) - netloc = parsed_url.netloc - if netloc.endswith("googlecode.com"): - # special case for googlecode origins where URL netloc - # has the form {account}.googlecode.com - netloc = "googlecode.com" - origins_netloc[f"origin_netloc:{netloc}"].add(origin["url"]) - - for k, v in origins_netloc.items(): - counters.add(k, v) - - def process_revisions(revisions: Dict[bytes, bytes], counters: CountersInterface): """Count the number of different authors and committers on the revisions (in the person collection)""" persons = set() for revision_bytes in revisions.values(): revision = msgpack.loads(revision_bytes) persons.add(revision["author"]["fullname"]) persons.add(revision["committer"]["fullname"]) counters.add("person", list(persons)) def process_releases(releases: Dict[bytes, bytes], counters: CountersInterface): """Count the number of different authors on the releases (in the person collection)""" persons = set() for release_bytes in releases.values(): release = msgpack.loads(release_bytes) author = release.get("author") if author and "fullname" in author: persons.add(author["fullname"]) counters.add("person", list(persons)) diff --git a/swh/counters/tests/test_journal_client.py b/swh/counters/tests/test_journal_client.py index 59bb14b..271a2cc 100644 --- a/swh/counters/tests/test_journal_client.py +++ b/swh/counters/tests/test_journal_client.py @@ -1,201 +1,147 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from collections import Counter from typing import Dict, Optional -from urllib.parse import urlparse -from hypothesis import given -from hypothesis.strategies import lists import msgpack from swh.counters.journal_client import ( process_journal_messages, - process_origins, process_releases, process_revisions, ) from swh.counters.redis import Redis from swh.model.hashutil import hash_to_bytes -from swh.model.hypothesis_strategies import origins from swh.model.model import ( ObjectType, - Origin, Person, Release, Revision, RevisionType, Timestamp, TimestampWithTimezone, ) DATE = TimestampWithTimezone( timestamp=Timestamp(seconds=0, microseconds=0), offset=0, negative_utc=False ) def _create_release(author_fullname: Optional[str]) -> Dict: """Use Release.to_dict to be sure the field's name used to retrieve the author is correct""" author = None if author_fullname: author = Person(fullname=bytes(author_fullname, "utf-8"), name=None, email=None) release = Release( name=b"Release", message=b"Message", target=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), target_type=ObjectType.CONTENT, synthetic=True, author=author, ) return release.to_dict() def _create_revision(author_fullname: str, committer_fullname: str) -> Dict: """Use Revision.to_dict to be sure the names of the fields used to retrieve the author and the committer are correct""" revision = Revision( committer_date=DATE, date=None, type=RevisionType.GIT, parents=(), directory=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), synthetic=True, message=None, author=Person(fullname=bytes(author_fullname, "utf-8"), name=None, email=None), committer=Person( fullname=bytes(committer_fullname, "utf-8"), name=None, email=None ), ) return revision.to_dict() RELEASES = { rel["id"]: msgpack.dumps(rel) for rel in [ _create_release(author_fullname="author 1"), _create_release(author_fullname="author 2"), _create_release(author_fullname=None), ] } RELEASES_AUTHOR_FULLNAMES = {b"author 1", b"author 2"} REVISIONS = { rev["id"]: msgpack.dumps(rev) for rev in [ _create_revision(author_fullname="author 1", committer_fullname="committer 1"), _create_revision(author_fullname="author 2", committer_fullname="committer 2"), _create_revision(author_fullname="author 2", committer_fullname="committer 1"), _create_revision(author_fullname="author 1", committer_fullname="committer 2"), ] } REVISIONS_AUTHOR_FULLNAMES = {b"author 1", b"author 2"} REVISIONS_COMMITTER_FULLNAMES = {b"committer 1", b"committer 2"} REVISIONS_PERSON_FULLNAMES = REVISIONS_AUTHOR_FULLNAMES | REVISIONS_COMMITTER_FULLNAMES def test_journal_client_all_keys(local_redis_host): redis = Redis(host=local_redis_host) keys = { "coll1": {b"key1": b"value1", b"key2": b"value2"}, "coll2": {b"key3": b"value3", b"key4": b"value4", b"key5": b"value5"}, } process_journal_messages(messages=keys, counters=redis) assert redis.get_counts(redis.get_counters()) == {b"coll1": 2, b"coll2": 3} def test_journal_client_process_revisions(local_redis_host): redis = Redis(host=local_redis_host) process_revisions(REVISIONS, redis) assert redis.get_counts(redis.get_counters()) == { b"person": len(REVISIONS_PERSON_FULLNAMES) } def test_journal_client_process_releases(local_redis_host): redis = Redis(host=local_redis_host) process_releases(RELEASES, redis) assert redis.get_counts(redis.get_counters()) == { b"person": len(RELEASES_AUTHOR_FULLNAMES) } def test_journal_client_process_releases_without_authors(local_redis_host): releases = { rel["id"]: msgpack.dumps(rel) for rel in [ _create_release(author_fullname=None), _create_release(author_fullname=None), ] } redis = Redis(host=local_redis_host) process_releases(releases, redis) assert redis.get_counts(redis.get_counters()) == {} - - -def test_journal_client_process_origins(local_redis_host, redisdb): - # hypothesis does not play well with pytest function scoped fixtures - # so we use an inner test function as workaround - @given(lists(origins())) - def inner(origins): - origins_ = { - msgpack.dumps(origin.to_dict()): msgpack.dumps(origin.to_dict()) - for origin in origins - } - - # explicitly flush redis db content due to hypothesis use - redisdb.flushall() - redis = Redis(host=local_redis_host) - - process_origins(origins_, redis) - - expected_counts = Counter( - [ - f"origin_netloc:{urlparse(origin.url).netloc}".encode() - for origin in set(origins) - ] - ) - - assert redis.get_counts(redis.get_counters()) == expected_counts - - inner() - - -def test_journal_client_process_googlecode_origins(local_redis_host): - origins = [ - Origin(url="https://foo.googlecode.com"), - Origin(url="https://bar.googlecode.com"), - ] - origins_ = { - msgpack.dumps(origin.to_dict()): msgpack.dumps(origin.to_dict()) - for origin in origins - } - - redis = Redis(host=local_redis_host) - - process_origins(origins_, redis) - - assert redis.get_counts(redis.get_counters()) == { - b"origin_netloc:googlecode.com": 2 - }