D6137.id22219.diff
View Options

	diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
	--- a/swh/search/elasticsearch.py
	+++ b/swh/search/elasticsearch.py
	@@ -4,6 +4,7 @@
	# See top-level LICENSE file for more information

	import base64
	+from collections import Counter
	import logging
	import pprint
	from textwrap import dedent
	@@ -527,3 +528,27 @@
	results=[{"url": hit["_source"]["url"]} for hit in hits],
	next_page_token=next_page_token,
	)
	+
	+ def visit_types_count(self) -> Counter:  # origin counts keyed by visit type
	+ body = {
	+ "aggs": {
	+ "not_blocklisted": {  # filter bucket: drops blocklisted origins
	+ "filter": {"bool": {"must_not": [{"term": {"blocklisted": True}}]}},
	+ "aggs": {  # one bucket per "visit_types" value (terms "size": 1000)
	+ "visit_types": {"terms": {"field": "visit_types", "size": 1000}}
	+ },
	+ }
	+ }
	+ }
	+
	+ res = self._backend.search(  # size=0: presumably aggregations only, no hits
	+ index=self._get_origin_read_alias(), body=body, size=0
	+ )
	+
	+ buckets = (  # each missing level falls back to an empty container
	+ res.get("aggregations", {})
	+ .get("not_blocklisted", {})
	+ .get("visit_types", {})
	+ .get("buckets", [])
	+ )
	+ return Counter({bucket["key"]: bucket["doc_count"] for bucket in buckets})
	diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
	--- a/swh/search/in_memory.py
	+++ b/swh/search/in_memory.py
	@@ -3,8 +3,9 @@
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	-from collections import defaultdict
	+from collections import Counter, defaultdict
	from datetime import datetime, timezone
	+from itertools import chain
	import re
	from typing import Any, Dict, Iterable, Iterator, List, Optional

	@@ -288,11 +289,7 @@
	page_token: Optional[str] = None,
	limit: int = 50,
	) -> PagedResult[MinimalOriginDict]:
	- hits: Iterator[Dict[str, Any]] = (
	- self._origins[id_]
	- for id_ in self._origin_ids
	- if not self._origins[id_].get("blocklisted")
	- )
	+ hits = self._get_hits()

	if url_pattern:
	tokens = set(self._url_splitter.split(url_pattern))
	@@ -506,3 +503,14 @@
	assert len(origins) <= limit

	return PagedResult(results=origins, next_page_token=next_page_token,)
	+
	+ def visit_types_count(self) -> Counter:  # origin counts keyed by visit type
	+ hits = self._get_hits()  # non-blocklisted origin documents only
	+ return Counter(chain.from_iterable(hit.get("visit_types", []) for hit in hits))
	+
	+ def _get_hits(self) -> Iterator[Dict[str, Any]]:  # lazy; skips blocklisted origins
	+ return (
	+ self._origins[id_]
	+ for id_ in self._origin_ids
	+ if not self._origins[id_].get("blocklisted")
	+ )
	diff --git a/swh/search/interface.py b/swh/search/interface.py
	--- a/swh/search/interface.py
	+++ b/swh/search/interface.py
	@@ -3,6 +3,7 @@
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	+from collections import Counter
	from typing import Iterable, List, Optional, TypeVar

	from typing_extensions import TypedDict
	@@ -133,3 +134,9 @@

	"""
	...
	+
	+ @remote_api_endpoint("visit_types_count")
	+ def visit_types_count(self) -> Counter:
	+ """Returns origin counts per visit type (git, hg, svn, ...), as a Counter
	+ keyed by visit type; blocklisted origins are not counted."""
	+ ...
	diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
	--- a/swh/search/tests/test_search.py
	+++ b/swh/search/tests/test_search.py
	@@ -3,6 +3,7 @@
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information

	+from collections import Counter
	from datetime import datetime, timedelta, timezone
	from itertools import permutations

	@@ -1166,3 +1167,23 @@
	result_page = self.search.origin_search(url_pattern="baaz")
	assert result_page.next_page_token is None
	assert result_page.results == []
	+
	+ def test_visit_types_count(self):
	+ assert self.search.visit_types_count() == Counter()  # nothing indexed yet
	+
	+ origins = [  # starts with a blocklisted git origin, absent from the final counts
	+ {"url": "http://foobar.baz", "visit_types": ["git"], "blocklisted": True}
	+ ]
	+
	+ for idx, visit_type in enumerate(["git", "hg", "svn"]):
	+ for i in range(idx + 1):  # 1 git, 2 hg and 3 svn origins
	+ origins.append(
	+ {
	+ "url": f"http://{visit_type}.foobar.baz.{i}",
	+ "visit_types": [visit_type],
	+ }
	+ )
	+ self.search.origin_update(origins)
	+ self.search.flush()
	+
	+ assert self.search.visit_types_count() == Counter(git=1, hg=2, svn=3)

File Metadata

Mime Type: text/plain
Expires: Thu, Jul 3, 3:37 PM (1 w, 2 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3218923

D6137.id22219.diff
No OneTemporary
Actions

D6137.id22219.diff
View Options

File Metadata

Event Timeline

D6137.id22219.diff · No OneTemporary · Actions

D6137.id22219.diff · View Options

File Metadata

Event Timeline

D6137.id22219.diff
No OneTemporary
Actions

D6137.id22219.diff
View Options