Page MenuHomeSoftware Heritage

D8597.id31049.diff
No OneTemporary

D8597.id31049.diff

diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -10,7 +10,7 @@
from textwrap import dedent
from typing import Any, Dict, Iterable, List, Optional, cast
-from elasticsearch import Elasticsearch, helpers
+from elasticsearch import Elasticsearch, NotFoundError, helpers
import msgpack
from swh.indexer import codemeta
@@ -387,6 +387,19 @@
"document:index_error", count=len(errors), method_name="origin_update"
)
+ def origin_get(self, urls: List[str]) -> List[Dict[str, str]]:
+ origin_ids = [hash_to_hex(model.Origin(url=url).id) for url in urls]
+ documents = []
+ for id_ in origin_ids:
+ try:
+ documents.append(
+ self._backend.get(index=self._get_origin_read_alias(), id=id_)
+ )
+ except NotFoundError:
+ pass
+
+ return [doc["_source"] for doc in documents]
+
@timed
def origin_search(
self,
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -171,8 +171,11 @@
def origin_update(self, documents: Iterable[OriginDict]) -> None:
for source_document in documents:
- document: Dict[str, Any] = dict(source_document)
- id_ = hash_to_hex(model.Origin(url=document["url"]).id)
+ id_ = hash_to_hex(model.Origin(url=source_document["url"]).id)
+ document: Dict[str, Any] = {
+ **source_document,
+ "sha1": id_,
+ }
if "url" in document:
document["_url_tokens"] = set(
self._url_splitter.split(source_document["url"])
@@ -521,6 +524,14 @@
next_page_token=next_page_token,
)
+ def origin_get(self, urls: List[str]) -> List[Dict[str, str]]:
+ origin_ids = [hash_to_hex(model.Origin(url=url).id) for url in urls]
+ return [
+ {k: v for (k, v) in self._origins[id_].items() if k != "_url_tokens"}
+ for id_ in origin_ids
+ if id_ in self._origins
+ ]
+
def visit_types_count(self) -> Counter:
hits = self._get_hits()
return Counter(chain(*[hit.get("visit_types", []) for hit in hits]))
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
from collections import Counter
-from typing import Iterable, List, Optional, TypeVar
+from typing import Dict, Iterable, List, Optional, TypeVar
from typing_extensions import TypedDict
@@ -132,6 +132,13 @@
"""
...
+ @remote_api_endpoint("origin/dump")
+ def origin_get(self, urls: List[str]) -> List[Dict[str, str]]:
+ """Returns the full documents associated to the given origin URLs.
+
+ Order is arbitrary; unknown origins are not returned.
+ """
+
@remote_api_endpoint("visit_types_count")
def visit_types_count(self) -> Counter:
"""Returns origin counts per visit type (git, hg, svn, ...)."""
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -5,6 +5,7 @@
from collections import Counter
from datetime import datetime, timedelta, timezone
+import hashlib
from itertools import permutations
from hypothesis import given, settings, strategies
@@ -1227,7 +1228,7 @@
assert result_page.next_page_token is None
assert result_page.results == []
- def test_filter_keyword_in_filter(self):
+ def test_search_filter_keyword_in_filter(self):
origin1 = {
"url": "foo language in ['foo baz'] bar",
}
@@ -1242,6 +1243,98 @@
assert result_page.next_page_token is None
assert result_page.results == []
+ def test_origin_get(self):
+        """Checks origin_get returns the expanded documents, and that the same
+        field can hold a concrete value, an object, or an array per document."""
+ origin1 = {"url": "http://origin1"}
+ origin2 = {"url": "http://origin2"}
+ origin3 = {"url": "http://origin3"}
+ origins = [
+ {
+ **origin1,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": {
+ "familyName": "Foo",
+ "givenName": "Bar",
+ },
+ },
+ },
+ {
+ **origin2,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": "Bar Baz",
+ },
+ },
+ {
+ **origin3,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": ["Baz", "Qux"],
+ },
+ },
+ ]
+
+ expanded_origins = [
+ {
+ **origin1,
+ "sha1": hashlib.sha1(origin1["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {
+ "@list": [
+ {
+ "http://schema.org/familyName": [
+ {"@value": "Foo"}
+ ],
+ "http://schema.org/givenName": [
+ {"@value": "Bar"}
+ ],
+ }
+ ]
+ }
+ ],
+ }
+ ],
+ },
+ {
+ **origin2,
+ "sha1": hashlib.sha1(origin2["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {"@list": [{"@value": "Bar Baz"}]}
+ ],
+ }
+ ],
+ },
+ {
+ **origin3,
+ "sha1": hashlib.sha1(origin3["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {"@list": [{"@value": "Baz"}, {"@value": "Qux"}]}
+ ],
+ }
+ ],
+ },
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ results = self.search.origin_get([origin1["url"], origin2["url"]])
+        # Sort by URL for a deterministic comparison across backends.
+ results.sort(key=lambda d: d["url"])
+ assert results == expanded_origins[0:2]
+
+ assert self.search.origin_get(["http://origin4", origin3["url"]]) == [
+ expanded_origins[2]
+ ]
+
def test_visit_types_count(self):
assert self.search.visit_types_count() == Counter()

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 7:54 AM (10 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233109

Event Timeline