Page MenuHomeSoftware Heritage

D8597.id31049.diff
No OneTemporary

D8597.id31049.diff

diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -10,7 +10,7 @@
from textwrap import dedent
from typing import Any, Dict, Iterable, List, Optional, cast
-from elasticsearch import Elasticsearch, helpers
+from elasticsearch import Elasticsearch, NotFoundError, helpers
import msgpack
from swh.indexer import codemeta
@@ -387,6 +387,19 @@
"document:index_error", count=len(errors), method_name="origin_update"
)
+ def origin_get(self, urls: List[str]) -> List[Dict[str, str]]:
+ origin_ids = [hash_to_hex(model.Origin(url=url).id) for url in urls]
+ documents = []
+ for id_ in origin_ids:
+ try:
+ documents.append(
+ self._backend.get(index=self._get_origin_read_alias(), id=id_)
+ )
+ except NotFoundError:
+ pass
+
+ return [doc["_source"] for doc in documents]
+
@timed
def origin_search(
self,
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -171,8 +171,11 @@
def origin_update(self, documents: Iterable[OriginDict]) -> None:
for source_document in documents:
- document: Dict[str, Any] = dict(source_document)
- id_ = hash_to_hex(model.Origin(url=document["url"]).id)
+ id_ = hash_to_hex(model.Origin(url=source_document["url"]).id)
+ document: Dict[str, Any] = {
+ **source_document,
+ "sha1": id_,
+ }
if "url" in document:
document["_url_tokens"] = set(
self._url_splitter.split(source_document["url"])
@@ -521,6 +524,14 @@
next_page_token=next_page_token,
)
+ def origin_get(self, urls: List[str]) -> List[Dict[str, str]]:
+ origin_ids = [hash_to_hex(model.Origin(url=url).id) for url in urls]
+ return [
+ {k: v for (k, v) in self._origins[id_].items() if k != "_url_tokens"}
+ for id_ in origin_ids
+ if id_ in self._origins
+ ]
+
def visit_types_count(self) -> Counter:
hits = self._get_hits()
return Counter(chain(*[hit.get("visit_types", []) for hit in hits]))
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
from collections import Counter
-from typing import Iterable, List, Optional, TypeVar
+from typing import Dict, Iterable, List, Optional, TypeVar
from typing_extensions import TypedDict
@@ -132,6 +132,13 @@
"""
...
+ @remote_api_endpoint("origin/dump")
+ def origin_get(self, urls: List[str]) -> List[Dict[str, str]]:
+ """Returns the full documents associated to the given origin URLs.
+
+ Order is arbitrary; unknown origins are not returned.
+ """
+
@remote_api_endpoint("visit_types_count")
def visit_types_count(self) -> Counter:
"""Returns origin counts per visit type (git, hg, svn, ...)."""
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -5,6 +5,7 @@
from collections import Counter
from datetime import datetime, timedelta, timezone
+import hashlib
from itertools import permutations
from hypothesis import given, settings, strategies
@@ -1227,7 +1228,7 @@
assert result_page.next_page_token is None
assert result_page.results == []
- def test_filter_keyword_in_filter(self):
+ def test_search_filter_keyword_in_filter(self):
origin1 = {
"url": "foo language in ['foo baz'] bar",
}
@@ -1242,6 +1243,98 @@
assert result_page.next_page_token is None
assert result_page.results == []
+ def test_origin_get(self):
+        """Checks origin_get returns the expanded documents, and that the same
+        field can hold a concrete value, an object, or an array per document."""
+ origin1 = {"url": "http://origin1"}
+ origin2 = {"url": "http://origin2"}
+ origin3 = {"url": "http://origin3"}
+ origins = [
+ {
+ **origin1,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": {
+ "familyName": "Foo",
+ "givenName": "Bar",
+ },
+ },
+ },
+ {
+ **origin2,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": "Bar Baz",
+ },
+ },
+ {
+ **origin3,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": ["Baz", "Qux"],
+ },
+ },
+ ]
+
+ expanded_origins = [
+ {
+ **origin1,
+ "sha1": hashlib.sha1(origin1["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {
+ "@list": [
+ {
+ "http://schema.org/familyName": [
+ {"@value": "Foo"}
+ ],
+ "http://schema.org/givenName": [
+ {"@value": "Bar"}
+ ],
+ }
+ ]
+ }
+ ],
+ }
+ ],
+ },
+ {
+ **origin2,
+ "sha1": hashlib.sha1(origin2["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {"@list": [{"@value": "Bar Baz"}]}
+ ],
+ }
+ ],
+ },
+ {
+ **origin3,
+ "sha1": hashlib.sha1(origin3["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {"@list": [{"@value": "Baz"}, {"@value": "Qux"}]}
+ ],
+ }
+ ],
+ },
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ results = self.search.origin_get([origin1["url"], origin2["url"]])
+        # Sort by URL for a deterministic comparison across backends.
+ results.sort(key=lambda d: d["url"])
+ assert results == expanded_origins[0:2]
+
+ assert self.search.origin_get(["http://origin4", origin3["url"]]) == [
+ expanded_origins[2]
+ ]
+
def test_visit_types_count(self):
assert self.search.visit_types_count() == Counter()

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 7:54 AM (10 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233109

Event Timeline