Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066182
D8597.id31199.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D8597.id31199.diff
View Options
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -10,7 +10,7 @@
from textwrap import dedent
from typing import Any, Dict, Iterable, List, Optional, cast
-from elasticsearch import Elasticsearch, helpers
+from elasticsearch import Elasticsearch, NotFoundError, helpers
import msgpack
from swh.indexer import codemeta
@@ -387,6 +387,17 @@
"document:index_error", count=len(errors), method_name="origin_update"
)
+ def origin_get(self, url: str) -> Optional[Dict[str, str]]:
+ origin_id = hash_to_hex(model.Origin(url=url).id)
+ try:
+ document = self._backend.get(
+ index=self._get_origin_read_alias(), id=origin_id
+ )
+ except NotFoundError:
+ return None
+ else:
+ return document["_source"]
+
@timed
def origin_search(
self,
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -171,8 +171,11 @@
def origin_update(self, documents: Iterable[OriginDict]) -> None:
for source_document in documents:
- document: Dict[str, Any] = dict(source_document)
- id_ = hash_to_hex(model.Origin(url=document["url"]).id)
+ id_ = hash_to_hex(model.Origin(url=source_document["url"]).id)
+ document: Dict[str, Any] = {
+ **source_document,
+ "sha1": id_,
+ }
if "url" in document:
document["_url_tokens"] = set(
self._url_splitter.split(source_document["url"])
@@ -521,6 +524,14 @@
next_page_token=next_page_token,
)
+ def origin_get(self, url: str) -> Optional[Dict[str, str]]:
+ origin_id = hash_to_hex(model.Origin(url=url).id)
+ document = self._origins.get(origin_id)
+ if document is None:
+ return None
+ else:
+ return {k: v for (k, v) in document.items() if k != "_url_tokens"}
+
def visit_types_count(self) -> Counter:
hits = self._get_hits()
return Counter(chain(*[hit.get("visit_types", []) for hit in hits]))
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
from collections import Counter
-from typing import Iterable, List, Optional, TypeVar
+from typing import Dict, Iterable, List, Optional, TypeVar
from typing_extensions import TypedDict
@@ -132,6 +132,13 @@
"""
...
+ @remote_api_endpoint("origin/get")
+ def origin_get(self, url: str) -> Optional[Dict[str, str]]:
+ """Returns the full document associated to the given origin URL.
+
+ Returns None if the origin is unknown.
+ """
+
@remote_api_endpoint("visit_types_count")
def visit_types_count(self) -> Counter:
"""Returns origin counts per visit type (git, hg, svn, ...)."""
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -5,6 +5,7 @@
from collections import Counter
from datetime import datetime, timedelta, timezone
+import hashlib
from itertools import permutations
from hypothesis import given, settings, strategies
@@ -1227,7 +1228,7 @@
assert result_page.next_page_token is None
assert result_page.results == []
- def test_filter_keyword_in_filter(self):
+ def test_search_filter_keyword_in_filter(self):
origin1 = {
"url": "foo language in ['foo baz'] bar",
}
@@ -1242,6 +1243,94 @@
assert result_page.next_page_token is None
assert result_page.results == []
+ def test_origin_get(self):
+ """Checks origin_get returns the stored document when the same field has a
+ concrete value, an object, or an array in different documents."""
+ origin1 = {"url": "http://origin1"}
+ origin2 = {"url": "http://origin2"}
+ origin3 = {"url": "http://origin3"}
+ origins = [
+ {
+ **origin1,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": {
+ "familyName": "Foo",
+ "givenName": "Bar",
+ },
+ },
+ },
+ {
+ **origin2,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": "Bar Baz",
+ },
+ },
+ {
+ **origin3,
+ "jsonld": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": ["Baz", "Qux"],
+ },
+ },
+ ]
+
+ expanded_origins = [
+ {
+ **origin1,
+ "sha1": hashlib.sha1(origin1["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {
+ "@list": [
+ {
+ "http://schema.org/familyName": [
+ {"@value": "Foo"}
+ ],
+ "http://schema.org/givenName": [
+ {"@value": "Bar"}
+ ],
+ }
+ ]
+ }
+ ],
+ }
+ ],
+ },
+ {
+ **origin2,
+ "sha1": hashlib.sha1(origin2["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {"@list": [{"@value": "Bar Baz"}]}
+ ],
+ }
+ ],
+ },
+ {
+ **origin3,
+ "sha1": hashlib.sha1(origin3["url"].encode()).hexdigest(),
+ "jsonld": [
+ {
+ "http://schema.org/author": [
+ {"@list": [{"@value": "Baz"}, {"@value": "Qux"}]}
+ ],
+ }
+ ],
+ },
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ assert self.search.origin_get(origin1["url"]) == expanded_origins[0]
+ assert self.search.origin_get(origin2["url"]) == expanded_origins[1]
+ assert self.search.origin_get(origin3["url"]) == expanded_origins[2]
+ assert self.search.origin_get("http://origin4") is None
+
def test_visit_types_count(self):
assert self.search.visit_types_count() == Counter()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 4 2024, 9:20 PM (20 w, 9 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233077
Attached To
D8597: Add method 'origin_get' endpoint
Event Timeline
Log In to Comment