Page MenuHomeSoftware Heritage

D4722.id16726.diff
No OneTemporary

D4722.id16726.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,5 @@
# Add here internal Software Heritage dependencies, one per line.
swh.core[http] >= 0.3.0
+swh.indexer
swh.journal >= 0.1.0
swh.model
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -10,6 +10,7 @@
from elasticsearch.helpers import bulk, scan
import msgpack
+from swh.indexer import codemeta
from swh.model import model
from swh.model.identifiers import origin_identifier
from swh.search.interface import PagedResult
@@ -21,6 +22,8 @@
for field_name in ("intrinsic_metadata", "has_visits"):
if field_name in origin:
res[field_name] = origin.pop(field_name)
+ if "intrinsic_metadata" in res:
+ res["intrinsic_metadata"] = codemeta.expand(res["intrinsic_metadata"])
return res
diff --git a/swh/search/tests/test_in_memory.py b/swh/search/tests/test_in_memory.py
--- a/swh/search/tests/test_in_memory.py
+++ b/swh/search/tests/test_in_memory.py
@@ -39,3 +39,7 @@
@pytest.mark.skip("Not implemented in the in-memory search")
def test_origin_intrinsic_metadata_paging(self):
pass
+
+ @pytest.mark.skip("Not implemented in the in-memory search")
+ def test_origin_intrinsic_metadata_inconsistent_type(self):
+ pass
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -205,6 +205,75 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin3_barbaz]
+ def test_origin_intrinsic_metadata_inconsistent_type(self):
+ """Checks the same field can have a concrete value, an object, or an array
+ in different documents."""
+ origin1_foobar = {"url": "http://origin1"}
+ origin2_barbaz = {"url": "http://origin2"}
+ origin3_bazqux = {"url": "http://origin3"}
+
+ self.search.origin_update(
+ [
+ {
+ **origin1_foobar,
+ "intrinsic_metadata": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": {"familyName": "Foo", "givenName": "Bar",},
+ },
+ },
+ ]
+ )
+ self.search.flush()
+ self.search.origin_update(
+ [
+ {
+ **origin2_barbaz,
+ "intrinsic_metadata": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": "Bar Baz",
+ },
+ },
+ {
+ **origin3_bazqux,
+ "intrinsic_metadata": {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "author": ["Baz", "Qux"],
+ },
+ },
+ ]
+ )
+ self.search.flush()
+
+ actual_page = self.search.origin_search(metadata_pattern="bar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_barbaz, origin1_foobar]
+
+ actual_page = self.search.origin_search(metadata_pattern="baz")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_barbaz, origin3_bazqux]
+
+ actual_page = self.search.origin_search(metadata_pattern="foo")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin1_foobar]
+
+ actual_page = self.search.origin_search(metadata_pattern="bar baz")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_barbaz]
+
+ actual_page = self.search.origin_search(metadata_pattern="qux")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin3_bazqux]
+
+ actual_page = self.search.origin_search(metadata_pattern="baz qux")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin3_bazqux]
+
+ # FIXME: the following won't work because "foo" and "bar" are not in the
+ # same field.
+ # actual_page = self.search.origin_search(metadata_pattern="foo bar")
+ # assert actual_page.next_page_token is None
+ # assert actual_page.results == [origin1_foobar]
+
# TODO: add more tests with more codemeta terms
# TODO: add more tests with edge cases

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:37 PM (1 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218276

Event Timeline