diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
index 578b08b..e657096 100644
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,105 +1,135 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from collections import defaultdict
 import itertools
 import re
 from typing import Any, Dict, Iterable, Iterator, List, Optional

 from swh.model.identifiers import origin_identifier
 from swh.search.interface import PagedResult

+_words_regexp = re.compile(r"\w+")
+
+
+def _dict_words_set(d):
+    """Recursively extract set of words from dict content."""
+    values = set()
+
+    def extract(obj, words):
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                extract(v, words)
+        elif isinstance(obj, list):
+            for item in obj:
+                extract(item, words)
+        else:
+            words.update(_words_regexp.findall(str(obj).lower()))
+        return words
+
+    return extract(d, values)
+
+
 class InMemorySearch:
     def __init__(self):
         pass

     def check(self):
         return True

     def deinitialize(self) -> None:
         if hasattr(self, "_origins"):
             del self._origins
             del self._origin_ids

     def initialize(self) -> None:
         self._origins: Dict[str, Dict[str, Any]] = defaultdict(dict)
         self._origin_ids: List[str] = []

     def flush(self) -> None:
         pass

     _url_splitter = re.compile(r"\W")

     def origin_update(self, documents: Iterable[Dict]) -> None:
         for document in documents:
             document = document.copy()
             id_ = origin_identifier(document)
             if "url" in document:
                 document["_url_tokens"] = set(self._url_splitter.split(document["url"]))
             self._origins[id_].update(document)
             if id_ not in self._origin_ids:
                 self._origin_ids.append(id_)

     def origin_search(
         self,
         *,
         url_pattern: Optional[str] = None,
         metadata_pattern: Optional[str] = None,
         with_visit: bool = False,
         page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[Dict[str, Any]]:
         hits: Iterator[Dict[str, Any]] = (
             self._origins[id_] for id_ in self._origin_ids
         )
         if url_pattern:
             tokens = set(self._url_splitter.split(url_pattern))

             def predicate(match):
                 missing_tokens = tokens - match["_url_tokens"]
                 if len(missing_tokens) == 0:
                     return True
                 elif len(missing_tokens) > 1:
                     return False
                 else:
                     # There is one missing token, look up by prefix.
                     (missing_token,) = missing_tokens
                     return any(
                         token.startswith(missing_token)
                         for token in match["_url_tokens"]
                     )

             hits = filter(predicate, hits)

         if metadata_pattern:
-            raise NotImplementedError(
-                "Metadata search is not implemented in the in-memory backend."
+            metadata_pattern_words = set(
+                _words_regexp.findall(metadata_pattern.lower())
             )
+
+            def predicate(match):
+                if "intrinsic_metadata" not in match:
+                    return False
+
+                return metadata_pattern_words.issubset(
+                    _dict_words_set(match["intrinsic_metadata"])
+                )
+
+            hits = filter(predicate, hits)
+
         if not url_pattern and not metadata_pattern:
             raise ValueError(
                 "At least one of url_pattern and metadata_pattern must be provided."
             )

         next_page_token: Optional[str] = None

         if with_visit:
             hits = filter(lambda o: o.get("has_visits"), hits)

         start_at_index = int(page_token) if page_token else 0

         origins = [
             {"url": hit["url"]}
             for hit in itertools.islice(hits, start_at_index, start_at_index + limit)
         ]

         if len(origins) == limit:
             next_page_token = str(start_at_index + limit)

         assert len(origins) <= limit
         return PagedResult(results=origins, next_page_token=next_page_token,)
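
For illustration, a minimal sketch (not part of the diff) of what the new _dict_words_set helper computes and how the metadata predicate uses it; the metadata dict below is a made-up codemeta-style example:

    # Illustrative only: nested metadata values are flattened into a word set.
    from swh.search.in_memory import _dict_words_set

    metadata = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "description": "foo bar",
        "keywords": ["Baz", "qux"],
    }
    words = _dict_words_set(metadata)

    # Values are stringified, lowercased, and split on word characters;
    # dict keys are not indexed, only their values.
    assert {"foo", "bar", "baz", "qux"} <= words

    # A metadata_pattern matches when all of its words are present:
    assert {"foo", "baz"}.issubset(words)
    assert not {"foo", "nope"}.issubset(words)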
diff --git a/swh/search/tests/test_in_memory.py b/swh/search/tests/test_in_memory.py
index 7623b5e..6ee2c06 100644
--- a/swh/search/tests/test_in_memory.py
+++ b/swh/search/tests/test_in_memory.py
@@ -1,53 +1,25 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest

 import pytest

 from swh.search import get_search

 from .test_search import CommonSearchTest


 class InmemorySearchTest(unittest.TestCase, CommonSearchTest):
     @pytest.fixture(autouse=True)
     def _instantiate_search(self):
         self.search = get_search("memory")

     def setUp(self):
         self.reset()

     def reset(self):
         self.search.deinitialize()
         self.search.initialize()
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_description(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_all_terms(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_nested(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_paging(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_inconsistent_type(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_matches_cross_fields(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_long_description(self):
-        pass
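
With the skip markers removed, the intrinsic-metadata tests now exercise the in-memory backend like any other test. One way to run just those tests, sketched via pytest's Python entry point (invocation details are illustrative):

    # Illustrative: run only the intrinsic-metadata tests for this backend.
    import pytest

    pytest.main(["swh/search/tests/test_in_memory.py", "-k", "intrinsic_metadata"])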
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
index 6c81218..821ba83 100644
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,406 +1,408 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from hypothesis import given, settings, strategies

 from swh.core.api.classes import stream_results


 class CommonSearchTest:
     def test_origin_url_unique_word_prefix(self):
         origin_foobar_baz = {"url": "http://foobar.baz"}
         origin_barbaz_qux = {"url": "http://barbaz.qux"}
         origin_qux_quux = {"url": "http://qux.quux"}
         origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux]

         self.search.origin_update(origins)
         self.search.flush()

         actual_page = self.search.origin_search(url_pattern="foobar")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin_foobar_baz]

         actual_page = self.search.origin_search(url_pattern="barb")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin_barbaz_qux]

         # 'bar' is part of 'foobar', but is not the beginning of it
         actual_page = self.search.origin_search(url_pattern="bar")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin_barbaz_qux]

         actual_page = self.search.origin_search(url_pattern="barbaz")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin_barbaz_qux]

     def test_origin_url_unique_word_prefix_multiple_results(self):
         origin_foobar_baz = {"url": "http://foobar.baz"}
         origin_barbaz_qux = {"url": "http://barbaz.qux"}
         origin_qux_quux = {"url": "http://qux.quux"}

         self.search.origin_update(
             [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux]
         )
         self.search.flush()

         actual_page = self.search.origin_search(url_pattern="qu")
         assert actual_page.next_page_token is None
         results = [r["url"] for r in actual_page.results]
         expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]]
         assert sorted(results) == sorted(expected_results)

         actual_page = self.search.origin_search(url_pattern="qux")
         assert actual_page.next_page_token is None
         results = [r["url"] for r in actual_page.results]
         expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]]
         assert sorted(results) == sorted(expected_results)

     def test_origin_url_all_terms(self):
         origin_foo_bar_baz = {"url": "http://foo.bar/baz"}
         origin_foo_bar_foo_bar = {"url": "http://foo.bar/foo.bar"}
         origins = [origin_foo_bar_baz, origin_foo_bar_foo_bar]

         self.search.origin_update(origins)
         self.search.flush()

         # Only results containing all terms should be returned.
         actual_page = self.search.origin_search(url_pattern="foo bar baz")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin_foo_bar_baz]

     def test_origin_with_visit(self):
         origin_foobar_baz = {"url": "http://foobar/baz"}

         self.search.origin_update(
             [{**o, "has_visits": True} for o in [origin_foobar_baz]]
         )
         self.search.flush()

         actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin_foobar_baz]

     def test_origin_with_visit_added(self):
         origin_foobar_baz = {"url": "http://foobar.baz"}

         self.search.origin_update([origin_foobar_baz])
         self.search.flush()

         actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
         assert actual_page.next_page_token is None
         assert actual_page.results == []

         self.search.origin_update(
             [{**o, "has_visits": True} for o in [origin_foobar_baz]]
         )
         self.search.flush()

         actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin_foobar_baz]

     def test_origin_intrinsic_metadata_description(self):
         origin1_nothin = {"url": "http://origin1"}
         origin2_foobar = {"url": "http://origin2"}
         origin3_barbaz = {"url": "http://origin3"}

         self.search.origin_update(
             [
                 {**origin1_nothin, "intrinsic_metadata": {},},
                 {
                     **origin2_foobar,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": "foo bar",
                     },
                 },
                 {
                     **origin3_barbaz,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": "bar baz",
                     },
                 },
             ]
         )
         self.search.flush()

         actual_page = self.search.origin_search(metadata_pattern="foo")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin2_foobar]

         actual_page = self.search.origin_search(metadata_pattern="foo bar")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin2_foobar]

         actual_page = self.search.origin_search(metadata_pattern="bar baz")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin3_barbaz]

     def test_origin_intrinsic_metadata_all_terms(self):
         origin1_foobarfoobar = {"url": "http://origin1"}
         origin3_foobarbaz = {"url": "http://origin2"}

         self.search.origin_update(
             [
                 {
                     **origin1_foobarfoobar,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": "foo bar foo bar",
                     },
                 },
                 {
                     **origin3_foobarbaz,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": "foo bar baz",
                     },
                 },
             ]
         )
         self.search.flush()

         actual_page = self.search.origin_search(metadata_pattern="foo bar baz")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin3_foobarbaz]

     def test_origin_intrinsic_metadata_long_description(self):
         origin1 = {"url": "http://origin1"}

         self.search.origin_update(
             [
                 {
                     **origin1,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": " ".join(f"foo{i}" for i in range(1000000)),
                     },
                 },
             ]
         )
         self.search.flush()

         actual_page = self.search.origin_search(metadata_pattern="foo42")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin1]

     def test_origin_intrinsic_metadata_matches_cross_fields(self):
         """Checks the backend finds results even if the two words in the query are
         each in a different field."""
         origin1 = {"url": "http://origin1"}

         self.search.origin_update(
             [
                 {
                     **origin1,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": "foo bar",
                         "author": "John Doe",
                     },
                 },
             ]
         )
         self.search.flush()

         actual_page = self.search.origin_search(metadata_pattern="foo John")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin1]

     def test_origin_intrinsic_metadata_nested(self):
         origin1_nothin = {"url": "http://origin1"}
         origin2_foobar = {"url": "http://origin2"}
         origin3_barbaz = {"url": "http://origin3"}

         self.search.origin_update(
             [
                 {**origin1_nothin, "intrinsic_metadata": {},},
                 {
                     **origin2_foobar,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "keywords": ["foo", "bar"],
                     },
                 },
                 {
                     **origin3_barbaz,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "keywords": ["bar", "baz"],
                     },
                 },
             ]
         )
         self.search.flush()

         actual_page = self.search.origin_search(metadata_pattern="foo")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin2_foobar]

         actual_page = self.search.origin_search(metadata_pattern="foo bar")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin2_foobar]

         actual_page = self.search.origin_search(metadata_pattern="bar baz")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin3_barbaz]

     def test_origin_intrinsic_metadata_inconsistent_type(self):
         """Checks the same field can have a concrete value, an object, or an array
         in different documents."""
         origin1_foobar = {"url": "http://origin1"}
         origin2_barbaz = {"url": "http://origin2"}
         origin3_bazqux = {"url": "http://origin3"}

         self.search.origin_update(
             [
                 {
                     **origin1_foobar,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "author": {"familyName": "Foo", "givenName": "Bar",},
                     },
                 },
             ]
         )
         self.search.flush()
         self.search.origin_update(
             [
                 {
                     **origin2_barbaz,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "author": "Bar Baz",
                     },
                 },
                 {
                     **origin3_bazqux,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "author": ["Baz", "Qux"],
                     },
                 },
             ]
         )
         self.search.flush()

         actual_page = self.search.origin_search(metadata_pattern="bar")
         assert actual_page.next_page_token is None
-        assert actual_page.results == [origin2_barbaz, origin1_foobar]
+        results = [r["url"] for r in actual_page.results]
+        expected_results = [o["url"] for o in [origin2_barbaz, origin1_foobar]]
+        assert sorted(results) == sorted(expected_results)

         actual_page = self.search.origin_search(metadata_pattern="baz")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin2_barbaz, origin3_bazqux]

         actual_page = self.search.origin_search(metadata_pattern="foo")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin1_foobar]

         actual_page = self.search.origin_search(metadata_pattern="bar baz")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin2_barbaz]

         actual_page = self.search.origin_search(metadata_pattern="qux")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin3_bazqux]

         actual_page = self.search.origin_search(metadata_pattern="baz qux")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin3_bazqux]

         actual_page = self.search.origin_search(metadata_pattern="foo bar")
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin1_foobar]

     # TODO: add more tests with more codemeta terms

     # TODO: add more tests with edge cases

     @settings(deadline=None)
     @given(strategies.integers(min_value=1, max_value=4))
     def test_origin_url_paging(self, limit):
         # TODO: no hypothesis
         origin1_foo = {"url": "http://origin1/foo"}
         origin2_foobar = {"url": "http://origin2/foo/bar"}
         origin3_foobarbaz = {"url": "http://origin3/foo/bar/baz"}

         self.reset()
         self.search.origin_update([origin1_foo, origin2_foobar, origin3_foobarbaz])
         self.search.flush()

         results = stream_results(
             self.search.origin_search, url_pattern="foo bar baz", limit=limit
         )
         results = [res["url"] for res in results]
         expected_results = [o["url"] for o in [origin3_foobarbaz]]
         assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)

         results = stream_results(
             self.search.origin_search, url_pattern="foo bar", limit=limit
         )
         results = [res["url"] for res in results]
         expected_results = [o["url"] for o in [origin2_foobar, origin3_foobarbaz]]
         assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)

         results = stream_results(
             self.search.origin_search, url_pattern="foo", limit=limit
         )
         results = [res["url"] for res in results]
         expected_results = [
             o["url"] for o in [origin1_foo, origin2_foobar, origin3_foobarbaz]
         ]
         assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)

     @settings(deadline=None)
     @given(strategies.integers(min_value=1, max_value=4))
     def test_origin_intrinsic_metadata_paging(self, limit):
         # TODO: no hypothesis
         origin1_foo = {"url": "http://origin1"}
         origin2_foobar = {"url": "http://origin2"}
         origin3_foobarbaz = {"url": "http://origin3"}

         self.reset()
         self.search.origin_update(
             [
                 {
                     **origin1_foo,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "keywords": ["foo"],
                     },
                 },
                 {
                     **origin2_foobar,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "keywords": ["foo", "bar"],
                     },
                 },
                 {
                     **origin3_foobarbaz,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "keywords": ["foo", "bar", "baz"],
                     },
                 },
             ]
         )
         self.search.flush()

         results = stream_results(
             self.search.origin_search, metadata_pattern="foo bar baz", limit=limit
         )
         assert list(results) == [origin3_foobarbaz]

         results = stream_results(
             self.search.origin_search, metadata_pattern="foo bar", limit=limit
         )
         assert list(results) == [origin2_foobar, origin3_foobarbaz]

         results = stream_results(
             self.search.origin_search, metadata_pattern="foo", limit=limit
         )
         assert list(results) == [origin1_foo, origin2_foobar, origin3_foobarbaz]
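
Putting the pieces together, a hedged sketch of client-side usage of the new metadata search, paging by hand the way stream_results does in the tests above (the example origin and its metadata are made up):

    # Illustrative pagination over metadata search on the in-memory backend.
    from swh.search import get_search

    search = get_search("memory")
    search.initialize()
    search.origin_update(
        [
            {
                "url": "http://origin1",
                "intrinsic_metadata": {"description": "foo bar"},
            },
        ]
    )
    search.flush()

    page_token = None
    while True:
        page = search.origin_search(
            metadata_pattern="foo", page_token=page_token, limit=10
        )
        for origin in page.results:
            print(origin["url"])
        page_token = page.next_page_token
        if page_token is None:
            break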