
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
index 578b08b..e657096 100644
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,105 +1,135 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from collections import defaultdict
import itertools
import re
from typing import Any, Dict, Iterable, Iterator, List, Optional

from swh.model.identifiers import origin_identifier

from swh.search.interface import PagedResult

+_words_regexp = re.compile(r"\w+")
+
+
+def _dict_words_set(d):
+    """Recursively extract set of words from dict content."""
+    values = set()
+
+    def extract(obj, words):
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                extract(v, words)
+        elif isinstance(obj, list):
+            for item in obj:
+                extract(item, words)
+        else:
+            words.update(_words_regexp.findall(str(obj).lower()))
+        return words
+
+    return extract(d, values)
+
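+# Illustration (hypothetical input):
+#     _dict_words_set({"description": "Foo bar", "keywords": ["baz"]})
+#     == {"foo", "bar", "baz"}
+# Dict keys are ignored and words are lowercased.
+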
class InMemorySearch:
    def __init__(self):
        pass

    def check(self):
        return True

    def deinitialize(self) -> None:
        if hasattr(self, "_origins"):
            del self._origins
            del self._origin_ids

    def initialize(self) -> None:
        self._origins: Dict[str, Dict[str, Any]] = defaultdict(dict)
        self._origin_ids: List[str] = []

    def flush(self) -> None:
        pass

    _url_splitter = re.compile(r"\W")

    def origin_update(self, documents: Iterable[Dict]) -> None:
        for document in documents:
            document = document.copy()
            id_ = origin_identifier(document)
            if "url" in document:
                document["_url_tokens"] = set(self._url_splitter.split(document["url"]))
            self._origins[id_].update(document)
            if id_ not in self._origin_ids:
                self._origin_ids.append(id_)
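
    # Note: repeated updates for the same origin are merged field by field
    # (dict.update above); e.g. a later {"url": ..., "has_visits": True}
    # document augments a previously indexed one instead of replacing it.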

    def origin_search(
        self,
        *,
        url_pattern: Optional[str] = None,
        metadata_pattern: Optional[str] = None,
        with_visit: bool = False,
        page_token: Optional[str] = None,
        limit: int = 50,
    ) -> PagedResult[Dict[str, Any]]:
        hits: Iterator[Dict[str, Any]] = (
            self._origins[id_] for id_ in self._origin_ids
        )

        if url_pattern:
            tokens = set(self._url_splitter.split(url_pattern))

            def predicate(match):
                missing_tokens = tokens - match["_url_tokens"]
                if len(missing_tokens) == 0:
                    return True
                elif len(missing_tokens) > 1:
                    return False
                else:
                    # There is one missing token, look up by prefix.
                    (missing_token,) = missing_tokens
                    return any(
                        token.startswith(missing_token)
                        for token in match["_url_tokens"]
                    )

            hits = filter(predicate, hits)
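            # Illustration (hypothetical values): url_pattern "foo ba" yields
            # tokens {"foo", "ba"}; an origin whose _url_tokens include
            # {"http", "foo", "bar"} matches, because the one missing token
            # "ba" is a prefix of the stored token "bar".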

        if metadata_pattern:
-            raise NotImplementedError(
-                "Metadata search is not implemented in the in-memory backend."
+            metadata_pattern_words = set(
+                _words_regexp.findall(metadata_pattern.lower())
            )

+            def predicate(match):
+                if "intrinsic_metadata" not in match:
+                    return False
+
+                return metadata_pattern_words.issubset(
+                    _dict_words_set(match["intrinsic_metadata"])
+                )
+
+            hits = filter(predicate, hits)
+
        if not url_pattern and not metadata_pattern:
            raise ValueError(
                "At least one of url_pattern and metadata_pattern must be provided."
            )
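
        # Pagination scheme: page_token is the stringified index of the first
        # hit to return; a next token (start + limit) is emitted only when the
        # returned page is full.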

        next_page_token: Optional[str] = None

        if with_visit:
            hits = filter(lambda o: o.get("has_visits"), hits)

        start_at_index = int(page_token) if page_token else 0

        origins = [
            {"url": hit["url"]}
            for hit in itertools.islice(hits, start_at_index, start_at_index + limit)
        ]

        if len(origins) == limit:
            next_page_token = str(start_at_index + limit)

        assert len(origins) <= limit
        return PagedResult(results=origins, next_page_token=next_page_token,)
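
# Usage sketch (hypothetical, for illustration):
#
#     s = InMemorySearch()
#     s.initialize()
#     s.origin_update(
#         [{"url": "http://example.org/repo",
#           "intrinsic_metadata": {"description": "foo bar"}}]
#     )
#     s.flush()
#     page = s.origin_search(metadata_pattern="foo")
#     assert [r["url"] for r in page.results] == ["http://example.org/repo"]
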
diff --git a/swh/search/tests/test_in_memory.py b/swh/search/tests/test_in_memory.py
index 7623b5e..6ee2c06 100644
--- a/swh/search/tests/test_in_memory.py
+++ b/swh/search/tests/test_in_memory.py
@@ -1,53 +1,25 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest

import pytest

from swh.search import get_search

from .test_search import CommonSearchTest


class InmemorySearchTest(unittest.TestCase, CommonSearchTest):
    @pytest.fixture(autouse=True)
    def _instantiate_search(self):
        self.search = get_search("memory")

    def setUp(self):
        self.reset()

    def reset(self):
        self.search.deinitialize()
        self.search.initialize()
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_description(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_all_terms(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_nested(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_paging(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_inconsistent_type(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_matches_cross_fields(self):
-        pass
-
-    @pytest.mark.skip("Not implemented in the in-memory search")
-    def test_origin_intrinsic_metadata_long_description(self):
-        pass
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
index 6c81218..821ba83 100644
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,406 +1,408 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from hypothesis import given, settings, strategies

from swh.core.api.classes import stream_results


class CommonSearchTest:
    def test_origin_url_unique_word_prefix(self):
        origin_foobar_baz = {"url": "http://foobar.baz"}
        origin_barbaz_qux = {"url": "http://barbaz.qux"}
        origin_qux_quux = {"url": "http://qux.quux"}
        origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux]

        self.search.origin_update(origins)
        self.search.flush()

        actual_page = self.search.origin_search(url_pattern="foobar")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin_foobar_baz]

        actual_page = self.search.origin_search(url_pattern="barb")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin_barbaz_qux]

        # 'bar' is part of 'foobar', but is not the beginning of it
        actual_page = self.search.origin_search(url_pattern="bar")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin_barbaz_qux]

        actual_page = self.search.origin_search(url_pattern="barbaz")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin_barbaz_qux]

    def test_origin_url_unique_word_prefix_multiple_results(self):
        origin_foobar_baz = {"url": "http://foobar.baz"}
        origin_barbaz_qux = {"url": "http://barbaz.qux"}
        origin_qux_quux = {"url": "http://qux.quux"}

        self.search.origin_update(
            [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux]
        )
        self.search.flush()

        actual_page = self.search.origin_search(url_pattern="qu")
        assert actual_page.next_page_token is None
        results = [r["url"] for r in actual_page.results]
        expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]]
        assert sorted(results) == sorted(expected_results)

        actual_page = self.search.origin_search(url_pattern="qux")
        assert actual_page.next_page_token is None
        results = [r["url"] for r in actual_page.results]
        expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]]
        assert sorted(results) == sorted(expected_results)

    def test_origin_url_all_terms(self):
        origin_foo_bar_baz = {"url": "http://foo.bar/baz"}
        origin_foo_bar_foo_bar = {"url": "http://foo.bar/foo.bar"}
        origins = [origin_foo_bar_baz, origin_foo_bar_foo_bar]

        self.search.origin_update(origins)
        self.search.flush()

        # Only results containing all terms should be returned.
        actual_page = self.search.origin_search(url_pattern="foo bar baz")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin_foo_bar_baz]

    def test_origin_with_visit(self):
        origin_foobar_baz = {"url": "http://foobar/baz"}

        self.search.origin_update(
            [{**o, "has_visits": True} for o in [origin_foobar_baz]]
        )
        self.search.flush()

        actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin_foobar_baz]

    def test_origin_with_visit_added(self):
        origin_foobar_baz = {"url": "http://foobar.baz"}

        self.search.origin_update([origin_foobar_baz])
        self.search.flush()

        actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
        assert actual_page.next_page_token is None
        assert actual_page.results == []

        self.search.origin_update(
            [{**o, "has_visits": True} for o in [origin_foobar_baz]]
        )
        self.search.flush()

        actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin_foobar_baz]

    def test_origin_intrinsic_metadata_description(self):
        origin1_nothin = {"url": "http://origin1"}
        origin2_foobar = {"url": "http://origin2"}
        origin3_barbaz = {"url": "http://origin3"}

        self.search.origin_update(
            [
                {**origin1_nothin, "intrinsic_metadata": {},},
                {
                    **origin2_foobar,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "description": "foo bar",
                    },
                },
                {
                    **origin3_barbaz,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "description": "bar baz",
                    },
                },
            ]
        )
        self.search.flush()

        actual_page = self.search.origin_search(metadata_pattern="foo")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin2_foobar]

        actual_page = self.search.origin_search(metadata_pattern="foo bar")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin2_foobar]

        actual_page = self.search.origin_search(metadata_pattern="bar baz")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin3_barbaz]

    def test_origin_intrinsic_metadata_all_terms(self):
        origin1_foobarfoobar = {"url": "http://origin1"}
        origin3_foobarbaz = {"url": "http://origin2"}

        self.search.origin_update(
            [
                {
                    **origin1_foobarfoobar,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "description": "foo bar foo bar",
                    },
                },
                {
                    **origin3_foobarbaz,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "description": "foo bar baz",
                    },
                },
            ]
        )
        self.search.flush()

        actual_page = self.search.origin_search(metadata_pattern="foo bar baz")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin3_foobarbaz]

    def test_origin_intrinsic_metadata_long_description(self):
        origin1 = {"url": "http://origin1"}

        self.search.origin_update(
            [
                {
                    **origin1,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "description": " ".join(f"foo{i}" for i in range(1000000)),
                    },
                },
            ]
        )
        self.search.flush()

        actual_page = self.search.origin_search(metadata_pattern="foo42")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin1]

    def test_origin_intrinsic_metadata_matches_cross_fields(self):
        """Checks the backend finds results even if the two words in the query are
        each in a different field."""
        origin1 = {"url": "http://origin1"}

        self.search.origin_update(
            [
                {
                    **origin1,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "description": "foo bar",
                        "author": "John Doe",
                    },
                },
            ]
        )
        self.search.flush()

        actual_page = self.search.origin_search(metadata_pattern="foo John")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin1]

    def test_origin_intrinsic_metadata_nested(self):
        origin1_nothin = {"url": "http://origin1"}
        origin2_foobar = {"url": "http://origin2"}
        origin3_barbaz = {"url": "http://origin3"}

        self.search.origin_update(
            [
                {**origin1_nothin, "intrinsic_metadata": {},},
                {
                    **origin2_foobar,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "keywords": ["foo", "bar"],
                    },
                },
                {
                    **origin3_barbaz,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "keywords": ["bar", "baz"],
                    },
                },
            ]
        )
        self.search.flush()

        actual_page = self.search.origin_search(metadata_pattern="foo")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin2_foobar]

        actual_page = self.search.origin_search(metadata_pattern="foo bar")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin2_foobar]

        actual_page = self.search.origin_search(metadata_pattern="bar baz")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin3_barbaz]

    def test_origin_intrinsic_metadata_inconsistent_type(self):
        """Checks the same field can have a concrete value, an object, or an array
        in different documents."""
        origin1_foobar = {"url": "http://origin1"}
        origin2_barbaz = {"url": "http://origin2"}
        origin3_bazqux = {"url": "http://origin3"}

        self.search.origin_update(
            [
                {
                    **origin1_foobar,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "author": {"familyName": "Foo", "givenName": "Bar",},
                    },
                },
            ]
        )
        self.search.flush()

        self.search.origin_update(
            [
                {
                    **origin2_barbaz,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "author": "Bar Baz",
                    },
                },
                {
                    **origin3_bazqux,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "author": ["Baz", "Qux"],
                    },
                },
            ]
        )
        self.search.flush()

        actual_page = self.search.origin_search(metadata_pattern="bar")
        assert actual_page.next_page_token is None
-        assert actual_page.results == [origin2_barbaz, origin1_foobar]
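+        # Result order is backend-dependent here, so compare the URL lists
+        # after sorting rather than as-is.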
+        results = [r["url"] for r in actual_page.results]
+        expected_results = [o["url"] for o in [origin2_barbaz, origin1_foobar]]
+        assert sorted(results) == sorted(expected_results)

        actual_page = self.search.origin_search(metadata_pattern="baz")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin2_barbaz, origin3_bazqux]

        actual_page = self.search.origin_search(metadata_pattern="foo")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin1_foobar]

        actual_page = self.search.origin_search(metadata_pattern="bar baz")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin2_barbaz]

        actual_page = self.search.origin_search(metadata_pattern="qux")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin3_bazqux]

        actual_page = self.search.origin_search(metadata_pattern="baz qux")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin3_bazqux]

        actual_page = self.search.origin_search(metadata_pattern="foo bar")
        assert actual_page.next_page_token is None
        assert actual_page.results == [origin1_foobar]

    # TODO: add more tests with more codemeta terms
    # TODO: add more tests with edge cases
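
    # stream_results (from swh.core.api.classes) calls the given search
    # function repeatedly, following next_page_token until all pages are
    # consumed; hypothesis varies the page size from 1 to 4 so the paging
    # logic is exercised with several limits.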

    @settings(deadline=None)
    @given(strategies.integers(min_value=1, max_value=4))
    def test_origin_url_paging(self, limit):
        # TODO: no hypothesis
        origin1_foo = {"url": "http://origin1/foo"}
        origin2_foobar = {"url": "http://origin2/foo/bar"}
        origin3_foobarbaz = {"url": "http://origin3/foo/bar/baz"}

        self.reset()
        self.search.origin_update([origin1_foo, origin2_foobar, origin3_foobarbaz])
        self.search.flush()

        results = stream_results(
            self.search.origin_search, url_pattern="foo bar baz", limit=limit
        )
        results = [res["url"] for res in results]
        expected_results = [o["url"] for o in [origin3_foobarbaz]]
        assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)

        results = stream_results(
            self.search.origin_search, url_pattern="foo bar", limit=limit
        )
        results = [res["url"] for res in results]
        expected_results = [o["url"] for o in [origin2_foobar, origin3_foobarbaz]]
        assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)

        results = stream_results(
            self.search.origin_search, url_pattern="foo", limit=limit
        )
        results = [res["url"] for res in results]
        expected_results = [
            o["url"] for o in [origin1_foo, origin2_foobar, origin3_foobarbaz]
        ]
        assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)

    @settings(deadline=None)
    @given(strategies.integers(min_value=1, max_value=4))
    def test_origin_intrinsic_metadata_paging(self, limit):
        # TODO: no hypothesis
        origin1_foo = {"url": "http://origin1"}
        origin2_foobar = {"url": "http://origin2"}
        origin3_foobarbaz = {"url": "http://origin3"}

        self.reset()
        self.search.origin_update(
            [
                {
                    **origin1_foo,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "keywords": ["foo"],
                    },
                },
                {
                    **origin2_foobar,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "keywords": ["foo", "bar"],
                    },
                },
                {
                    **origin3_foobarbaz,
                    "intrinsic_metadata": {
                        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                        "keywords": ["foo", "bar", "baz"],
                    },
                },
            ]
        )
        self.search.flush()

        results = stream_results(
            self.search.origin_search, metadata_pattern="foo bar baz", limit=limit
        )
        assert list(results) == [origin3_foobarbaz]

        results = stream_results(
            self.search.origin_search, metadata_pattern="foo bar", limit=limit
        )
        assert list(results) == [origin2_foobar, origin3_foobarbaz]

        results = stream_results(
            self.search.origin_search, metadata_pattern="foo", limit=limit
        )
        assert list(results) == [origin1_foo, origin2_foobar, origin3_foobarbaz]
