Changeset View
Changeset View
Standalone View
Standalone View
swh/search/in_memory.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import defaultdict | ||||
import itertools | import itertools | ||||
import re | import re | ||||
from typing import Any, Dict, Iterable, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | ||||
from swh.model.identifiers import origin_identifier | from swh.model.identifiers import origin_identifier | ||||
from swh.search.interface import PagedResult | from swh.search.interface import PagedResult | ||||
_words_regexp = re.compile(r"\w+") | |||||
def _dict_words_set(d): | |||||
"""Recursively extract set of words from dict content.""" | |||||
values = set() | |||||
def extract(obj, words): | |||||
if isinstance(obj, dict): | |||||
for k, v in obj.items(): | |||||
extract(v, words) | |||||
elif isinstance(obj, list): | |||||
for item in obj: | |||||
extract(item, words) | |||||
else: | |||||
words.update(_words_regexp.findall(str(obj).lower())) | |||||
return words | |||||
return extract(d, values) | |||||
class InMemorySearch: | class InMemorySearch: | ||||
def __init__(self): | def __init__(self): | ||||
pass | pass | ||||
def check(self): | def check(self): | ||||
return True | return True | ||||
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines | ) -> PagedResult[Dict[str, Any]]: | ||||
return any( | return any( | ||||
token.startswith(missing_token) | token.startswith(missing_token) | ||||
for token in match["_url_tokens"] | for token in match["_url_tokens"] | ||||
) | ) | ||||
hits = filter(predicate, hits) | hits = filter(predicate, hits) | ||||
if metadata_pattern: | if metadata_pattern: | ||||
raise NotImplementedError( | metadata_pattern_words = set( | ||||
"Metadata search is not implemented in the in-memory backend." | _words_regexp.findall(metadata_pattern.lower()) | ||||
) | |||||
def predicate(match): | |||||
if "intrinsic_metadata" not in match: | |||||
return False | |||||
return metadata_pattern_words.issubset( | |||||
_dict_words_set(match["intrinsic_metadata"]) | |||||
) | ) | ||||
hits = filter(predicate, hits) | |||||
if not url_pattern and not metadata_pattern: | if not url_pattern and not metadata_pattern: | ||||
raise ValueError( | raise ValueError( | ||||
"At least one of url_pattern and metadata_pattern must be provided." | "At least one of url_pattern and metadata_pattern must be provided." | ||||
) | ) | ||||
next_page_token: Optional[str] = None | next_page_token: Optional[str] = None | ||||
if with_visit: | if with_visit: | ||||
Show All 15 Lines |