Differential D5086 Diff 18174 swh/search/in_memory.py

Changeset View

Standalone View

swh/search/in_memory.py

# Copyright (C) 2019-2020 The Software Heritage developers		# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

from collections import defaultdict		from collections import defaultdict
import itertools		import itertools
import re		import re
from typing import Any, Dict, Iterable, Iterator, List, Optional		from typing import Any, Dict, Iterable, Iterator, List, Optional

from swh.model.identifiers import origin_identifier		from swh.model.identifiers import origin_identifier
from swh.search.interface import PagedResult		from swh.search.interface import PagedResult

		_words_regexp = re.compile(r"\w+")


		def _dict_words_set(d):
		"""Recursively extract set of words from dict content."""
		values = set()

		def extract(obj, words):
		if isinstance(obj, dict):
		for k, v in obj.items():
		extract(v, words)
		elif isinstance(obj, list):
		for item in obj:
		extract(item, words)
		else:
		words.update(_words_regexp.findall(str(obj).lower()))
		return words

		return extract(d, values)


class InMemorySearch:		class InMemorySearch:
def __init__(self):		def __init__(self):
pass		pass

def check(self):		def check(self):
return True		return True

▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines	) -> PagedResult[Dict[str, Any]]:
return any(		return any(
token.startswith(missing_token)		token.startswith(missing_token)
for token in match["_url_tokens"]		for token in match["_url_tokens"]
)		)

hits = filter(predicate, hits)		hits = filter(predicate, hits)

if metadata_pattern:		if metadata_pattern:
raise NotImplementedError(		metadata_pattern_words = set(
"Metadata search is not implemented in the in-memory backend."		_words_regexp.findall(metadata_pattern.lower())
		)

		def predicate(match):
		if "intrinsic_metadata" not in match:
		return False

		return metadata_pattern_words.issubset(
		_dict_words_set(match["intrinsic_metadata"])
)		)

		hits = filter(predicate, hits)

if not url_pattern and not metadata_pattern:		if not url_pattern and not metadata_pattern:
raise ValueError(		raise ValueError(
"At least one of url_pattern and metadata_pattern must be provided."		"At least one of url_pattern and metadata_pattern must be provided."
)		)

next_page_token: Optional[str] = None		next_page_token: Optional[str] = None

if with_visit:		if with_visit:
Show All 15 Lines