D790.diff
D790: Add in-mem storage for intrinsic metadata
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -627,7 +627,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
+ - **origin_id** (int)
- **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -4,8 +4,12 @@
# See top-level LICENSE file for more information
import bisect
-from collections import defaultdict
+from collections import defaultdict, Counter
+import itertools
import json
+import operator
+import math
+import re
SHA1_DIGEST_SIZE = 160
@@ -70,6 +74,9 @@
**self._data[key],
}
+ def get_all(self):
+ yield from self.get(list(self._tools_per_id))
+
def get_range(self, start, end, indexer_configuration_id, limit):
"""Retrieve data within range [start, end] bound by limit.
@@ -175,6 +182,7 @@
self._licenses = SubStorage(self._tools)
self._content_metadata = SubStorage(self._tools)
self._revision_metadata = SubStorage(self._tools)
+ self._origin_intrinsic_metadata = SubStorage(self._tools)
def content_mimetype_missing(self, mimetypes):
"""Generate mimetypes missing from storage.
@@ -560,6 +568,96 @@
raise TypeError('identifiers must be bytes.')
self._revision_metadata.add(metadata, conflict_update)
+ def origin_intrinsic_metadata_get(self, ids):
+ """Retrieve origin metadata per id.
+
+ Args:
+ ids (iterable): origin identifiers
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **translated_metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ for item in self._origin_intrinsic_metadata.get(ids):
+ item['origin_id'] = item.pop('id')
+ yield item
+
+ def origin_intrinsic_metadata_add(self, metadata,
+ conflict_update=False):
+ """Add origin metadata not present in storage.
+
+ Args:
+ metadata (iterable): dictionaries with keys:
+
+ - **origin_id**: origin identifier
+ - **from_revision**: sha1 id of the revision used to generate
+ these metadata.
+ - **metadata**: arbitrary dict
+ - **indexer_configuration_id**: tool used to compute metadata
+
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+
+ for item in metadata:
+ item = item.copy()
+ item['id'] = item.pop('origin_id')
+ self._origin_intrinsic_metadata.add([item], conflict_update)
+
+ def origin_intrinsic_metadata_search_fulltext(
+ self, conjunction, limit=100):
+ """Returns the list of origins whose metadata contain all the terms.
+
+ Args:
+ conjunction (List[str]): List of terms to be searched for.
+ limit (int): The maximum number of results to return
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **id** (int)
+ - **metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ # A very crude fulltext search implementation, but that's enough
+ # to work on English metadata
+ tokens_re = re.compile('[a-zA-Z0-9]+')
+ search_tokens = list(itertools.chain(
+ *map(tokens_re.findall, conjunction)))
+
+ def rank(data):
+ # Tokenize the metadata
+ text = json.dumps(data['metadata'])
+ text_tokens = tokens_re.findall(text)
+ text_token_occurences = Counter(text_tokens)
+
+ # Count the number of occurrences of search tokens in the text
+ score = 0
+ for search_token in search_tokens:
+ if text_token_occurences[search_token] == 0:
+ # Search token is not in the text.
+ return 0
+ score += text_token_occurences[search_token]
+
+ # Normalize according to the text's length
+ return score / math.log(len(text_tokens))
+
+ results = [(rank(data), data)
+ for data in self._origin_intrinsic_metadata.get_all()]
+ results = [(rank_, data) for (rank_, data) in results if rank_ > 0]
+ results.sort(key=operator.itemgetter(0), # Don't try to order 'data'
+ reverse=True)
+ for (rank_, result) in results[:limit]:
+ result = result.copy()
+ result['origin_id'] = result.pop('id')
+ yield result
+
def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -19,31 +19,3 @@
@pytest.mark.xfail
def test_check_config(self):
pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_get(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext_rank(self):
- pass
-
- @pytest.mark.xfail
- def test_indexer_configuration_metadata_get_missing_context(self):
- pass
-
- @pytest.mark.xfail
- def test_indexer_configuration_metadata_get(self):
- pass
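
Usage note: the sketch below shows how the endpoints added by this diff could be exercised against the in-memory backend. It assumes the backend class is IndexerStorage (as elsewhere in swh.indexer.storage), that indexer_configuration_add accepts tool dictionaries with tool_name/tool_version/tool_configuration keys and returns them with an 'id' field; the origin id, revision sha1 and metadata values are purely illustrative.

from swh.indexer.storage.in_memory import IndexerStorage

storage = IndexerStorage()

# Register a (hypothetical) metadata translation tool and keep its id.
tool = list(storage.indexer_configuration_add([{
    'tool_name': 'swh-metadata-translator',
    'tool_version': '0.0.1',
    'tool_configuration': {},
}]))[0]

# Add intrinsic metadata for one origin, keyed by origin_id as in the
# origin_intrinsic_metadata_add docstring above.
storage.origin_intrinsic_metadata_add([{
    'origin_id': 42,                  # illustrative origin id
    'from_revision': b'\x01' * 20,    # sha1 of the revision it came from
    'metadata': {'name': 'foo', 'description': 'a python parser'},
    'indexer_configuration_id': tool['id'],
}])

# Retrieve by origin id; items come back with an 'origin_id' key.
print(list(storage.origin_intrinsic_metadata_get([42])))

# Conjunctive fulltext search, best-ranked matches first.
print(list(storage.origin_intrinsic_metadata_search_fulltext(['python', 'parser'])))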
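For reference, the ranking used by origin_intrinsic_metadata_search_fulltext is simple enough to restate on its own: tokenize the JSON-serialized metadata, require every search token to be present (conjunctive search), sum the occurrence counts, and divide by the logarithm of the document length. The self-contained sketch below mirrors that logic; the sample metadata dictionaries are hypothetical.

import json
import math
import re
from collections import Counter

TOKENS_RE = re.compile('[a-zA-Z0-9]+')


def rank(metadata, search_tokens):
    # Tokenize the JSON-serialized metadata, as the in-memory backend does.
    text_tokens = TOKENS_RE.findall(json.dumps(metadata))
    occurrences = Counter(text_tokens)
    score = 0
    for token in search_tokens:
        if occurrences[token] == 0:
            return 0  # conjunctive: every term must appear at least once
        score += occurrences[token]
    # Normalize by the log of the document length so that long metadata
    # records do not automatically outrank short, focused ones.
    return score / math.log(len(text_tokens))


docs = [
    {'name': 'foo', 'description': 'a json parser written in python'},
    {'name': 'bar', 'description': 'a parser generator'},
]
# Only the first document contains both 'parser' and 'python'; the second scores 0.
print([rank(d, ['parser', 'python']) for d in docs])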