D790.diff
D790: Add in-mem storage for intrinsic metadata
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -627,7 +627,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
+ - **origin_id** (int)
- **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -4,8 +4,12 @@
# See top-level LICENSE file for more information
import bisect
-from collections import defaultdict
+from collections import defaultdict, Counter
+import itertools
import json
+import operator
+import math
+import re
SHA1_DIGEST_SIZE = 160
@@ -70,6 +74,9 @@
**self._data[key],
}
+ def get_all(self):
+ yield from self.get(list(self._tools_per_id))
+
def get_range(self, start, end, indexer_configuration_id, limit):
"""Retrieve data within range [start, end] bound by limit.
@@ -175,6 +182,7 @@
self._licenses = SubStorage(self._tools)
self._content_metadata = SubStorage(self._tools)
self._revision_metadata = SubStorage(self._tools)
+ self._origin_intrinsic_metadata = SubStorage(self._tools)
def content_mimetype_missing(self, mimetypes):
"""Generate mimetypes missing from storage.
@@ -560,6 +568,96 @@
raise TypeError('identifiers must be bytes.')
self._revision_metadata.add(metadata, conflict_update)
+ def origin_intrinsic_metadata_get(self, ids):
+ """Retrieve origin metadata per id.
+
+ Args:
+ ids (iterable): origin identifiers
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **translated_metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ for item in self._origin_intrinsic_metadata.get(ids):
+ item['origin_id'] = item.pop('id')
+ yield item
+
+ def origin_intrinsic_metadata_add(self, metadata,
+ conflict_update=False):
+ """Add origin metadata not present in storage.
+
+ Args:
+ metadata (iterable): dictionaries with keys:
+
+ - **origin_id**: origin identifier
+ - **from_revision**: sha1 id of the revision used to generate
+ these metadata.
+ - **metadata**: arbitrary dict
+ - **indexer_configuration_id**: tool used to compute metadata
+
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+
+ for item in metadata:
+ item = item.copy()
+ item['id'] = item.pop('origin_id')
+ self._origin_intrinsic_metadata.add([item], conflict_update)
+
+ def origin_intrinsic_metadata_search_fulltext(
+ self, conjunction, limit=100):
+ """Returns the list of origins whose metadata contain all the terms.
+
+ Args:
+ conjunction (List[str]): List of terms to be searched for.
+ limit (int): The maximum number of results to return
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **id** (int)
+ - **metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ # A very crude fulltext search implementation, but that's enough
+ # to work on English metadata
+ tokens_re = re.compile('[a-zA-Z0-9]+')
+ search_tokens = list(itertools.chain(
+ *map(tokens_re.findall, conjunction)))
+
+ def rank(data):
+ # Tokenize the metadata
+ text = json.dumps(data['metadata'])
+ text_tokens = tokens_re.findall(text)
+ text_token_occurences = Counter(text_tokens)
+
+ # Count the number of occurrences of search tokens in the text
+ score = 0
+ for search_token in search_tokens:
+ if text_token_occurences[search_token] == 0:
+ # Search token is not in the text.
+ return 0
+ score += text_token_occurences[search_token]
+
+ # Normalize according to the text's length
+ return score / math.log(len(text_tokens))
+
+ results = [(rank(data), data)
+ for data in self._origin_intrinsic_metadata.get_all()]
+ results = [(rank_, data) for (rank_, data) in results if rank_ > 0]
+ results.sort(key=operator.itemgetter(0), # Don't try to order 'data'
+ reverse=True)
+ for (rank_, result) in results[:limit]:
+ result = result.copy()
+ result['origin_id'] = result.pop('id')
+ yield result
+
def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -19,31 +19,3 @@
@pytest.mark.xfail
def test_check_config(self):
pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_get(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext_rank(self):
- pass
-
- @pytest.mark.xfail
- def test_indexer_configuration_metadata_get_missing_context(self):
- pass
-
- @pytest.mark.xfail
- def test_indexer_configuration_metadata_get(self):
- pass
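
Usage note: the sketch below shows how the endpoints added by this diff could be exercised against the in-memory backend. It assumes the backend class is IndexerStorage (as elsewhere in swh.indexer.storage), that indexer_configuration_add accepts tool dictionaries with tool_name/tool_version/tool_configuration keys and returns them with an 'id' field; the origin id, revision sha1 and metadata values are purely illustrative.

from swh.indexer.storage.in_memory import IndexerStorage

storage = IndexerStorage()

# Register a (hypothetical) metadata translation tool and keep its id.
tool = list(storage.indexer_configuration_add([{
    'tool_name': 'swh-metadata-translator',
    'tool_version': '0.0.1',
    'tool_configuration': {},
}]))[0]

# Add intrinsic metadata for one origin, keyed by origin_id as in the
# origin_intrinsic_metadata_add docstring above.
storage.origin_intrinsic_metadata_add([{
    'origin_id': 42,                  # illustrative origin id
    'from_revision': b'\x01' * 20,    # sha1 of the revision it came from
    'metadata': {'name': 'foo', 'description': 'a python parser'},
    'indexer_configuration_id': tool['id'],
}])

# Retrieve by origin id; items come back with an 'origin_id' key.
print(list(storage.origin_intrinsic_metadata_get([42])))

# Conjunctive fulltext search, best-ranked matches first.
print(list(storage.origin_intrinsic_metadata_search_fulltext(['python', 'parser'])))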
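For reference, the ranking used by origin_intrinsic_metadata_search_fulltext is simple enough to restate on its own: tokenize the JSON-serialized metadata, require every search token to be present (conjunctive search), sum the occurrence counts, and divide by the logarithm of the document length. The self-contained sketch below mirrors that logic; the sample metadata dictionaries are hypothetical.

import json
import math
import re
from collections import Counter

TOKENS_RE = re.compile('[a-zA-Z0-9]+')


def rank(metadata, search_tokens):
    # Tokenize the JSON-serialized metadata, as the in-memory backend does.
    text_tokens = TOKENS_RE.findall(json.dumps(metadata))
    occurrences = Counter(text_tokens)
    score = 0
    for token in search_tokens:
        if occurrences[token] == 0:
            return 0  # conjunctive: every term must appear at least once
        score += occurrences[token]
    # Normalize by the log of the document length so that long metadata
    # records do not automatically outrank short, focused ones.
    return score / math.log(len(text_tokens))


docs = [
    {'name': 'foo', 'description': 'a json parser written in python'},
    {'name': 'bar', 'description': 'a parser generator'},
]
# Only the first document contains both 'parser' and 'python'; the second scores 0.
print([rank(d, ['parser', 'python']) for d in docs])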