D790: Add in-mem storage for intrinsic metadata
Diff: D790.id2491.diff (42 KB)
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -104,6 +104,7 @@
- indexer_configuration_id (int): tool used to compute the output
"""
+ assert isinstance(id, bytes)
content_path = self.write_to_temp(
filename=hashutil.hash_to_hex(id), # use the id as pathname
data=data)
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -405,11 +405,14 @@
bytes: Identifier of contents to index.
"""
+ if not isinstance(start, bytes) or not isinstance(end, bytes):
+ raise TypeError('identifiers must be bytes, not %r and %r.' %
+ (start, end))
while start:
result = self.storage.content_get_range(start, end)
contents = result['contents']
for c in contents:
- _id = c['sha1']
+ _id = hashutil.hash_to_bytes(c['sha1'])
if _id in indexed:
continue
yield _id
@@ -435,6 +438,10 @@
hashutil.hash_to_hex(sha1))
continue
res = self.index(sha1, raw_content, **kwargs)
+ if res and not isinstance(res['id'], bytes):
+ raise TypeError(
+ '%r.index should return ids as bytes, not %r' %
+ (self.__class__.__name__, res['id']))
if res:
yield res
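
The two hunks above pin down a convention the rest of this diff relies on: content identifiers cross the indexer APIs as raw 20-byte sha1 digests, never as 40-character hex strings. The two forms never compare equal, so mixing them makes dict and set lookups miss silently. A minimal sketch of the distinction (plain stdlib; `hash_to_bytes`/`hash_to_hex` in swh.model.hashutil are the project's helpers for the same conversions):

```python
# Raw-bytes vs hex-string sha1 identifiers: same digest, different
# Python objects, so a lookup with the wrong form silently misses.
hex_id = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
raw_id = bytes.fromhex(hex_id)      # 20-byte digest

indexed = {raw_id}                  # storage keys on raw bytes
assert raw_id in indexed
assert hex_id not in indexed        # the hex form never matches
assert raw_id.hex() == hex_id       # lossless round-trip
```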
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -346,7 +346,7 @@
- **id** (bytes): content's identifier
- **name** (str): symbol's name
- **kind** (str): symbol's kind
- - **language** (str): language for that content
+ - **lang** (str): language for that content
- **tool** (dict): tool used to compute the ctags' info
@@ -365,7 +365,7 @@
- **id** (bytes): sha1
- **ctags** (list): List of dictionaries with keys: name, kind,
- line, language
+ line, lang
"""
def _convert_ctags(__ctags):
@@ -412,9 +412,8 @@
ids (iterable): sha1 checksums
Yields:
- list: dictionaries with the following keys:
+ `{id: facts}` where `facts` is a dict with the following keys:
- - **id** (bytes)
- **licenses** ([str]): associated licenses for that content
- **tool** (dict): Tool used to compute the license
@@ -439,7 +438,7 @@
licenses (iterable): dictionaries with keys:
- **id**: sha1
- - **license** ([bytes]): List of licenses associated to sha1
+ - **licenses** ([bytes]): List of licenses associated to sha1
- **tool** (str): nomossa
conflict_update: Flag to determine if we want to overwrite (true)
@@ -628,7 +627,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
+ - **origin_id** (int)
- **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
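
The docstring fixes above align the documented keys with what the backends actually read and write: `lang` rather than `language` for ctags, a plural `licenses` list per sha1, and `origin_id` for origin metadata. As an illustration, a license entry as consumed by `content_fossology_license_add` would look like the following (sha1 taken from the tests below; license names and tool id are made up, and the in-memory backend keys entries on `indexer_configuration_id`):

```python
# Illustrative entry for content_fossology_license_add; note the
# plural 'licenses' key holding a list, not a single 'license' value.
license_entry = {
    'id': bytes.fromhex('688a5ef812c53907562fe379d4b3851e69c7cb15'),
    'licenses': ['GPL-3.0+', 'Apache-2.0'],    # hypothetical values
    'indexer_configuration_id': 1,             # hypothetical tool id
}
```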
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -3,31 +3,45 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from collections import defaultdict
+import bisect
+from collections import defaultdict, Counter
+import itertools
import json
+import operator
+import math
+import re
+SHA1_DIGEST_SIZE = 20  # sha1 digests are 20 bytes (160 bits) long
-class MetadataStorage:
- """Implements missing/get/add logic for both content_metadata and
- revision_metadata."""
+
+def _replace_key(d, old_key, new_key):
+ """Replaces the old key by a new key in a dict, returning a new dict."""
+ d = d.copy()
+ d[new_key] = d.pop(old_key)
+ return d
+
+
+def _transform_tool(tool):
+ return {
+ 'id': tool['id'],
+ 'name': tool['tool_name'],
+ 'version': tool['tool_version'],
+ 'configuration': tool['tool_configuration'],
+ }
+
+
+class SubStorage:
+ """Implements common missing/get/add logic for each indexer type."""
def __init__(self, tools):
self._tools = tools
- self._metadata = {} # map (id_, tool_id) -> metadata_dict
+ self._sorted_ids = []
+ self._data = {} # map (id_, tool_id) -> metadata_dict
self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id]
- def _transform_tool(self, tool):
- return {
- 'id': tool['id'],
- 'name': tool['tool_name'],
- 'version': tool['tool_version'],
- 'configuration': tool['tool_configuration'],
- }
-
def missing(self, ids):
- """List metadata missing from storage.
+ """List data missing from storage.
Args:
- metadata (iterable): dictionaries with keys:
+ data (iterable): dictionaries with keys:
- **id** (bytes): sha1 identifier
- **indexer_configuration_id** (int): tool used to compute
@@ -44,7 +58,7 @@
yield id_
def get(self, ids):
- """Retrieve metadata per id.
+ """Retrieve data per id.
Args:
ids (iterable): sha1 checksums
@@ -53,8 +67,8 @@
dict: dictionaries with the following keys:
- **id** (bytes)
- - **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
+ - arbitrary data (as provided to `add`)
"""
for id_ in ids:
@@ -62,36 +76,105 @@
key = (id_, tool_id)
yield {
'id': id_,
- 'tool': self._transform_tool(self._tools[tool_id]),
- 'translated_metadata': self._metadata[key],
+ 'tool': _transform_tool(self._tools[tool_id]),
+ **self._data[key],
}
- def add(self, metadata, conflict_update):
- """Add metadata not present in storage.
+ def get_all(self):
+ """Retrieve data for all stored ids, whatever the tool."""
+ yield from self.get(list(self._tools_per_id))
+
+ def get_range(self, start, end, indexer_configuration_id, limit):
+ """Retrieve data within range [start, end] bound by limit.
Args:
- metadata (iterable): dictionaries with keys:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1 starts at
+ this sha1 if any
+
+ """
+ if limit is None:
+ raise ValueError('Development error: limit should not be None')
+ from_index = bisect.bisect_left(self._sorted_ids, start)
+ to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index)
+ if to_index - from_index > limit:
+ return {
+ 'ids': self._sorted_ids[from_index:from_index+limit],
+ 'next': self._sorted_ids[from_index+limit],
+ }
+ else:
+ return {
+ 'ids': self._sorted_ids[from_index:to_index],
+ 'next': None,
+ }
+
+ def add(self, data, conflict_update):
+ """Add data not present in storage.
+
+ Args:
+ data (iterable): dictionaries with keys:
- **id**: sha1
- - **translated_metadata**: arbitrary dict
- **indexer_configuration_id**: tool used to compute the
results
+ - arbitrary data
conflict_update (bool): Flag to determine if we want to overwrite
(true) or skip duplicates (false)
"""
- for item in metadata:
- tool_id = item['indexer_configuration_id']
- data = item['translated_metadata']
- id_ = item['id']
+ for item in data:
+ item = item.copy()
+ tool_id = item.pop('indexer_configuration_id')
+ id_ = item.pop('id')
if not conflict_update and \
tool_id in self._tools_per_id.get(id_, set()):
# Duplicate, should not be updated
continue
key = (id_, tool_id)
- self._metadata[key] = data
+ self._data[key] = item
self._tools_per_id[id_].add(tool_id)
+ if id_ not in self._sorted_ids:
+ bisect.insort(self._sorted_ids, id_)
+
+ def add_merge(self, new_data, conflict_update, merged_key):
+ """Add data, handling the list under `merged_key`: with
+ conflict_update the new list overwrites any existing one,
+ otherwise both are merged (duplicate subitems skipped)."""
+ for new_item in new_data:
+ id_ = new_item['id']
+ tool_id = new_item['indexer_configuration_id']
+ if conflict_update:
+ all_subitems = []
+ else:
+ existing = list(self.get([id_]))
+ all_subitems = [
+ old_subitem
+ for existing_item in existing
+ if existing_item['tool']['id'] == tool_id
+ for old_subitem in existing_item[merged_key]
+ ]
+ for new_subitem in new_item[merged_key]:
+ if new_subitem not in all_subitems:
+ all_subitems.append(new_subitem)
+ self.add([
+ {
+ 'id': id_,
+ 'indexer_configuration_id': tool_id,
+ merged_key: all_subitems,
+ }
+ ], conflict_update=True)
+ if id_ not in self._sorted_ids:
+ bisect.insort(self._sorted_ids, id_)
class IndexerStorage:
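
`SubStorage.get_range` above implements the pagination contract shared by all the `*_get_range` endpoints that follow: at most `limit` ids come back, together with a `next` cursor to resume from, or `None` once the range is exhausted. A sketch of the calling loop, assuming `storage` is a populated `IndexerStorage` and `tool_id` an existing `indexer_configuration_id`:

```python
# Hedged sketch of paging through [start, end] via get_range.
start = b'\x00' * 20            # smallest possible 20-byte sha1
end = b'\xff' * 20              # largest possible 20-byte sha1
tool_id = 1                     # hypothetical tool id

while start is not None:
    page = storage.content_mimetype_get_range(
        start, end, indexer_configuration_id=tool_id, limit=100)
    for content_id in page['ids']:
        ...                     # each id is a 20-byte sha1 in range
    start = page['next']        # None once everything was returned
```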
@@ -99,8 +182,296 @@
def __init__(self):
self._tools = {}
- self._content_metadata = MetadataStorage(self._tools)
- self._revision_metadata = MetadataStorage(self._tools)
+ self._mimetypes = SubStorage(self._tools)
+ self._languages = SubStorage(self._tools)
+ self._content_ctags = SubStorage(self._tools)
+ self._licenses = SubStorage(self._tools)
+ self._content_metadata = SubStorage(self._tools)
+ self._revision_metadata = SubStorage(self._tools)
+ self._origin_intrinsic_metadata = SubStorage(self._tools)
+
+ def content_mimetype_missing(self, mimetypes):
+ """Generate mimetypes missing from storage.
+
+ Args:
+ mimetypes (iterable): iterable of dict with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute the
+ results
+
+ Yields:
+ the ids whose (id, indexer_configuration_id) pair is not
+ present in storage
+
+ """
+ yield from self._mimetypes.missing(mimetypes)
+
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve mimetypes within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1 starts at
+ this sha1 if any
+
+ """
+ return self._mimetypes.get_range(
+ start, end, indexer_configuration_id, limit)
+
+ def content_mimetype_add(self, mimetypes, conflict_update=False):
+ """Add mimetypes not present in storage.
+
+ Args:
+ mimetypes (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **mimetype** (bytes): raw content's mimetype
+ - **encoding** (bytes): raw content's encoding
+ - **indexer_configuration_id** (int): tool's id used to
+ compute the results
+
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (``True``) or skip duplicates (``False``, the
+ default)
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in mimetypes):
+ raise TypeError('identifiers must be bytes.')
+ self._mimetypes.add(mimetypes, conflict_update)
+
+ def content_mimetype_get(self, ids, db=None, cur=None):
+ """Retrieve full content mimetype per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ mimetypes (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **mimetype** (bytes): raw content's mimetype
+ - **encoding** (bytes): raw content's encoding
+ - **tool** (dict): Tool used to compute the mimetype
+
+ """
+ yield from self._mimetypes.get(ids)
+
+ def content_language_missing(self, languages):
+ """List languages missing from storage.
+
+ Args:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute
+ the results
+
+ Yields:
+ the ids whose (id, indexer_configuration_id) pair is not
+ present in storage
+
+ """
+ yield from self._languages.missing(languages)
+
+ def content_language_get(self, ids):
+ """Retrieve full content language per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **lang** (bytes): raw content's language
+ - **tool** (dict): Tool used to compute the language
+
+ """
+ yield from self._languages.get(ids)
+
+ def content_language_add(self, languages, conflict_update=False):
+ """Add languages not present in storage.
+
+ Args:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1
+ - **lang** (bytes): language detected
+
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (true) or skip duplicates (false, the
+ default)
+
+ """
+ self._languages.add(languages, conflict_update)
+
+ def content_ctags_missing(self, ctags):
+ """List ctags missing from storage.
+
+ Args:
+ ctags (iterable): dicts with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute
+ the results
+
+ Yields:
+ the ids whose (id, indexer_configuration_id) pair is not
+ present in storage
+
+ """
+ yield from self._content_ctags.missing(ctags)
+
+ def content_ctags_get(self, ids):
+ """Retrieve ctags per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ Dictionaries with keys:
+
+ - **id** (bytes): content's identifier
+ - **name** (str): symbol's name
+ - **kind** (str): symbol's kind
+ - **lang** (str): language for that content
+ - **tool** (dict): tool used to compute the ctags' info
+
+ """
+ for item in self._content_ctags.get(ids):
+ for item_ctags_item in item['ctags']:
+ yield {
+ 'id': item['id'],
+ 'tool': item['tool'],
+ **item_ctags_item
+ }
+
+ def content_ctags_add(self, ctags, conflict_update=False):
+ """Add ctags not present in storage
+
+ Args:
+ ctags (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1
+ - **ctags** (list): List of dictionaries with keys: name, kind,
+ line, lang
+ - **indexer_configuration_id**: tool used to compute the
+ results
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in ctags):
+ raise TypeError('identifiers must be bytes.')
+ self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
+
+ def content_ctags_search(self, expression,
+ limit=10, last_sha1=None, db=None, cur=None):
+ """Search through content's raw ctags symbols.
+
+ Args:
+ expression (str): Expression to search for
+ limit (int): Number of rows to return (default to 10).
+ last_sha1 (bytes): Offset from which to retrieve data
+ (defaults to None, i.e. from the beginning).
+
+ Yields:
+ rows of ctags including id, name, lang, kind, line, etc...
+
+ """
+ nb_matches = 0
+ for ((id_, tool_id), item) in \
+ sorted(self._content_ctags._data.items()):
+ if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))):
+ continue
+ nb_matches += 1
+ for ctags_item in item['ctags']:
+ if ctags_item['name'] != expression:
+ continue
+ yield {
+ 'id': id_,
+ 'tool': _transform_tool(self._tools[tool_id]),
+ **ctags_item
+ }
+ if nb_matches >= limit:
+ return
+
+ def content_fossology_license_get(self, ids):
+ """Retrieve licenses per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ `{id: facts}` where `facts` is a dict with the following keys:
+
+ - **licenses** ([str]): associated licenses for that content
+ - **tool** (dict): Tool used to compute the license
+
+ """
+ # TODO: remove this reformatting in order to yield items with the
+ # same format as other _get methods.
+ res = {}
+ for d in self._licenses.get(ids):
+ res.setdefault(d.pop('id'), []).append(d)
+ for (id_, facts) in res.items():
+ yield {id_: facts}
+
+ def content_fossology_license_add(self, licenses, conflict_update=False):
+ """Add licenses not present in storage.
+
+ Args:
+ licenses (iterable): dictionaries with keys:
+
+ - **id**: sha1
+ - **licenses** ([bytes]): List of licenses associated to sha1
+ - **tool** (str): nomossa
+
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in licenses):
+ raise TypeError('identifiers must be bytes.')
+ self._licenses.add_merge(licenses, conflict_update, 'licenses')
+
+ def content_fossology_license_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve licenses within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1 starts at
+ this sha1 if any
+
+ """
+ return self._licenses.get_range(
+ start, end, indexer_configuration_id, limit)
def content_metadata_missing(self, metadata):
"""List metadata missing from storage.
@@ -149,6 +520,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._content_metadata.add(metadata, conflict_update)
def revision_metadata_missing(self, metadata):
@@ -197,8 +570,101 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._revision_metadata.add(metadata, conflict_update)
+ def origin_intrinsic_metadata_get(self, ids):
+ """Retrieve origin metadata per id.
+
+ Args:
+ ids (iterable): origin identifiers
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **translated_metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ for item in self._origin_intrinsic_metadata.get(ids):
+ yield _replace_key(item, 'id', 'origin_id')
+
+ def origin_intrinsic_metadata_add(self, metadata,
+ conflict_update=False):
+ """Add origin metadata not present in storage.
+
+ Args:
+ metadata (iterable): dictionaries with keys:
+
+ - **origin_id**: origin identifier
+ - **from_revision**: sha1 id of the revision used to generate
+ these metadata.
+ - **metadata**: arbitrary dict
+ - **indexer_configuration_id**: tool used to compute metadata
+
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+
+ for item in metadata:
+ item = _replace_key(item, 'origin_id', 'id')
+ self._origin_intrinsic_metadata.add([item], conflict_update)
+
+ def origin_intrinsic_metadata_search_fulltext(
+ self, conjunction, limit=100):
+ """Returns the list of origins whose metadata contain all the terms.
+
+ Args:
+ conjunction (List[str]): List of terms to be searched for.
+ limit (int): The maximum number of results to return
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ # A very crude fulltext search implementation, but that's enough
+ # to work on English metadata
+ tokens_re = re.compile('[a-zA-Z0-9]+')
+ search_tokens = list(itertools.chain(
+ *map(tokens_re.findall, conjunction)))
+
+ def rank(data):
+ # Tokenize the metadata
+ text = json.dumps(data['metadata'])
+ text_tokens = tokens_re.findall(text)
+ text_token_occurrences = Counter(text_tokens)
+
+ # Count the number of occurrences of search tokens in the text
+ score = 0
+ for search_token in search_tokens:
+ if text_token_occurrences[search_token] == 0:
+ # Search token is not in the text.
+ return 0
+ score += text_token_occurrences[search_token]
+
+ # Normalize according to the text's length
+ return score / math.log(len(text_tokens))
+
+ results = [(rank(data), data)
+ for data in self._origin_intrinsic_metadata.get_all()]
+ results = [(rank_, data) for (rank_, data) in results if rank_ > 0]
+ results.sort(key=operator.itemgetter(0), # Don't try to order 'data'
+ reverse=True)
+ results = [data for (rank_, data) in results]
+ for result in results[:limit]:
+ yield _replace_key(result, 'id', 'origin_id')
+
def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -13,131 +13,9 @@
}
super().setUp()
- @pytest.mark.xfail
- def test_check_config(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_add__drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_add__drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_search(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_search_no_result(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_add__add_new_ctags_added(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_add__update_in_place(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_add__new_license_added(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_get(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext_rank(self):
- pass
+ def reset_storage_tables(self):
+ self.storage = self.storage.__class__()
@pytest.mark.xfail
- def test_indexer_configuration_metadata_get_missing_context(self):
- pass
-
- @pytest.mark.xfail
- def test_indexer_configuration_metadata_get(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_no_limit(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_no_limit_with_filter(
- self):
- pass
-
- @pytest.mark.xfail
- def test_generate_fossology_license_get_range_limit(self):
+ def test_check_config(self):
pass
diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py
--- a/swh/indexer/tests/test_ctags.py
+++ b/swh/indexer/tests/test_ctags.py
@@ -11,7 +11,7 @@
)
from swh.indexer.tests.test_utils import (
- BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest,
+ CommonContentIndexerTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG
)
@@ -99,12 +99,6 @@
'workdir': '/nowhere',
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.objstorage = MockObjStorage()
- self.tool_config = self.config['tools']['configuration']
-
class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Ctags indexer test scenarios:
@@ -113,8 +107,13 @@
- Unknown sha1s in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_ctags_get(ids)
+
def setUp(self):
self.indexer = CtagsIndexerTest()
+ self.idx_storage = self.indexer.idx_storage
# Prepare test input
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -14,10 +13,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -78,12 +76,6 @@
},
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- self.objstorage = MockObjStorage()
-
class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Language indexer test scenarios:
@@ -92,8 +84,14 @@
- Unknown sha1s in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_fossology_license_get(ids)
+
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseTestIndexer()
+ self.idx_storage = self.indexer.idx_storage
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
@@ -138,15 +136,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestFossologyLicenseRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -159,12 +148,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py
--- a/swh/indexer/tests/test_language.py
+++ b/swh/indexer/tests/test_language.py
@@ -7,8 +7,8 @@
from swh.indexer import language
from swh.indexer.language import LanguageIndexer
from swh.indexer.tests.test_utils import (
- BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest,
- CommonIndexerWithErrorsTest, CommonIndexerNoTool, BASE_TEST_CONFIG
+ CommonContentIndexerTest, CommonIndexerWithErrorsTest,
+ CommonIndexerNoTool, BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -30,12 +30,6 @@
}
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.objstorage = MockObjStorage()
- self.tool_config = self.config['tools']['configuration']
-
class Language(unittest.TestCase):
"""Tests pygments tool for language detection
@@ -60,8 +54,14 @@
- Unknown sha1s in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.indexer.idx_storage.content_language_get(ids)
+
def setUp(self):
self.indexer = LanguageTestIndexer()
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -13,10 +12,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -61,12 +59,6 @@
},
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- self.objstorage = MockObjStorage()
-
class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Mimetype indexer test scenarios:
@@ -75,8 +67,13 @@
- Unknown sha1s in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_mimetype_get(ids)
+
def setUp(self):
self.indexer = MimetypeTestIndexer()
+ self.idx_storage = self.indexer.idx_storage
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
@@ -123,15 +120,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- # sync objstorage and storage
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestMimetypeRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -144,12 +132,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = MimetypeRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -4,10 +4,12 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
+import random
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
@@ -488,6 +490,21 @@
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
+ for (obj_id, content) in OBJ_STORAGE_DATA.items():
+ if hasattr(hashlib, 'blake2s'):
+ blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
+ else:
+ # fallback for Python <3.6
+ blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
+ storage.content_add([{
+ 'data': content,
+ 'length': len(content),
+ 'status': 'visible',
+ 'sha1': hash_to_bytes(obj_id),
+ 'sha1_git': hash_to_bytes(obj_id),
+ 'sha256': hashlib.sha256(content).digest(),
+ 'blake2s256': blake2s256
+ }])
class MockStorage():
@@ -659,7 +676,15 @@
class CommonContentIndexerTest:
- def assert_results_ok(self, actual_results, expected_results=None):
+ def get_indexer_results(self, ids):
+ """Override this for indexers that don't have a mock storage."""
+ return self.indexer.idx_storage.state
+
+ def assert_results_ok(self, sha1s, expected_results=None):
+ sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
+ for sha1 in sha1s]
+ actual_results = self.get_indexer_results(sha1s)
+
if expected_results is None:
expected_results = self.expected_results
@@ -678,15 +703,12 @@
# when
self.indexer.run(sha1s, policy_update='update-dups')
- actual_results = self.indexer.idx_storage.state
- self.assertTrue(self.indexer.idx_storage.conflict_update)
- self.assert_results_ok(actual_results)
+ self.assert_results_ok(sha1s)
# 2nd pass
self.indexer.run(sha1s, policy_update='ignore-dups')
- self.assertFalse(self.indexer.idx_storage.conflict_update)
- self.assert_results_ok(actual_results)
+ self.assert_results_ok(sha1s)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
@@ -696,29 +718,35 @@
# when
self.indexer.run(sha1s, policy_update='update-dups')
- actual_results = self.indexer.idx_storage.state
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
- self.assert_results_ok(actual_results, expected_results)
+ self.assert_results_ok(sha1s, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
+ def setUp(self):
+ self.contents = sorted(OBJ_STORAGE_DATA)
+
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
+ actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
- self.assertEqual(indexed_data, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
+ assert isinstance(_id, bytes)
+ indexed_data = indexed_data.copy()
+ indexed_data['id'] = hash_to_hex(indexed_data['id'])
+ self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
+ self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
@@ -726,7 +754,8 @@
"""Indexing contents without existing data results in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
@@ -737,12 +766,13 @@
"""Indexing contents with existing data results in less indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
+ start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
@@ -756,7 +786,8 @@
"""Optimal indexing should result in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
@@ -783,8 +814,9 @@
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
+ _start, _end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)