D790.id2491.diff

diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -104,6 +104,7 @@
- indexer_configuration_id (int): tool used to compute the output
"""
+ assert isinstance(id, bytes)
content_path = self.write_to_temp(
filename=hashutil.hash_to_hex(id), # use the id as pathname
data=data)
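
For context, the convention enforced by this assert is that content identifiers cross the indexer APIs as raw 20-byte sha1 digests, not 40-character hex strings. A minimal sketch of the round-trip, using the same swh.model.hashutil helpers this diff already imports elsewhere:

    from swh.model.hashutil import hash_to_bytes, hash_to_hex

    hex_id = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
    raw_id = hash_to_bytes(hex_id)  # 20-byte digest; satisfies the assert above
    assert isinstance(raw_id, bytes) and len(raw_id) == 20
    assert hash_to_hex(raw_id) == hex_id
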
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -405,11 +405,14 @@
bytes: Identifier of contents to index.
"""
+ if not isinstance(start, bytes) or not isinstance(end, bytes):
+ raise TypeError('identifiers must be bytes, not %r and %r.' %
+ (start, end))
while start:
result = self.storage.content_get_range(start, end)
contents = result['contents']
for c in contents:
- _id = c['sha1']
+ _id = hashutil.hash_to_bytes(c['sha1'])
if _id in indexed:
continue
yield _id
@@ -435,6 +438,10 @@
hashutil.hash_to_hex(sha1))
continue
res = self.index(sha1, raw_content, **kwargs)
+ if res and not isinstance(res['id'], bytes):
+ raise TypeError(
+ '%r.index should return ids as bytes, not %r' %
+ (self.__class__.__name__, res['id']))
if res:
yield res
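
Note that the range lister above is a generator, so the added isinstance check on start/end only fires once iteration begins, not at call time. A toy illustration of that pitfall (not swh code):

    def list_range(start, end):
        if not isinstance(start, bytes) or not isinstance(end, bytes):
            raise TypeError('identifiers must be bytes')
        yield start

    it = list_range('0' * 40, '1' * 40)  # no error raised yet
    try:
        next(it)  # the type check only runs here
    except TypeError as exc:
        print(exc)  # identifiers must be bytes
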
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -346,7 +346,7 @@
- **id** (bytes): content's identifier
- **name** (str): symbol's name
- **kind** (str): symbol's kind
- - **language** (str): language for that content
+ - **lang** (str): language for that content
- **tool** (dict): tool used to compute the ctags' info
@@ -365,7 +365,7 @@
- **id** (bytes): sha1
- **ctags** (list): List of dictionaries with keys: name, kind,
- line, language
+ line, lang
"""
def _convert_ctags(__ctags):
@@ -412,9 +412,8 @@
ids (iterable): sha1 checksums
Yields:
- list: dictionaries with the following keys:
+ `{id: facts}` where `facts` is a list of dicts with the following keys:
- - **id** (bytes)
- **licenses** ([str]): associated licenses for that content
- **tool** (dict): Tool used to compute the license
@@ -439,7 +438,7 @@
licenses (iterable): dictionaries with keys:
- **id**: sha1
- - **license** ([bytes]): List of licenses associated to sha1
+ - **licenses** ([bytes]): List of licenses associated with the sha1
- **tool** (str): nomossa
conflict_update: Flag to determine if we want to overwrite (true)
@@ -628,7 +627,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
+ - **origin_id** (int)
- **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
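
The corrected content_fossology_license_get docstring above matches what the in-memory implementation later in this diff actually yields: one {id: facts} mapping per content, where facts is a list of dicts. A hypothetical example (license and tool values invented for illustration):

    sha1 = bytes.fromhex('01c9379dfc33803963d07c1ccc748d3fe4c96bb5')
    expected_shape = {
        sha1: [{
            'licenses': ['GPL-3.0+'],
            'tool': {'id': 1, 'name': 'nomos', 'version': '3.1.0',
                     'configuration': {}},
        }],
    }
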
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -3,31 +3,45 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from collections import defaultdict
+import bisect
+from collections import defaultdict, Counter
+import itertools
import json
+import operator
+import math
+import re
+SHA1_DIGEST_SIZE = 160  # in bits; a sha1 digest is 20 bytes
-class MetadataStorage:
- """Implements missing/get/add logic for both content_metadata and
- revision_metadata."""
+
+def _replace_key(d, old_key, new_key):
+ """Replaces old_key with new_key in a dict, returning a new dict."""
+ d = d.copy()
+ d[new_key] = d.pop(old_key)
+ return d
+
+
+def _transform_tool(tool):
+ return {
+ 'id': tool['id'],
+ 'name': tool['tool_name'],
+ 'version': tool['tool_version'],
+ 'configuration': tool['tool_configuration'],
+ }
+
+
+class SubStorage:
+ """Implements common missing/get/add logic for each indexer type."""
def __init__(self, tools):
self._tools = tools
- self._metadata = {} # map (id_, tool_id) -> metadata_dict
+ self._sorted_ids = []
+ self._data = {} # map (id_, tool_id) -> metadata_dict
self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id]
- def _transform_tool(self, tool):
- return {
- 'id': tool['id'],
- 'name': tool['tool_name'],
- 'version': tool['tool_version'],
- 'configuration': tool['tool_configuration'],
- }
-
def missing(self, ids):
- """List metadata missing from storage.
+ """List data missing from storage.
Args:
- metadata (iterable): dictionaries with keys:
+ data (iterable): dictionaries with keys:
- **id** (bytes): sha1 identifier
- **indexer_configuration_id** (int): tool used to compute
@@ -44,7 +58,7 @@
yield id_
def get(self, ids):
- """Retrieve metadata per id.
+ """Retrieve data per id.
Args:
ids (iterable): sha1 checksums
@@ -53,8 +67,8 @@
dict: dictionaries with the following keys:
- **id** (bytes)
- - **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
+ - arbitrary data (as provided to `add`)
"""
for id_ in ids:
@@ -62,36 +76,105 @@
key = (id_, tool_id)
yield {
'id': id_,
- 'tool': self._transform_tool(self._tools[tool_id]),
- 'translated_metadata': self._metadata[key],
+ 'tool': _transform_tool(self._tools[tool_id]),
+ **self._data[key],
}
- def add(self, metadata, conflict_update):
- """Add metadata not present in storage.
+ def get_all(self):
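+ """Yields every stored item, for all known ids and tools."""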
+ yield from self.get(list(self._tools_per_id))
+
+ def get_range(self, start, end, indexer_configuration_id, limit):
+ """Retrieve data within range [start, end] bound by limit.
Args:
- metadata (iterable): dictionaries with keys:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): the sha1 at which the next
+ page starts, if any
+
+ """
+ if limit is None:
+ raise ValueError('Development error: limit should not be None')
+ from_index = bisect.bisect_left(self._sorted_ids, start)
+ to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index)
+ if to_index - from_index > limit:
+ return {
+ 'ids': self._sorted_ids[from_index:from_index+limit],
+ 'next': self._sorted_ids[from_index+limit],
+ }
+ else:
+ return {
+ 'ids': self._sorted_ids[from_index:to_index],
+ 'next': None,
+ }
+
+ def add(self, data, conflict_update):
+ """Add data not present in storage.
+
+ Args:
+ data (iterable): dictionaries with keys:
- **id**: sha1
- - **translated_metadata**: arbitrary dict
- **indexer_configuration_id**: tool used to compute the
results
+ - arbitrary data
conflict_update (bool): Flag to determine if we want to overwrite
(true) or skip duplicates (false)
"""
- for item in metadata:
- tool_id = item['indexer_configuration_id']
- data = item['translated_metadata']
- id_ = item['id']
+ for item in data:
+ item = item.copy()
+ tool_id = item.pop('indexer_configuration_id')
+ id_ = item.pop('id')
if not conflict_update and \
tool_id in self._tools_per_id.get(id_, set()):
# Duplicate, should not be updated
continue
key = (id_, tool_id)
- self._metadata[key] = data
+ self._data[key] = item
self._tools_per_id[id_].add(tool_id)
+ if id_ not in self._sorted_ids:
+ bisect.insort(self._sorted_ids, id_)
+
+ def add_merge(self, new_data, conflict_update, merged_key):
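+ """Add data, merging the lists under `merged_key`: entries already
+ stored for the same (id, tool) pair are extended rather than
+ replaced, unless conflict_update is true."""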
+ for new_item in new_data:
+ id_ = new_item['id']
+ tool_id = new_item['indexer_configuration_id']
+ if conflict_update:
+ all_subitems = []
+ else:
+ existing = list(self.get([id_]))
+ all_subitems = [
+ old_subitem
+ for existing_item in existing
+ if existing_item['tool']['id'] == tool_id
+ for old_subitem in existing_item[merged_key]
+ ]
+ for new_subitem in new_item[merged_key]:
+ if new_subitem not in all_subitems:
+ all_subitems.append(new_subitem)
+ self.add([
+ {
+ 'id': id_,
+ 'indexer_configuration_id': tool_id,
+ merged_key: all_subitems,
+ }
+ ], conflict_update=True)
+ if id_ not in self._sorted_ids:
+ bisect.insort(self._sorted_ids, id_)
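
A usage sketch of add_merge with a hypothetical tool row and id, showing how successive 'licenses' lists for the same (id, tool) pair accumulate instead of overwriting each other:

    sub = SubStorage(tools={1: {'id': 1, 'tool_name': 'nomos',
                                'tool_version': '3.1',
                                'tool_configuration': {}}})
    id_ = b'\x01' * 20
    for new_licenses in (['GPL-3.0+'], ['MIT']):
        sub.add_merge([{'id': id_, 'indexer_configuration_id': 1,
                        'licenses': new_licenses}],
                      conflict_update=False, merged_key='licenses')
    assert next(sub.get([id_]))['licenses'] == ['GPL-3.0+', 'MIT']
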
class IndexerStorage:
@@ -99,8 +182,296 @@
def __init__(self):
self._tools = {}
- self._content_metadata = MetadataStorage(self._tools)
- self._revision_metadata = MetadataStorage(self._tools)
+ self._mimetypes = SubStorage(self._tools)
+ self._languages = SubStorage(self._tools)
+ self._content_ctags = SubStorage(self._tools)
+ self._licenses = SubStorage(self._tools)
+ self._content_metadata = SubStorage(self._tools)
+ self._revision_metadata = SubStorage(self._tools)
+ self._origin_intrinsic_metadata = SubStorage(self._tools)
+
+ def content_mimetype_missing(self, mimetypes):
+ """Generate mimetypes missing from storage.
+
+ Args:
+ mimetypes (iterable): iterable of dict with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute the
+ results
+
+ Yields:
+ an iterable of missing id for the tuple (id, indexer_configuration_id)
+
+ """
+ yield from self._mimetypes.missing(mimetypes)
+
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve mimetypes within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): the sha1 at which the next
+ page starts, if any
+
+ """
+ return self._mimetypes.get_range(
+ start, end, indexer_configuration_id, limit)
+
+ def content_mimetype_add(self, mimetypes, conflict_update=False):
+ """Add mimetypes not present in storage.
+
+ Args:
+ mimetypes (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **mimetype** (bytes): raw content's mimetype
+ - **encoding** (bytes): raw content's encoding
+ - **indexer_configuration_id** (int): tool's id used to
+ compute the results
+
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (``True``) or skip duplicates (``False``, the
+ default)
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in mimetypes):
+ raise TypeError('identifiers must be bytes.')
+ self._mimetypes.add(mimetypes, conflict_update)
+
+ def content_mimetype_get(self, ids, db=None, cur=None):
+ """Retrieve full content mimetype per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ mimetypes (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **mimetype** (bytes): raw content's mimetype
+ - **encoding** (bytes): raw content's encoding
+ - **tool** (dict): Tool used to compute the mimetype
+
+ """
+ yield from self._mimetypes.get(ids)
+
+ def content_language_missing(self, languages):
+ """List languages missing from storage.
+
+ Args:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute
+ the results
+
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
+
+ """
+ yield from self._languages.missing(languages)
+
+ def content_language_get(self, ids):
+ """Retrieve full content language per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **lang** (bytes): raw content's language
+ - **tool** (dict): Tool used to compute the language
+
+ """
+ yield from self._languages.get(ids)
+
+ def content_language_add(self, languages, conflict_update=False):
+ """Add languages not present in storage.
+
+ Args:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1
+ - **lang** (bytes): language detected
+
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (true) or skip duplicates (false, the
+ default)
+
+ """
+ self._languages.add(languages, conflict_update)
+
+ def content_ctags_missing(self, ctags):
+ """List ctags missing from storage.
+
+ Args:
+ ctags (iterable): dicts with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute
+ the results
+
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
+
+ """
+ yield from self._content_ctags.missing(ctags)
+
+ def content_ctags_get(self, ids):
+ """Retrieve ctags per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ Dictionaries with keys:
+
+ - **id** (bytes): content's identifier
+ - **name** (str): symbol's name
+ - **kind** (str): symbol's kind
+ - **lang** (str): language for that content
+ - **tool** (dict): tool used to compute the ctags' info
+
+ """
+ for item in self._content_ctags.get(ids):
+ for item_ctags_item in item['ctags']:
+ yield {
+ 'id': item['id'],
+ 'tool': item['tool'],
+ **item_ctags_item
+ }
+
+ def content_ctags_add(self, ctags, conflict_update=False):
+ """Add ctags not present in storage
+
+ Args:
+ ctags (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1
+ - **ctags** (list): List of dictionaries with keys: name, kind,
+ line, lang
+ - **indexer_configuration_id**: tool used to compute the
+ results
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in ctags):
+ raise TypeError('identifiers must be bytes.')
+ self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
+
+ def content_ctags_search(self, expression,
+ limit=10, last_sha1=None, db=None, cur=None):
+ """Search through content's raw ctags symbols.
+
+ Args:
+ expression (str): Expression to search for
+ limit (int): Number of rows to return (default to 10).
+ last_sha1 (str): Offset from which to retrieve data (default: None).
+
+ Yields:
+ rows of ctags including id, name, lang, kind, line, etc.
+
+ """
+ nb_matches = 0
+ for ((id_, tool_id), item) in \
+ sorted(self._content_ctags._data.items()):
+ if id_ <= (last_sha1 or bytes(SHA1_DIGEST_SIZE // 8)):
+ continue
+ for ctags_item in item['ctags']:
+ if ctags_item['name'] != expression:
+ continue
+ nb_matches += 1
+ yield {
+ 'id': id_,
+ 'tool': _transform_tool(self._tools[tool_id]),
+ **ctags_item
+ }
+ if nb_matches >= limit:
+ return
+
+ def content_fossology_license_get(self, ids):
+ """Retrieve licenses per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ `{id: facts}` where `facts` is a list of dicts with the following keys:
+
+ - **licenses** ([str]): associated licenses for that content
+ - **tool** (dict): Tool used to compute the license
+
+ """
+ # TODO: remove this reformatting in order to yield items with the
+ # same format as other _get methods.
+ res = {}
+ for d in self._licenses.get(ids):
+ res.setdefault(d.pop('id'), []).append(d)
+ for (id_, facts) in res.items():
+ yield {id_: facts}
+
+ def content_fossology_license_add(self, licenses, conflict_update=False):
+ """Add licenses not present in storage.
+
+ Args:
+ licenses (iterable): dictionaries with keys:
+
+ - **id**: sha1
+ - **licenses** ([bytes]): List of licenses associated with the sha1
+ - **tool** (str): nomossa
+
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ Returns:
+ list: content_license entries which failed due to unknown licenses
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in licenses):
+ raise TypeError('identifiers must be bytes.')
+ self._licenses.add_merge(licenses, conflict_update, 'licenses')
+
+ def content_fossology_license_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve licenses within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): the sha1 at which the next
+ page starts, if any
+
+ """
+ return self._licenses.get_range(
+ start, end, indexer_configuration_id, limit)
def content_metadata_missing(self, metadata):
"""List metadata missing from storage.
@@ -149,6 +520,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._content_metadata.add(metadata, conflict_update)
def revision_metadata_missing(self, metadata):
@@ -197,8 +570,101 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._revision_metadata.add(metadata, conflict_update)
+ def origin_intrinsic_metadata_get(self, ids):
+ """Retrieve origin metadata per id.
+
+ Args:
+ ids (iterable): origin identifiers
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **translated_metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ for item in self._origin_intrinsic_metadata.get(ids):
+ item['origin_id'] = item.pop('id')
+ yield item
+
+ def origin_intrinsic_metadata_add(self, metadata,
+ conflict_update=False):
+ """Add origin metadata not present in storage.
+
+ Args:
+ metadata (iterable): dictionaries with keys:
+
+ - **origin_id**: origin identifier
+ - **from_revision**: sha1 id of the revision used to generate
+ these metadata.
+ - **metadata**: arbitrary dict
+ - **indexer_configuration_id**: tool used to compute metadata
+
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+
+ for item in metadata:
+ item = item.copy()
+ item['id'] = item.pop('origin_id')
+ self._origin_intrinsic_metadata.add([item], conflict_update)
+
+ def origin_intrinsic_metadata_search_fulltext(
+ self, conjunction, limit=100):
+ """Returns the list of origins whose metadata contain all the terms.
+
+ Args:
+ conjunction (List[str]): List of terms to be searched for.
+ limit (int): The maximum number of results to return
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **id** (int)
+ - **metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ # A very crude fulltext search implementation, but that's enough
+ # to work on English metadata
+ tokens_re = re.compile('[a-zA-Z0-9]+')
+ search_tokens = list(itertools.chain(
+ *map(tokens_re.findall, conjunction)))
+
+ def rank(data):
+ # Tokenize the metadata
+ text = json.dumps(data['metadata'])
+ text_tokens = tokens_re.findall(text)
+ text_token_occurrences = Counter(text_tokens)
+
+ # Count the number of occurrences of search tokens in the text
+ score = 0
+ for search_token in search_tokens:
+ if text_token_occurrences[search_token] == 0:
+ # Search token is not in the text.
+ return 0
+ score += text_token_occurrences[search_token]
+
+ # Normalize according to the text's length
+ return score / math.log(len(text_tokens) + 1)  # +1 avoids log(1) == 0
+
+ results = [(rank(data), data)
+ for data in self._origin_intrinsic_metadata.get_all()]
+ results = [(rank_, data) for (rank_, data) in results if rank_ > 0]
+ results.sort(key=operator.itemgetter(0), # Don't try to order 'data'
+ reverse=True)
+ results = [data for (rank_, data) in results]
+ for result in results[:limit]:
+ result = result.copy()
+ result['origin_id'] = result.pop('id')
+ yield result
+
def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -13,131 +13,9 @@
}
super().setUp()
- @pytest.mark.xfail
- def test_check_config(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_add__drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_add__drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_search(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_search_no_result(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_add__add_new_ctags_added(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_add__update_in_place(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_add__new_license_added(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_get(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext_rank(self):
- pass
+ def reset_storage_tables(self):
+ self.storage = self.storage.__class__()
@pytest.mark.xfail
- def test_indexer_configuration_metadata_get_missing_context(self):
- pass
-
- @pytest.mark.xfail
- def test_indexer_configuration_metadata_get(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_no_limit(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_no_limit_with_filter(
- self):
- pass
-
- @pytest.mark.xfail
- def test_generate_fossology_license_get_range_limit(self):
+ def test_check_config(self):
pass
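
With most xfail markers removed, these tests now exercise the in-memory backend for real. A hedged sketch of driving it by hand (fake 20-byte ids; the tool id 1 is arbitrary, and neither add nor get_range requires it to be registered):

    from swh.indexer.storage.in_memory import IndexerStorage

    storage = IndexerStorage()
    ids = [bytes([i]) * 20 for i in (1, 2, 3)]
    storage.content_mimetype_add([
        {'id': id_, 'mimetype': b'text/plain', 'encoding': b'utf-8',
         'indexer_configuration_id': 1}
        for id_ in ids])
    page = storage.content_mimetype_get_range(
        start=bytes(20), end=b'\xff' * 20,
        indexer_configuration_id=1, limit=2)
    assert page['ids'] == ids[:2] and page['next'] == ids[2]
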
diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py
--- a/swh/indexer/tests/test_ctags.py
+++ b/swh/indexer/tests/test_ctags.py
@@ -11,7 +11,7 @@
)
from swh.indexer.tests.test_utils import (
- BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest,
+ CommonContentIndexerTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG
)
@@ -99,12 +99,6 @@
'workdir': '/nowhere',
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.objstorage = MockObjStorage()
- self.tool_config = self.config['tools']['configuration']
-
class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Ctags indexer test scenarios:
@@ -113,8 +107,13 @@
- Unknown sha1 in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_ctags_get(ids)
+
def setUp(self):
self.indexer = CtagsIndexerTest()
+ self.idx_storage = self.indexer.idx_storage
# Prepare test input
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -14,10 +13,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -78,12 +76,6 @@
},
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- self.objstorage = MockObjStorage()
-
class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Language indexer test scenarios:
@@ -92,8 +84,14 @@
- Unknown sha1 in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_fossology_license_get(ids)
+
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseTestIndexer()
+ self.idx_storage = self.indexer.idx_storage
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
@@ -138,15 +136,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestFossologyLicenseRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -159,12 +148,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py
--- a/swh/indexer/tests/test_language.py
+++ b/swh/indexer/tests/test_language.py
@@ -7,8 +7,8 @@
from swh.indexer import language
from swh.indexer.language import LanguageIndexer
from swh.indexer.tests.test_utils import (
- BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest,
- CommonIndexerWithErrorsTest, CommonIndexerNoTool, BASE_TEST_CONFIG
+ CommonContentIndexerTest, CommonIndexerWithErrorsTest,
+ CommonIndexerNoTool, BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -30,12 +30,6 @@
}
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.objstorage = MockObjStorage()
- self.tool_config = self.config['tools']['configuration']
-
class Language(unittest.TestCase):
"""Tests pygments tool for language detection
@@ -60,8 +54,14 @@
- Unknown sha1 in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.indexer.idx_storage.content_language_get(ids)
+
def setUp(self):
self.indexer = LanguageTestIndexer()
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -13,10 +12,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -61,12 +59,6 @@
},
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- self.objstorage = MockObjStorage()
-
class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Mimetype indexer test scenarios:
@@ -75,8 +67,13 @@
- Unknown sha1 in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_mimetype_get(ids)
+
def setUp(self):
self.indexer = MimetypeTestIndexer()
+ self.idx_storage = self.indexer.idx_storage
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
@@ -123,15 +120,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- # sync objstorage and storage
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestMimetypeRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -144,12 +132,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = MimetypeRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -4,10 +4,12 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
+import random
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
@@ -488,6 +490,21 @@
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
+ for (obj_id, content) in OBJ_STORAGE_DATA.items():
+ if hasattr(hashlib, 'blake2s'):
+ blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
+ else:
+ # fallback for Python <3.6
+ blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
+ storage.content_add([{
+ 'data': content,
+ 'length': len(content),
+ 'status': 'visible',
+ 'sha1': hash_to_bytes(obj_id),
+ 'sha1_git': hash_to_bytes(obj_id),
+ 'sha256': hashlib.sha256(content).digest(),
+ 'blake2s256': blake2s256
+ }])
class MockStorage():
@@ -659,7 +676,15 @@
class CommonContentIndexerTest:
- def assert_results_ok(self, actual_results, expected_results=None):
+ def get_indexer_results(self, ids):
+ """Override this for indexers that don't have a mock storage."""
+ return self.indexer.idx_storage.state
+
+ def assert_results_ok(self, sha1s, expected_results=None):
+ sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
+ for sha1 in sha1s]
+ actual_results = self.get_indexer_results(sha1s)
+
if expected_results is None:
expected_results = self.expected_results
@@ -678,15 +703,12 @@
# when
self.indexer.run(sha1s, policy_update='update-dups')
- actual_results = self.indexer.idx_storage.state
- self.assertTrue(self.indexer.idx_storage.conflict_update)
- self.assert_results_ok(actual_results)
+ self.assert_results_ok(sha1s)
# 2nd pass
self.indexer.run(sha1s, policy_update='ignore-dups')
- self.assertFalse(self.indexer.idx_storage.conflict_update)
- self.assert_results_ok(actual_results)
+ self.assert_results_ok(sha1s)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
@@ -696,29 +718,35 @@
# when
self.indexer.run(sha1s, policy_update='update-dups')
- actual_results = self.indexer.idx_storage.state
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
- self.assert_results_ok(actual_results, expected_results)
+ self.assert_results_ok(sha1s, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
+ def setUp(self):
+ self.contents = sorted(OBJ_STORAGE_DATA)
+
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
+ actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
- self.assertEqual(indexed_data, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
+ assert isinstance(_id, bytes)
+ indexed_data = indexed_data.copy()
+ indexed_data['id'] = hash_to_hex(indexed_data['id'])
+ self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
+ self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
@@ -726,7 +754,8 @@
"""Indexing contents without existing data results in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # hex ids from the fixture
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
@@ -737,12 +766,13 @@
"""Indexing contents with existing data results in less indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # hex ids from the fixture
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
+ start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
@@ -756,7 +786,8 @@
"""Optimal indexing should result in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # hex ids from the fixture
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
@@ -783,8 +814,9 @@
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
+ _start, _end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)
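
The fill_storage helper above derives every hash from the content itself, with a blake2s256 fallback for interpreters that predate hashlib.blake2s (added in Python 3.6). A quick check of the primary path:

    import hashlib

    content = b'text sample'
    digest = hashlib.blake2s(content, digest_size=32).digest()
    assert len(digest) == 32  # width of the blake2s256 column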
