diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -346,7 +346,7 @@
         - **id** (bytes): content's identifier
         - **name** (str): symbol's name
         - **kind** (str): symbol's kind
-        - **language** (str): language for that content
+        - **lang** (str): language for that content
         - **tool** (dict): tool used to compute the ctags' info
@@ -365,7 +365,7 @@
         - **id** (bytes): sha1
         - **ctags** ([list): List of dictionary with keys: name, kind,
-          line, language
+          line, lang
 
     """
     def _convert_ctags(__ctags):
@@ -412,9 +412,8 @@
         ids (iterable): sha1 checksums
 
     Yields:
-        list: dictionaries with the following keys:
+        `{id: facts}` where `facts` is a list of dicts with the following keys:
 
-        - **id** (bytes)
         - **licenses** ([str]): associated licenses for that content
         - **tool** (dict): Tool used to compute the license
@@ -439,7 +438,7 @@
     licenses (iterable): dictionaries with keys:
 
         - **id**: sha1
-        - **license** ([bytes]): List of licenses associated to sha1
+        - **licenses** ([bytes]): List of licenses associated to sha1
         - **tool** (str): nomossa
 
     conflict_update: Flag to determine if we want to overwrite (true)
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -3,31 +3,35 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import bisect
 from collections import defaultdict
 import json
 
+SHA1_DIGEST_SIZE = 160  # size of a sha1 digest, in bits
 
-class MetadataStorage:
-    """Implements missing/get/add logic for both content_metadata and
-    revision_metadata."""
+
+def _transform_tool(tool):
+    return {
+        'id': tool['id'],
+        'name': tool['tool_name'],
+        'version': tool['tool_version'],
+        'configuration': tool['tool_configuration'],
+    }
+
+
+class SubStorage:
+    """Implements common missing/get/add logic for each indexer type."""
     def __init__(self, tools):
         self._tools = tools
-        self._metadata = {}  # map (id_, tool_id) -> metadata_dict
+        self._sorted_ids = []
+        self._data = {}  # map (id_, tool_id) -> data_dict
         self._tools_per_id = defaultdict(set)  # map id_ -> Set[tool_id]
 
-    def _transform_tool(self, tool):
-        return {
-            'id': tool['id'],
-            'name': tool['tool_name'],
-            'version': tool['tool_version'],
-            'configuration': tool['tool_configuration'],
-        }
-
     def missing(self, ids):
-        """List metadata missing from storage.
+        """List data missing from storage.
 
         Args:
-            metadata (iterable): dictionaries with keys:
+            ids (iterable): dictionaries with keys:
 
                 - **id** (bytes): sha1 identifier
                 - **indexer_configuration_id** (int): tool used to compute
@@ -44,7 +48,7 @@
             yield id_
 
     def get(self, ids):
-        """Retrieve metadata per id.
+        """Retrieve data per id.
 
         Args:
             ids (iterable): sha1 checksums
@@ -53,8 +57,8 @@
             dict: dictionaries with the following keys:
 
                 - **id** (bytes)
-                - **translated_metadata** (str): associated metadata
                 - **tool** (dict): tool used to compute metadata
+                - arbitrary data (as provided to `add`)
 
         """
         for id_ in ids:
@@ -62,36 +66,102 @@
                 key = (id_, tool_id)
                 yield {
                     'id': id_,
-                    'tool': self._transform_tool(self._tools[tool_id]),
-                    'translated_metadata': self._metadata[key],
+                    'tool': _transform_tool(self._tools[tool_id]),
+                    **self._data[key],
                 }
 
-    def add(self, metadata, conflict_update):
-        """Add metadata not present in storage.
+    def get_range(self, start, end, indexer_configuration_id, limit):
+        """Retrieve data within range [start, end] bound by limit.
 
         Args:
-            metadata (iterable): dictionaries with keys:
+            **start** (bytes): Starting identifier range (expected smaller
+                than end)
+            **end** (bytes): Ending identifier range (expected larger
+                than start)
+            **indexer_configuration_id** (int): The tool used to index data
+            **limit** (int): Limit result
+
+        Raises:
+            ValueError: if limit is None
+
+        Returns:
+            a dict with keys:
+
+            - **ids** [bytes]: iterable of content ids within the range.
+            - **next** (Optional[bytes]): The next range of sha1 starts at
+              this sha1 if any
+
+        """
+        if limit is None:
+            raise ValueError('Development error: limit should not be None')
+        from_index = bisect.bisect_left(self._sorted_ids, start)
+        to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index)
+        if to_index - from_index > limit:
+            # more ids than the limit remain in the range: truncate, and
+            # report the first id left out as the start of the next page
+            return {
+                'ids': self._sorted_ids[from_index:from_index+limit],
+                'next': self._sorted_ids[from_index+limit],
+            }
+        else:
+            return {
+                'ids': self._sorted_ids[from_index:to_index],
+                'next': None,
+            }
+
+    def add(self, data, conflict_update):
+        """Add data not present in storage.
+
+        Args:
+            data (iterable): dictionaries with keys:
 
                 - **id**: sha1
-                - **translated_metadata**: arbitrary dict
                 - **indexer_configuration_id**: tool used to compute the
                   results
+                - arbitrary data
 
             conflict_update (bool): Flag to determine if we want to
                 overwrite (true) or skip duplicates (false)
 
         """
-        for item in metadata:
-            tool_id = item['indexer_configuration_id']
-            data = item['translated_metadata']
-            id_ = item['id']
+        for item in data:
+            item = item.copy()
+            tool_id = item.pop('indexer_configuration_id')
+            id_ = item.pop('id')
             if not conflict_update and \
                tool_id in self._tools_per_id.get(id_, set()):
                 # Duplicate, should not be updated
                 continue
             key = (id_, tool_id)
-            self._metadata[key] = data
+            self._data[key] = item
             self._tools_per_id[id_].add(tool_id)
+            if id_ not in self._sorted_ids:
+                bisect.insort(self._sorted_ids, id_)
+
+    def add_merge(self, new_data, conflict_update, merged_key):
+        """Add new data, merging the list stored under ``merged_key``
+        with the existing one: with conflict_update=False, new subitems
+        are appended to it (skipping duplicates); with
+        conflict_update=True, the existing subitems are discarded and
+        replaced."""
+        for new_item in new_data:
+            id_ = new_item['id']
+            tool_id = new_item['indexer_configuration_id']
+            if conflict_update:
+                all_subitems = []
+            else:
+                existing = list(self.get([id_]))
+                all_subitems = [
+                    old_subitem
+                    for existing_item in existing
+                    if existing_item['tool']['id'] == tool_id
+                    for old_subitem in existing_item[merged_key]
+                ]
+            for new_subitem in new_item[merged_key]:
+                if new_subitem not in all_subitems:
+                    all_subitems.append(new_subitem)
+            self.add([
+                {
+                    'id': id_,
+                    'indexer_configuration_id': tool_id,
+                    merged_key: all_subitems,
+                }
+            ], conflict_update=True)
 
 
 class IndexerStorage:
@@ -99,8 +169,214 @@
     def __init__(self):
         self._tools = {}
-        self._content_metadata = MetadataStorage(self._tools)
-        self._revision_metadata = MetadataStorage(self._tools)
+        self._mimetypes = SubStorage(self._tools)
+        self._content_ctags = SubStorage(self._tools)
+        self._licenses = SubStorage(self._tools)
+        self._content_metadata = SubStorage(self._tools)
+        self._revision_metadata = SubStorage(self._tools)
+
+    def content_mimetype_missing(self, mimetypes):
+        """Generate mimetypes missing from storage.
+
+        Args:
+            mimetypes (iterable): iterable of dict with keys:
+
+                - **id** (bytes): sha1 identifier
+                - **indexer_configuration_id** (int): tool used to compute
+                  the results
+
+        Yields:
+            the ids, among the given (id, indexer_configuration_id) pairs,
+            that are missing from storage
+
+        """
+        yield from self._mimetypes.missing(mimetypes)
+
+    def content_mimetype_add(self, mimetypes, conflict_update=False):
+        """Add mimetypes not present in storage.
+
+        Args:
+            mimetypes (iterable): dictionaries with keys:
+
+                - **id** (bytes): sha1 identifier
+                - **mimetype** (bytes): raw content's mimetype
+                - **encoding** (bytes): raw content's encoding
+                - **indexer_configuration_id** (int): tool's id used to
+                  compute the results
+
+            conflict_update (bool): Flag to determine if we want to
+                overwrite (``True``) or skip duplicates (``False``, the
+                default)
+
+        """
+        self._mimetypes.add(mimetypes, conflict_update)
+
+    def content_mimetype_get(self, ids, db=None, cur=None):
+        """Retrieve full content mimetype per ids.
+
+        Args:
+            ids (iterable): sha1 identifier
+
+        Yields:
+            mimetypes (iterable): dictionaries with keys:
+
+                - **id** (bytes): sha1 identifier
+                - **mimetype** (bytes): raw content's mimetype
+                - **encoding** (bytes): raw content's encoding
+                - **tool** (dict): Tool used to compute the mimetype
+
+        """
+        yield from self._mimetypes.get(ids)
+
+    def content_ctags_missing(self, ctags):
+        """List ctags missing from storage.
+
+        Args:
+            ctags (iterable): dicts with keys:
+
+                - **id** (bytes): sha1 identifier
+                - **indexer_configuration_id** (int): tool used to compute
+                  the results
+
+        Yields:
+            the ids, among the given (id, indexer_configuration_id) pairs,
+            that are missing from storage
+
+        """
+        yield from self._content_ctags.missing(ctags)
+
+    def content_ctags_get(self, ids):
+        """Retrieve ctags per id.
+
+        Args:
+            ids (iterable): sha1 checksums
+
+        Yields:
+            Dictionaries with keys:
+
+                - **id** (bytes): content's identifier
+                - **name** (str): symbol's name
+                - **kind** (str): symbol's kind
+                - **lang** (str): language for that content
+                - **tool** (dict): tool used to compute the ctags' info
+
+        """
+        for item in self._content_ctags.get(ids):
+            for item_ctags_item in item['ctags']:
+                yield {
+                    'id': item['id'],
+                    'tool': item['tool'],
+                    **item_ctags_item
+                }
+
+    def content_ctags_add(self, ctags, conflict_update=False):
+        """Add ctags not present in storage.
+
+        Args:
+            ctags (iterable): dictionaries with keys:
+
+                - **id** (bytes): sha1
+                - **ctags** (list): list of dictionaries with keys: name,
+                  kind, line, lang
+                - **indexer_configuration_id**: tool used to compute the
+                  results
+
+        """
+        self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
+
+    def content_ctags_search(self, expression,
+                             limit=10, last_sha1=None, db=None, cur=None):
+        """Search through content's raw ctags symbols.
+
+        Args:
+            expression (str): Expression to search for
+            limit (int): Number of rows to return (default to 10).
+            last_sha1 (str): Offset from which to retrieve results
+                (defaults to None, meaning no offset).
+
+        Yields:
+            rows of ctags including id, name, lang, kind, line, etc.
+
+        """
+        nb_matches = 0
+        for ((id_, tool_id), item) in \
+                sorted(self._content_ctags._data.items()):
+            if last_sha1 is not None and id_ <= last_sha1:
+                continue
+            for ctags_item in item['ctags']:
+                if ctags_item['name'] != expression:
+                    continue
+                nb_matches += 1
+                yield {
+                    'id': id_,
+                    'tool': _transform_tool(self._tools[tool_id]),
+                    **ctags_item
+                }
+            # stop only at an id boundary, so that pagination on
+            # last_sha1 does not lose rows of a partially-emitted id
+            if nb_matches >= limit:
+                return
+
+    def content_fossology_license_get(self, ids):
+        """Retrieve licenses per id.
+
+        Args:
+            ids (iterable): sha1 checksums
+
+        Yields:
+            `{id: facts}` where `facts` is a list of dicts with the
+            following keys:
+
+                - **licenses** ([str]): associated licenses for that content
+                - **tool** (dict): Tool used to compute the license
+
+        """
+        # TODO: remove this reformatting in order to yield items with the
+        # same format as other _get methods.
+        res = {}
+        for d in self._licenses.get(ids):
+            res.setdefault(d.pop('id'), []).append(d)
+        for (id_, facts) in res.items():
+            yield {id_: facts}
+
+    def content_fossology_license_add(self, licenses, conflict_update=False):
+        """Add licenses not present in storage.
+
+        Args:
+            licenses (iterable): dictionaries with keys:
+
+                - **id**: sha1
+                - **licenses** ([bytes]): List of licenses associated to sha1
+                - **tool** (str): nomossa
+
+            conflict_update: Flag to determine if we want to overwrite (true)
+                or skip duplicates (false, the default)
+
+        Returns:
+            list: content_license entries which failed due to unknown licenses
+
+        """
+        self._licenses.add_merge(licenses, conflict_update, 'licenses')
+
+    def content_fossology_license_get_range(
+            self, start, end, indexer_configuration_id, limit=1000):
+        """Retrieve licenses within range [start, end] bound by limit.
+
+        Args:
+            **start** (bytes): Starting identifier range (expected smaller
+                than end)
+            **end** (bytes): Ending identifier range (expected larger
+                than start)
+            **indexer_configuration_id** (int): The tool used to index data
+            **limit** (int): Limit result (default to 1000)
+
+        Raises:
+            ValueError: if limit is None
+
+        Returns:
+            a dict with keys:
+
+            - **ids** [bytes]: iterable of content ids within the range.
+            - **next** (Optional[bytes]): The next range of sha1 starts at
+              this sha1 if any
+
+        """
+        return self._licenses.get_range(
+            start, end, indexer_configuration_id, limit)
 
     def content_metadata_missing(self, metadata):
         """List metadata missing from storage.
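
Note on the pagination contract implemented by `get_range` above: callers
resume by passing the returned `next` back as the new `start`. A minimal
sketch, assuming an in-memory `IndexerStorage` populated beforehand via
`content_fossology_license_add` (the tool id below is a made-up
placeholder, not part of the API):

    from swh.indexer.storage.in_memory import IndexerStorage

    storage = IndexerStorage()
    # ... storage.content_fossology_license_add(...) calls would go here ...
    TOOL_ID = 1  # hypothetical indexer_configuration_id

    start, end = b'\x00' * 20, b'\xff' * 20  # the whole sha1 space
    while start is not None:
        page = storage.content_fossology_license_get_range(
            start, end, indexer_configuration_id=TOOL_ID, limit=1000)
        for id_ in page['ids']:
            print(id_.hex())  # at most `limit` ids per page, in order
        start = page['next']  # first id of the next page, or None

Since `next` is the first id that was *not* returned, restarting with
`start = next` neither skips nor repeats ids.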
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -13,24 +13,11 @@
         }
         super().setUp()
 
-    @pytest.mark.xfail
-    def test_check_config(self):
-        pass
+    def reset_storage_tables(self):
+        self.storage = self.storage.__class__()
 
     @pytest.mark.xfail
-    def test_content_mimetype_missing(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_mimetype_add__drop_duplicate(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_mimetype_add__update_in_place_duplicate(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_mimetype_get(self):
+    def test_check_config(self):
         pass
 
     @pytest.mark.xfail
@@ -50,42 +37,6 @@
         pass
 
     @pytest.mark.xfail
-    def test_content_ctags_missing(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_ctags_get(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_ctags_search(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_ctags_search_no_result(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_ctags_add__add_new_ctags_added(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_ctags_add__update_in_place(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_fossology_license_get(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_fossology_license_add__new_license_added(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_fossology_license_add__update_in_place_duplicate(self):
-        pass
-
-    @pytest.mark.xfail
     def test_origin_intrinsic_metadata_get(self):
         pass
 
@@ -124,20 +75,3 @@
     @pytest.mark.xfail
     def test_generate_content_mimetype_get_range_limit(self, mimetypes):
         pass
-
-    @pytest.mark.xfail
-    def test_generate_content_fossology_license_get_range_limit_none(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_generate_content_fossology_license_get_range_no_limit(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_generate_content_fossology_license_get_range_no_limit_with_filter(
-            self):
-        pass
-
-    @pytest.mark.xfail
-    def test_generate_fossology_license_get_range_limit(self):
-        pass
diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py
--- a/swh/indexer/tests/test_ctags.py
+++ b/swh/indexer/tests/test_ctags.py
@@ -11,7 +11,7 @@
 )
 
 from swh.indexer.tests.test_utils import (
-    BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest,
+    CommonContentIndexerTest,
     CommonIndexerWithErrorsTest, CommonIndexerNoTool, SHA1_TO_CTAGS,
     NoDiskIndexer, BASE_TEST_CONFIG
 )
@@ -99,12 +99,6 @@
         'workdir': '/nowhere',
     }
 
-    def prepare(self):
-        super().prepare()
-        self.idx_storage = BasicMockIndexerStorage()
-        self.objstorage = MockObjStorage()
-        self.tool_config = self.config['tools']['configuration']
-
 
 class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase):
     """Ctags indexer test scenarios:
@@ -113,8 +107,13 @@
     - Unknown sha1 in the input list are not indexed
 
    """
+
+    def get_indexer_results(self, ids):
+        yield from self.idx_storage.content_ctags_get(ids)
+
     def setUp(self):
         self.indexer = CtagsIndexerTest()
+        self.idx_storage = self.indexer.idx_storage
 
         # Prepare test input
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -78,12 +78,6 @@
         },
     }
 
-    def prepare(self):
-        super().prepare()
-        self.idx_storage = BasicMockIndexerStorage()
-        self.log = logging.getLogger('swh.indexer')
-        self.objstorage = MockObjStorage()
-
 
 class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
     """Language indexer test scenarios:
@@ -92,8 +86,13 @@
     - Unknown sha1 in the input list are not indexed
 
     """
+
+    def get_indexer_results(self, ids):
+        yield from self.idx_storage.content_fossology_license_get(ids)
+
     def setUp(self):
         self.indexer = FossologyLicenseTestIndexer()
+        self.idx_storage = self.indexer.idx_storage
 
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -4,7 +4,6 @@
 # See top-level LICENSE file for more information
 
 import unittest
-import logging
 
 from unittest.mock import patch
 
@@ -61,12 +60,6 @@
         },
     }
 
-    def prepare(self):
-        super().prepare()
-        self.idx_storage = BasicMockIndexerStorage()
-        self.log = logging.getLogger('swh.indexer')
-        self.objstorage = MockObjStorage()
-
 
 class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase):
     """Mimetype indexer test scenarios:
@@ -75,8 +68,13 @@
     - Unknown sha1 in the input list are not indexed
 
     """
+
+    def get_indexer_results(self, ids):
+        yield from self.idx_storage.content_mimetype_get(ids)
+
     def setUp(self):
         self.indexer = MimetypeTestIndexer()
+        self.idx_storage = self.indexer.idx_storage
 
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -659,7 +659,13 @@
 
 class CommonContentIndexerTest:
-    def assert_results_ok(self, actual_results, expected_results=None):
+    def get_indexer_results(self, ids):
+        """Return the results the indexer stored for `ids`. The default
+        reads a mock storage's recorded state; override this for indexers
+        that use a real (e.g. in-memory) storage."""
+        return self.indexer.idx_storage.state
+
+    def assert_results_ok(self, sha1s, expected_results=None):
+        actual_results = self.get_indexer_results(sha1s)
+
         if expected_results is None:
             expected_results = self.expected_results
 
@@ -678,15 +684,12 @@
         # when
         self.indexer.run(sha1s, policy_update='update-dups')
 
-        actual_results = self.indexer.idx_storage.state
-        self.assertTrue(self.indexer.idx_storage.conflict_update)
-        self.assert_results_ok(actual_results)
+        self.assert_results_ok(sha1s)
 
         # 2nd pass
         self.indexer.run(sha1s, policy_update='ignore-dups')
 
-        self.assertFalse(self.indexer.idx_storage.conflict_update)
-        self.assert_results_ok(actual_results)
+        self.assert_results_ok(sha1s)
 
     def test_index_one_unknown_sha1(self):
         """Unknown sha1 are not indexed"""
@@ -696,14 +699,13 @@
         # when
         self.indexer.run(sha1s, policy_update='update-dups')
 
-        actual_results = self.indexer.idx_storage.state
-
         # then
         expected_results = {
             k: v for k, v in self.expected_results.items()
             if k in sha1s
         }
 
-        self.assert_results_ok(actual_results, expected_results)
+        self.assert_results_ok(sha1s, expected_results)
 
 
 class CommonContentIndexerRangeTest:
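
Postscript on the merge semantics of `SubStorage.add_merge` (used by
`content_ctags_add` and `content_fossology_license_add`): with
`conflict_update=False` the list stored under `merged_key` accumulates,
skipping exact duplicates; with `conflict_update=True` the previous list is
discarded. A small sketch, assuming the module layout from this diff (the
tool dict and sha1 below are placeholder values):

    from swh.indexer.storage.in_memory import SubStorage

    # Hypothetical tool registry; in IndexerStorage this dict is shared
    # with the tool-registration code.
    tools = {7: {'id': 7, 'tool_name': 'nomos', 'tool_version': '3.1',
                 'tool_configuration': {}}}
    sub = SubStorage(tools)
    item = {'id': b'\x01' * 20, 'indexer_configuration_id': 7}

    sub.add_merge([{**item, 'licenses': ['GPL-2.0']}],
                  conflict_update=False, merged_key='licenses')
    # Merges with the existing entry, skipping the duplicate 'GPL-2.0':
    sub.add_merge([{**item, 'licenses': ['MIT', 'GPL-2.0']}],
                  conflict_update=False, merged_key='licenses')
    print(next(sub.get([b'\x01' * 20]))['licenses'])
    # -> ['GPL-2.0', 'MIT']; with conflict_update=True the second call
    # would have replaced the list with ['MIT', 'GPL-2.0'] instead.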