diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -346,7 +346,7 @@ - **id** (bytes): content's identifier - **name** (str): symbol's name - **kind** (str): symbol's kind - - **language** (str): language for that content + - **lang** (str): language for that content - **tool** (dict): tool used to compute the ctags' info @@ -365,7 +365,7 @@ - **id** (bytes): sha1 - **ctags** ([list): List of dictionary with keys: name, kind, - line, language + line, lang """ def _convert_ctags(__ctags): @@ -412,9 +412,8 @@ ids (iterable): sha1 checksums Yields: - list: dictionaries with the following keys: + `{id: facts}` where `facts` is a dict with the following keys: - - **id** (bytes) - **licenses** ([str]): associated licenses for that content - **tool** (dict): Tool used to compute the license @@ -439,7 +438,7 @@ licenses (iterable): dictionaries with keys: - **id**: sha1 - - **license** ([bytes]): List of licenses associated to sha1 + - **licenses** ([bytes]): List of licenses associated to sha1 - **tool** (str): nomossa conflict_update: Flag to determine if we want to overwrite (true) diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -6,28 +6,30 @@ from collections import defaultdict import json +SHA1_DIGEST_SIZE = 160 -class MetadataStorage: - """Implements missing/get/add logic for both content_metadata and - revision_metadata.""" + +def _transform_tool(tool): + return { + 'id': tool['id'], + 'name': tool['tool_name'], + 'version': tool['tool_version'], + 'configuration': tool['tool_configuration'], + } + + +class SubStorage: + """Implements common missing/get/add logic for each indexer type.""" def __init__(self, tools): self._tools = tools - self._metadata = {} # map (id_, tool_id) -> metadata_dict + self._data = {} # map (id_, tool_id) -> metadata_dict self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] - def _transform_tool(self, tool): - return { - 'id': tool['id'], - 'name': tool['tool_name'], - 'version': tool['tool_version'], - 'configuration': tool['tool_configuration'], - } - def missing(self, ids): - """List metadata missing from storage. + """List data missing from storage. Args: - metadata (iterable): dictionaries with keys: + data (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute @@ -44,7 +46,7 @@ yield id_ def get(self, ids): - """Retrieve metadata per id. + """Retrieve data per id. Args: ids (iterable): sha1 checksums @@ -53,8 +55,8 @@ dict: dictionaries with the following keys: - **id** (bytes) - - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata + - arbitrary data (as provided to `add`) """ for id_ in ids: @@ -62,45 +64,199 @@ key = (id_, tool_id) yield { 'id': id_, - 'tool': self._transform_tool(self._tools[tool_id]), - 'translated_metadata': self._metadata[key], + 'tool': _transform_tool(self._tools[tool_id]), + **self._data[key], } - def add(self, metadata, conflict_update): - """Add metadata not present in storage. + def add(self, data, conflict_update): + """Add data not present in storage. Args: - metadata (iterable): dictionaries with keys: + data (iterable): dictionaries with keys: - **id**: sha1 - - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute the results + - arbitrary data conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false) """ - for item in metadata: - tool_id = item['indexer_configuration_id'] - data = item['translated_metadata'] - id_ = item['id'] + for item in data: + item = item.copy() + tool_id = item.pop('indexer_configuration_id') + id_ = item.pop('id') + data = item if not conflict_update and \ tool_id in self._tools_per_id.get(id_, set()): # Duplicate, should not be updated continue key = (id_, tool_id) - self._metadata[key] = data + self._data[key] = data self._tools_per_id[id_].add(tool_id) + def add_merge(self, new_data, conflict_update, merged_key): + for new_item in new_data: + tool_id = new_item['indexer_configuration_id'] + if conflict_update: + all_subitems = [] + else: + existing = list(self.get([new_item['id']])) + all_subitems = [ + old_subitem + for existing_item in existing + if existing_item['tool']['id'] == tool_id + for old_subitem in existing_item[merged_key] + ] + for new_subitem in new_item[merged_key]: + if new_subitem not in all_subitems: + all_subitems.append(new_subitem) + self.add([ + { + 'id': new_item['id'], + 'indexer_configuration_id': tool_id, + merged_key: all_subitems, + } + ], conflict_update=True) + class IndexerStorage: """In-memory SWH indexer storage.""" def __init__(self): self._tools = {} - self._content_metadata = MetadataStorage(self._tools) - self._revision_metadata = MetadataStorage(self._tools) + self._content_ctags = SubStorage(self._tools) + self._licenses = SubStorage(self._tools) + self._content_metadata = SubStorage(self._tools) + self._revision_metadata = SubStorage(self._tools) + + def content_ctags_missing(self, ctags): + """List ctags missing from storage. + + Args: + ctags (iterable): dicts with keys: + + - **id** (bytes): sha1 identifier + - **indexer_configuration_id** (int): tool used to compute + the results + + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) + + """ + yield from self._content_ctags.missing(ctags) + + def content_ctags_get(self, ids): + """Retrieve ctags per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + Dictionaries with keys: + + - **id** (bytes): content's identifier + - **name** (str): symbol's name + - **kind** (str): symbol's kind + - **lang** (str): language for that content + - **tool** (dict): tool used to compute the ctags' info + + + """ + for item in self._content_ctags.get(ids): + for item_ctags_item in item['ctags']: + yield { + 'id': item['id'], + 'tool': item['tool'], + **item_ctags_item + } + + def content_ctags_add(self, ctags, conflict_update=False): + """Add ctags not present in storage + + Args: + ctags (iterable): dictionaries with keys: + + - **id** (bytes): sha1 + - **ctags** ([list): List of dictionary with keys: name, kind, + line, lang + - **indexer_configuration_id**: tool used to compute the + results + + """ + self._content_ctags.add_merge(ctags, conflict_update, 'ctags') + + def content_ctags_search(self, expression, + limit=10, last_sha1=None, db=None, cur=None): + """Search through content's raw ctags symbols. + + Args: + expression (str): Expression to search for + limit (int): Number of rows to return (default to 10). + last_sha1 (str): Offset from which retrieving data (default to ''). + + Yields: + rows of ctags including id, name, lang, kind, line, etc... + + """ + nb_matches = 0 + for ((id_, tool_id), item) in \ + sorted(self._content_ctags._data.items()): + if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): + continue + nb_matches += 1 + for ctags_item in item['ctags']: + if ctags_item['name'] != expression: + continue + yield { + 'id': id_, + 'tool': _transform_tool(self._tools[tool_id]), + **ctags_item + } + if nb_matches >= limit: + return + + def content_fossology_license_get(self, ids): + """Retrieve licenses per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + `{id: facts}` where `facts` is a dict with the following keys: + + - **licenses** ([str]): associated licenses for that content + - **tool** (dict): Tool used to compute the license + + """ + # TODO: remove this reformatting in order to yield items with the + # same format as other _get methods. + res = {} + for d in self._licenses.get(ids): + res.setdefault(d.pop('id'), []).append(d) + for (id_, facts) in res.items(): + yield {id_: facts} + + def content_fossology_license_add(self, licenses, conflict_update=False): + """Add licenses not present in storage. + + Args: + licenses (iterable): dictionaries with keys: + + - **id**: sha1 + - **licenses** ([bytes]): List of licenses associated to sha1 + - **tool** (str): nomossa + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + Returns: + list: content_license entries which failed due to unknown licenses + + """ + self._licenses.add_merge(licenses, conflict_update, 'licenses') def content_metadata_missing(self, metadata): """List metadata missing from storage. diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py --- a/swh/indexer/tests/storage/test_in_memory.py +++ b/swh/indexer/tests/storage/test_in_memory.py @@ -50,42 +50,6 @@ pass @pytest.mark.xfail - def test_content_ctags_missing(self): - pass - - @pytest.mark.xfail - def test_content_ctags_get(self): - pass - - @pytest.mark.xfail - def test_content_ctags_search(self): - pass - - @pytest.mark.xfail - def test_content_ctags_search_no_result(self): - pass - - @pytest.mark.xfail - def test_content_ctags_add__add_new_ctags_added(self): - pass - - @pytest.mark.xfail - def test_content_ctags_add__update_in_place(self): - pass - - @pytest.mark.xfail - def test_content_fossology_license_get(self): - pass - - @pytest.mark.xfail - def test_content_fossology_license_add__new_license_added(self): - pass - - @pytest.mark.xfail - def test_content_fossology_license_add__update_in_place_duplicate(self): - pass - - @pytest.mark.xfail def test_origin_intrinsic_metadata_get(self): pass diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py --- a/swh/indexer/tests/test_ctags.py +++ b/swh/indexer/tests/test_ctags.py @@ -11,7 +11,7 @@ ) from swh.indexer.tests.test_utils import ( - BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest, + CommonContentIndexerTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG ) @@ -99,12 +99,6 @@ 'workdir': '/nowhere', } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - self.objstorage = MockObjStorage() - self.tool_config = self.config['tools']['configuration'] - class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase): """Ctags indexer test scenarios: @@ -113,8 +107,13 @@ - Unknown sha1 in the input list are not indexed """ + + def get_indexer_results(self, ids): + yield from self.idx_storage.content_ctags_get(ids) + def setUp(self): self.indexer = CtagsIndexerTest() + self.idx_storage = self.indexer.idx_storage # Prepare test input self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -659,7 +659,13 @@ class CommonContentIndexerTest: - def assert_results_ok(self, actual_results, expected_results=None): + def get_indexer_results(self, ids): + """Override this for indexers that don't have a mock storage.""" + return self.indexer.idx_storage.state + + def assert_results_ok(self, sha1s, expected_results=None): + actual_results = self.get_indexer_results(sha1s) + if expected_results is None: expected_results = self.expected_results @@ -678,15 +684,12 @@ # when self.indexer.run(sha1s, policy_update='update-dups') - actual_results = self.indexer.idx_storage.state - self.assertTrue(self.indexer.idx_storage.conflict_update) - self.assert_results_ok(actual_results) + self.assert_results_ok(sha1s) # 2nd pass self.indexer.run(sha1s, policy_update='ignore-dups') - self.assertFalse(self.indexer.idx_storage.conflict_update) - self.assert_results_ok(actual_results) + self.assert_results_ok(sha1s) def test_index_one_unknown_sha1(self): """Unknown sha1 are not indexed""" @@ -696,14 +699,13 @@ # when self.indexer.run(sha1s, policy_update='update-dups') - actual_results = self.indexer.idx_storage.state # then expected_results = { k: v for k, v in self.expected_results.items() if k in sha1s } - self.assert_results_ok(actual_results, expected_results) + self.assert_results_ok(sha1s, expected_results) class CommonContentIndexerRangeTest: