diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -346,7 +346,7 @@ - **id** (bytes): content's identifier - **name** (str): symbol's name - **kind** (str): symbol's kind - - **language** (str): language for that content + - **lang** (str): language for that content - **tool** (dict): tool used to compute the ctags' info @@ -365,7 +365,7 @@ - **id** (bytes): sha1 - **ctags** ([list): List of dictionary with keys: name, kind, - line, language + line, lang """ def _convert_ctags(__ctags): diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -6,28 +6,31 @@ from collections import defaultdict import json +SHA1_DIGEST_SIZE = 160 -class MetadataStorage: - """Implements missing/get/add logic for both content_metadata and - revision_metadata.""" - def __init__(self, tools): + +def _transform_tool(tool): + return { + 'id': tool['id'], + 'name': tool['tool_name'], + 'version': tool['tool_version'], + 'configuration': tool['tool_configuration'], + } + + +class SubStorage: + """Implements common missing/get/add logic for each indexer type.""" + def __init__(self, tools, columns): + self._columns = columns self._tools = tools - self._metadata = {} # map (id_, tool_id) -> metadata_dict + self._data = {} # map (id_, tool_id) -> metadata_dict self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] - def _transform_tool(self, tool): - return { - 'id': tool['id'], - 'name': tool['tool_name'], - 'version': tool['tool_version'], - 'configuration': tool['tool_configuration'], - } - def missing(self, ids): - """List metadata missing from storage. + """List data missing from storage. Args: - metadata (iterable): dictionaries with keys: + data (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute @@ -44,7 +47,7 @@ yield id_ def get(self, ids): - """Retrieve metadata per id. + """Retrieve data per id. Args: ids (iterable): sha1 checksums @@ -53,8 +56,8 @@ dict: dictionaries with the following keys: - **id** (bytes) - - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata + - arbitrary data (as provided to `add`) """ for id_ in ids: @@ -62,35 +65,36 @@ key = (id_, tool_id) yield { 'id': id_, - 'tool': self._transform_tool(self._tools[tool_id]), - 'translated_metadata': self._metadata[key], + 'tool': _transform_tool(self._tools[tool_id]), + **self._data[key], } - def add(self, metadata, conflict_update): - """Add metadata not present in storage. + def add(self, data, conflict_update): + """Add data not present in storage. Args: - metadata (iterable): dictionaries with keys: + data (iterable): dictionaries with keys: - **id**: sha1 - - **translated_metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute the results + - arbitrary data conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false) """ - for item in metadata: - tool_id = item['indexer_configuration_id'] - data = item['translated_metadata'] - id_ = item['id'] + for item in data: + item = item.copy() + tool_id = item.pop('indexer_configuration_id') + id_ = item.pop('id') + data = item if not conflict_update and \ tool_id in self._tools_per_id.get(id_, set()): # Duplicate, should not be updated continue key = (id_, tool_id) - self._metadata[key] = data + self._data[key] = data self._tools_per_id[id_].add(tool_id) @@ -99,8 +103,123 @@ def __init__(self): self._tools = {} - self._content_metadata = MetadataStorage(self._tools) - self._revision_metadata = MetadataStorage(self._tools) + self._content_ctags = SubStorage( + self._tools, ['name', 'kind', 'lang']) + self._content_metadata = SubStorage(self._tools, ['metadata']) + self._revision_metadata = SubStorage(self._tools, ['metadata']) + + def content_ctags_missing(self, ctags): + """List ctags missing from storage. + + Args: + ctags (iterable): dicts with keys: + + - **id** (bytes): sha1 identifier + - **indexer_configuration_id** (int): tool used to compute + the results + + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) + + """ + yield from self._content_ctags.missing(ctags) + + def content_ctags_get(self, ids): + """Retrieve ctags per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + Dictionaries with keys: + + - **id** (bytes): content's identifier + - **name** (str): symbol's name + - **kind** (str): symbol's kind + - **lang** (str): language for that content + - **tool** (dict): tool used to compute the ctags' info + + + """ + for item in self._content_ctags.get(ids): + for item_ctags_item in item['ctags']: + yield { + 'id': item['id'], + 'tool': item['tool'], + **item_ctags_item + } + + def content_ctags_add(self, ctags, conflict_update=False): + """Add ctags not present in storage + + Args: + ctags (iterable): dictionaries with keys: + + - **id** (bytes): sha1 + - **ctags** ([list): List of dictionary with keys: name, kind, + line, lang + - **indexer_configuration_id**: tool used to compute the + results + + """ + for item in ctags: + tool_id = item['indexer_configuration_id'] + if conflict_update: + item_ctags = [] + else: + # TODO: this merges old ctags with new ctags. This is + # pointless, new ctags should replace the old ones. + existing = list(self._content_ctags.get([item['id']])) + item_ctags = [ + { + key: ctags_item[key] + for key in ('name', 'kind', 'line', 'lang') + } + for existing_item in existing + if existing_item['tool']['id'] == tool_id + for ctags_item in existing_item['ctags'] + ] + for new_item_ctags in item['ctags']: + if new_item_ctags not in item_ctags: + item_ctags.append(new_item_ctags) + self._content_ctags.add([ + { + 'id': item['id'], + 'indexer_configuration_id': tool_id, + 'ctags': item_ctags, + } + ], conflict_update=True) + + def content_ctags_search(self, expression, + limit=10, last_sha1=None, db=None, cur=None): + """Search through content's raw ctags symbols. + + Args: + expression (str): Expression to search for + limit (int): Number of rows to return (default to 10). + last_sha1 (str): Offset from which retrieving data (default to ''). + + Yields: + rows of ctags including id, name, lang, kind, line, etc... + + """ + nb_matches = 0 + for ((id_, tool_id), item) in \ + sorted(self._content_ctags._data.items()): + if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): + continue + nb_matches += 1 + for ctags_item in item['ctags']: + if ctags_item['name'] != expression: + continue + yield { + 'id': id_, + 'tool': _transform_tool(self._tools[tool_id]), + **ctags_item + } + if nb_matches >= limit: + return def content_metadata_missing(self, metadata): """List metadata missing from storage. diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py --- a/swh/indexer/tests/storage/test_in_memory.py +++ b/swh/indexer/tests/storage/test_in_memory.py @@ -50,30 +50,6 @@ pass @pytest.mark.xfail - def test_content_ctags_missing(self): - pass - - @pytest.mark.xfail - def test_content_ctags_get(self): - pass - - @pytest.mark.xfail - def test_content_ctags_search(self): - pass - - @pytest.mark.xfail - def test_content_ctags_search_no_result(self): - pass - - @pytest.mark.xfail - def test_content_ctags_add__add_new_ctags_added(self): - pass - - @pytest.mark.xfail - def test_content_ctags_add__update_in_place(self): - pass - - @pytest.mark.xfail def test_content_fossology_license_get(self): pass diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py --- a/swh/indexer/tests/test_ctags.py +++ b/swh/indexer/tests/test_ctags.py @@ -11,7 +11,7 @@ ) from swh.indexer.tests.test_utils import ( - BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest, + CommonContentIndexerTest, CommonIndexerWithErrorsTest, CommonIndexerNoTool, SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG ) @@ -99,12 +99,6 @@ 'workdir': '/nowhere', } - def prepare(self): - super().prepare() - self.idx_storage = BasicMockIndexerStorage() - self.objstorage = MockObjStorage() - self.tool_config = self.config['tools']['configuration'] - class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase): """Ctags indexer test scenarios: