diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -5,10 +5,12 @@ import json import subprocess -from typing import Any, Dict, List +from typing import Any, Dict, Iterator, List, Optional, Union from swh.core.config import merge_configs +from swh.indexer.storage.model import ContentCtagsRow from swh.model import hashutil +from swh.model.model import Revision from .indexer import ContentIndexer, write_to_temp @@ -30,7 +32,7 @@ ) -def run_ctags(path, lang=None, ctags_command="ctags"): +def run_ctags(path, lang=None, ctags_command="ctags") -> Iterator[Dict[str, Any]]: """Run ctags on file path with optional language. Args: @@ -74,7 +76,7 @@ } -class CtagsIndexer(ContentIndexer[Dict]): +class CtagsIndexer(ContentIndexer[ContentCtagsRow]): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = merge_configs(DEFAULT_CONFIG, self.config) @@ -89,7 +91,9 @@ ({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) ) - def index(self, id, data): + def index( + self, id: Union[bytes, Dict, Revision], data: Optional[bytes] = None, **kwargs + ) -> List[ContentCtagsRow]: """Index sha1s' content and store result. Args: @@ -103,41 +107,41 @@ - **ctags** ([dict]): ctags list of symbols """ + assert isinstance(id, bytes) + assert data is not None + lang = compute_language(data, log=self.log)["lang"] if not lang: - return None + return [] ctags_lang = self.language_map.get(lang) if not ctags_lang: - return None + return [] - ctags = { - "id": id, - } + ctags = [] filename = hashutil.hash_to_hex(id) with write_to_temp( filename=filename, data=data, working_directory=self.working_directory ) as content_path: - result = run_ctags(content_path, lang=ctags_lang) - ctags.update( - {"ctags": list(result), "indexer_configuration_id": self.tool["id"],} - ) + for ctag_kwargs in run_ctags(content_path, lang=ctags_lang): + ctags.append( + ContentCtagsRow( + id=id, indexer_configuration_id=self.tool["id"], **ctag_kwargs, + ) + ) - return [ctags] + return ctags def persist_index_computations( - self, results: List[Dict], policy_update: str + self, results: List[ContentCtagsRow], policy_update: str ) -> Dict[str, int]: """Persist the results in storage. Args: - results: list of content_mimetype, dict with the - following keys: - - id (bytes): content's identifier (sha1) - - ctags ([dict]): ctags list of symbols + results: list of ctags returned by index() policy_update: either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -5,7 +5,6 @@ from collections import Counter -import itertools import json from typing import Dict, Iterable, List, Optional, Tuple @@ -328,14 +327,20 @@ @timed @db_transaction() - def content_ctags_missing(self, ctags, db=None, cur=None): + def content_ctags_missing( + self, ctags: Iterable[Dict], db=None, cur=None + ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_ctags_missing_from_list(ctags, cur)] @timed @db_transaction() - def content_ctags_get(self, ids, db=None, cur=None): + def content_ctags_get( + self, ids: Iterable[Sha1], db=None, cur=None + ) -> List[ContentCtagsRow]: return [ - converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) + ContentCtagsRow.from_dict( + converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) + ) for c in db.content_ctags_get_from_list(ids, cur) ] @@ -343,15 +348,18 @@ @process_metrics @db_transaction() def content_ctags_add( - self, ctags: List[Dict], conflict_update: bool = False, db=None, cur=None + self, + ctags: List[ContentCtagsRow], + conflict_update: bool = False, + db=None, + cur=None, ) -> Dict[str, int]: - rows = list(itertools.chain.from_iterable(map(converters.ctags_to_db, ctags))) - check_id_duplicates(map(ContentCtagsRow.from_dict, rows)) - ctags.sort(key=lambda m: m["id"]) + check_id_duplicates(ctags) + ctags.sort(key=lambda m: m.id) db.mktemp_content_ctags(cur) db.copy_to( - rows, + [ctag.to_dict() for ctag in ctags], tblname="tmp_content_ctags", columns=["id", "name", "kind", "line", "lang", "indexer_configuration_id"], cur=cur, @@ -363,10 +371,17 @@ @timed @db_transaction() def content_ctags_search( - self, expression, limit=10, last_sha1=None, db=None, cur=None - ): + self, + expression: str, + limit: int = 10, + last_sha1: Optional[Sha1] = None, + db=None, + cur=None, + ) -> List[ContentCtagsRow]: return [ - converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) + ContentCtagsRow.from_dict( + converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) + ) for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur) ] diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -27,7 +27,7 @@ from swh.model.model import SHA1_SIZE, Sha1Git from swh.storage.utils import get_partition_bounds_bytes -from . import MAPPING_NAMES, check_id_duplicates, converters +from . import MAPPING_NAMES, check_id_duplicates from .exc import IndexerStorageArgumentException from .interface import PagedResult, Sha1 from .model import ( @@ -302,29 +302,21 @@ added = self._languages.add(languages, conflict_update) return {"content_language:add": added} - def content_ctags_missing(self, ctags): + def content_ctags_missing(self, ctags: Iterable[Dict]) -> List[Tuple[Sha1, int]]: return self._content_ctags.missing(ctags) - def content_ctags_get(self, ids): - results = [] - for item in self._content_ctags.get(ids): - results.append({"id": item.id, "tool": item.tool, **item.to_dict()}) - return results + def content_ctags_get(self, ids: Iterable[Sha1]) -> List[ContentCtagsRow]: + return self._content_ctags.get(ids) def content_ctags_add( - self, ctags: List[Dict], conflict_update: bool = False + self, ctags: List[ContentCtagsRow], conflict_update: bool = False ) -> Dict[str, int]: - check_id_types(ctags) - added = self._content_ctags.add( - map( - ContentCtagsRow.from_dict, - itertools.chain.from_iterable(map(converters.ctags_to_db, ctags)), - ), - conflict_update, - ) + added = self._content_ctags.add(ctags, conflict_update,) return {"content_ctags:add": added} - def content_ctags_search(self, expression, limit=10, last_sha1=None): + def content_ctags_search( + self, expression: str, limit: int = 10, last_sha1: Optional[Sha1] = None + ): nb_matches = 0 items_per_id: Dict[Tuple[Sha1Git, ToolId], List[ContentCtagsRow]] = {} for item in sorted(self._content_ctags.get_all()): @@ -336,21 +328,14 @@ results = [] for items in items_per_id.values(): - ctags = [] for item in items: if item.name != expression: continue nb_matches += 1 if nb_matches > limit: break - item_dict = item.to_dict() - id_ = item_dict.pop("id") - tool = item_dict.pop("tool") - ctags.append(item_dict) - - if ctags: - for ctag in ctags: - results.append({"id": id_, "tool": tool, **ctag}) + results.append(item) + return results def content_fossology_license_get( diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py --- a/swh/indexer/storage/interface.py +++ b/swh/indexer/storage/interface.py @@ -8,6 +8,7 @@ from swh.core.api import remote_api_endpoint from swh.core.api.classes import PagedResult as CorePagedResult from swh.indexer.storage.model import ( + ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMimetypeRow, @@ -159,7 +160,7 @@ ... @remote_api_endpoint("content/ctags/missing") - def content_ctags_missing(self, ctags): + def content_ctags_missing(self, ctags: Iterable[Dict]) -> List[Tuple[Sha1, int]]: """List ctags missing from storage. Args: @@ -169,28 +170,22 @@ - **indexer_configuration_id** (int): tool used to compute the results - Yields: - an iterable of missing id for the tuple (id, + Returns: + list of missing id for the tuple (id, indexer_configuration_id) """ ... @remote_api_endpoint("content/ctags") - def content_ctags_get(self, ids): + def content_ctags_get(self, ids: Iterable[Sha1]) -> List[ContentCtagsRow]: """Retrieve ctags per id. Args: ids (iterable): sha1 checksums - Yields: - Dictionaries with keys: - - - **id** (bytes): content's identifier - - **name** (str): symbol's name - - **kind** (str): symbol's kind - - **lang** (str): language for that content - - **tool** (dict): tool used to compute the ctags' info + Returns: + list of language rows """ @@ -198,7 +193,7 @@ @remote_api_endpoint("content/ctags/add") def content_ctags_add( - self, ctags: List[Dict], conflict_update: bool = False + self, ctags: List[ContentCtagsRow], conflict_update: bool = False ) -> Dict[str, int]: """Add ctags not present in storage @@ -216,7 +211,9 @@ ... @remote_api_endpoint("content/ctags/search") - def content_ctags_search(self, expression, limit=10, last_sha1=None): + def content_ctags_search( + self, expression: str, limit: int = 10, last_sha1: Optional[Sha1] = None + ) -> List[ContentCtagsRow]: """Search through content's raw ctags symbols. Args: @@ -224,7 +221,7 @@ limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). - Yields: + Returns: rows of ctags including id, name, lang, kind, line, etc... """ diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py --- a/swh/indexer/storage/model.py +++ b/swh/indexer/storage/model.py @@ -56,7 +56,8 @@ def unique_key(self) -> Dict: if self.indexer_configuration_id is None: raise ValueError( - "Can only call unique_key() on objects with indexer_configuration_id." + "Can only call unique_key() on objects without " + "indexer_configuration_id." ) return {key: getattr(self, key) for key in self.UNIQUE_KEY_FIELDS} diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -8,12 +8,14 @@ import threading from typing import Any, Dict, List, Tuple, Union +import attr import pytest from swh.indexer.storage.exc import DuplicateId, IndexerStorageArgumentException from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ( BaseRow, + ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMimetypeRow, @@ -575,18 +577,12 @@ endpoint_type = "content_ctags" tool_name = "universal-ctags" example_data = [ - { - "ctags": [ - {"name": "done", "kind": "variable", "line": 119, "lang": "OCaml",} - ] - }, - { - "ctags": [ - {"name": "done", "kind": "variable", "line": 100, "lang": "Python",}, - {"name": "main", "kind": "function", "line": 119, "lang": "Python",}, - ] - }, + {"name": "done", "kind": "variable", "line": 119, "lang": "OCaml",}, + {"name": "done", "kind": "variable", "line": 100, "lang": "Python",}, + {"name": "main", "kind": "function", "line": 119, "lang": "Python",}, ] + row_from_dict = ContentCtagsRow.from_dict + dict_from_row = staticmethod(lambda x: x.to_dict()) # type: ignore # the following tests are disabled because CTAGS behaves differently @pytest.mark.skip @@ -617,119 +613,76 @@ tool = data.tools["universal-ctags"] tool_id = tool["id"] - ctag1 = { - "id": data.sha1_1, - "indexer_configuration_id": tool_id, - "ctags": [ + ctags1 = [ + ContentCtagsRow( + id=data.sha1_1, + indexer_configuration_id=tool_id, + **kwargs, # type: ignore + ) + for kwargs in [ {"name": "hello", "kind": "function", "line": 133, "lang": "Python",}, {"name": "counter", "kind": "variable", "line": 119, "lang": "Python",}, {"name": "hello", "kind": "variable", "line": 210, "lang": "Python",}, - ], - } + ] + ] + ctags1_with_tool = [ + attr.evolve(ctag, indexer_configuration_id=None, tool=tool) + for ctag in ctags1 + ] - ctag2 = { - "id": data.sha1_2, - "indexer_configuration_id": tool_id, - "ctags": [ + ctags2 = [ + ContentCtagsRow( + id=data.sha1_2, + indexer_configuration_id=tool_id, + **kwargs, # type: ignore + ) + for kwargs in [ {"name": "hello", "kind": "variable", "line": 100, "lang": "C",}, {"name": "result", "kind": "variable", "line": 120, "lang": "C",}, - ], - } + ] + ] + ctags2_with_tool = [ + attr.evolve(ctag, indexer_configuration_id=None, tool=tool) + for ctag in ctags2 + ] - storage.content_ctags_add([ctag1, ctag2]) + storage.content_ctags_add(ctags1 + ctags2) # 1. when actual_ctags = list(storage.content_ctags_search("hello", limit=1)) # 1. then - assert actual_ctags == [ - { - "id": ctag1["id"], - "tool": tool, - "name": "hello", - "kind": "function", - "line": 133, - "lang": "Python", - } - ] + assert actual_ctags == [ctags1_with_tool[0]] # 2. when actual_ctags = list( - storage.content_ctags_search("hello", limit=1, last_sha1=ctag1["id"]) + storage.content_ctags_search("hello", limit=1, last_sha1=data.sha1_1) ) # 2. then - assert actual_ctags == [ - { - "id": ctag2["id"], - "tool": tool, - "name": "hello", - "kind": "variable", - "line": 100, - "lang": "C", - } - ] + assert actual_ctags == [ctags2_with_tool[0]] # 3. when actual_ctags = list(storage.content_ctags_search("hello")) # 3. then assert actual_ctags == [ - { - "id": ctag1["id"], - "tool": tool, - "name": "hello", - "kind": "function", - "line": 133, - "lang": "Python", - }, - { - "id": ctag1["id"], - "tool": tool, - "name": "hello", - "kind": "variable", - "line": 210, - "lang": "Python", - }, - { - "id": ctag2["id"], - "tool": tool, - "name": "hello", - "kind": "variable", - "line": 100, - "lang": "C", - }, + ctags1_with_tool[0], + ctags1_with_tool[2], + ctags2_with_tool[0], ] # 4. when actual_ctags = list(storage.content_ctags_search("counter")) # then - assert actual_ctags == [ - { - "id": ctag1["id"], - "tool": tool, - "name": "counter", - "kind": "variable", - "line": 119, - "lang": "Python", - } - ] + assert actual_ctags == [ctags1_with_tool[1]] # 5. when actual_ctags = list(storage.content_ctags_search("result", limit=1)) # then - assert actual_ctags == [ - { - "id": ctag2["id"], - "tool": tool, - "name": "result", - "kind": "variable", - "line": 120, - "lang": "C", - } - ] + assert actual_ctags == [ctags2_with_tool[1]] def test_content_ctags_search_no_result( self, swh_indexer_storage: IndexerStorageInterface @@ -748,69 +701,42 @@ tool = data.tools["universal-ctags"] tool_id = tool["id"] - ctag_v1 = { - "id": data.sha1_2, - "indexer_configuration_id": tool_id, - "ctags": [ - {"name": "done", "kind": "variable", "line": 100, "lang": "Scheme",} - ], - } + ctag1 = ContentCtagsRow( + id=data.sha1_2, + indexer_configuration_id=tool_id, + name="done", + kind="variable", + line=100, + lang="Scheme", + ) + ctag1_with_tool = attr.evolve(ctag1, indexer_configuration_id=None, tool=tool) # given - storage.content_ctags_add([ctag_v1]) - storage.content_ctags_add([ctag_v1]) # conflict does nothing + storage.content_ctags_add([ctag1]) + storage.content_ctags_add([ctag1]) # conflict does nothing # when actual_ctags = list(storage.content_ctags_get([data.sha1_2])) # then - expected_ctags = [ - { - "id": data.sha1_2, - "name": "done", - "kind": "variable", - "line": 100, - "lang": "Scheme", - "tool": tool, - } - ] - - assert actual_ctags == expected_ctags + assert actual_ctags == [ctag1_with_tool] # given - ctag_v2 = ctag_v1.copy() - ctag_v2.update( - { - "ctags": [ - {"name": "defn", "kind": "function", "line": 120, "lang": "Scheme",} - ] - } + ctag2 = ContentCtagsRow( + id=data.sha1_2, + indexer_configuration_id=tool_id, + name="defn", + kind="function", + line=120, + lang="Scheme", ) + ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool) - storage.content_ctags_add([ctag_v2]) - - expected_ctags = [ - { - "id": data.sha1_2, - "name": "done", - "kind": "variable", - "line": 100, - "lang": "Scheme", - "tool": tool, - }, - { - "id": data.sha1_2, - "name": "defn", - "kind": "function", - "line": 120, - "lang": "Scheme", - "tool": tool, - }, - ] + storage.content_ctags_add([ctag2]) actual_ctags = list(storage.content_ctags_get([data.sha1_2])) - assert actual_ctags == expected_ctags + assert actual_ctags == [ctag1_with_tool, ctag2_with_tool] def test_content_ctags_add__update_in_place( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] @@ -820,89 +746,49 @@ tool = data.tools["universal-ctags"] tool_id = tool["id"] - ctag_v1 = { - "id": data.sha1_2, - "indexer_configuration_id": tool_id, - "ctags": [ - {"name": "done", "kind": "variable", "line": 100, "lang": "Scheme",} - ], - } + ctag1 = ContentCtagsRow( + id=data.sha1_2, + indexer_configuration_id=tool_id, + name="done", + kind="variable", + line=100, + lang="Scheme", + ) + ctag1_with_tool = attr.evolve(ctag1, indexer_configuration_id=None, tool=tool) # given - storage.content_ctags_add([ctag_v1]) + storage.content_ctags_add([ctag1]) # when actual_ctags = list(storage.content_ctags_get([data.sha1_2])) # then - expected_ctags = [ - { - "id": data.sha1_2, - "name": "done", - "kind": "variable", - "line": 100, - "lang": "Scheme", - "tool": tool, - } - ] - assert actual_ctags == expected_ctags + assert actual_ctags == [ctag1_with_tool] # given - ctag_v2 = ctag_v1.copy() - ctag_v2.update( - { - "ctags": [ - { - "name": "done", - "kind": "variable", - "line": 100, - "lang": "Scheme", - }, - { - "name": "defn", - "kind": "function", - "line": 120, - "lang": "Scheme", - }, - ] - } + ctag2 = ContentCtagsRow( + id=data.sha1_2, + indexer_configuration_id=tool_id, + name="defn", + kind="function", + line=120, + lang="Scheme", ) + ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool) - storage.content_ctags_add([ctag_v2], conflict_update=True) + storage.content_ctags_add([ctag1, ctag2], conflict_update=True) actual_ctags = list(storage.content_ctags_get([data.sha1_2])) - # ctag did change as the v2 was used to overwrite v1 - expected_ctags = [ - { - "id": data.sha1_2, - "name": "done", - "kind": "variable", - "line": 100, - "lang": "Scheme", - "tool": tool, - }, - { - "id": data.sha1_2, - "name": "defn", - "kind": "function", - "line": 120, - "lang": "Scheme", - "tool": tool, - }, - ] - assert actual_ctags == expected_ctags + assert actual_ctags == [ctag1_with_tool, ctag2_with_tool] def test_add_empty( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: (storage, data) = swh_indexer_storage_with_data etype = self.endpoint_type - tool = data.tools[self.tool_name] - summary = endpoint(storage, etype, "add")( - [{"id": data.sha1_2, "indexer_configuration_id": tool["id"], "ctags": [],}] - ) + summary = endpoint(storage, etype, "add")([]) assert summary == {"content_ctags:add": 0} actual_ctags = list(endpoint(storage, etype, "get")([data.sha1_2])) diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py --- a/swh/indexer/tests/test_ctags.py +++ b/swh/indexer/tests/test_ctags.py @@ -11,6 +11,7 @@ import swh.indexer.ctags from swh.indexer.ctags import CtagsIndexer, run_ctags +from swh.indexer.storage.model import ContentCtagsRow from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, OBJ_STORAGE_DATA, @@ -107,21 +108,18 @@ tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} self.expected_results = [ - { - "id": hash_to_bytes(self.id0), - "tool": tool, - **SHA1_TO_CTAGS[self.id0][0], - }, - { - "id": hash_to_bytes(self.id1), - "tool": tool, - **SHA1_TO_CTAGS[self.id1][0], - }, - { - "id": hash_to_bytes(self.id2), - "tool": tool, - **SHA1_TO_CTAGS[self.id2][0], - }, + *[ + ContentCtagsRow(id=hash_to_bytes(self.id0), tool=tool, **kwargs,) + for kwargs in SHA1_TO_CTAGS[self.id0] + ], + *[ + ContentCtagsRow(id=hash_to_bytes(self.id1), tool=tool, **kwargs,) + for kwargs in SHA1_TO_CTAGS[self.id1] + ], + *[ + ContentCtagsRow(id=hash_to_bytes(self.id2), tool=tool, **kwargs,) + for kwargs in SHA1_TO_CTAGS[self.id2] + ], ] self._set_mocks()