diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py deleted file mode 100644 --- a/swh/indexer/ctags.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (C) 2015-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import json -import subprocess -from typing import Any, Dict, Iterator, List, Optional - -from swh.core.config import merge_configs -from swh.indexer.storage import Sha1 -from swh.indexer.storage.model import ContentCtagsRow -from swh.model import hashutil - -from .indexer import ContentIndexer, write_to_temp - -# Options used to compute tags -__FLAGS = [ - "--fields=+lnz", # +l: language - # +n: line number of tag definition - # +z: include the symbol's kind (function, variable, ...) - "--sort=no", # sort output on tag name - "--links=no", # do not follow symlinks - "--output-format=json", # outputs in json -] - - -def compute_language(content, log=None): - raise NotImplementedError( - "Language detection was unreliable, so it is currently disabled. " - "See https://forge.softwareheritage.org/D1455" - ) - - -def run_ctags(path, lang=None, ctags_command="ctags") -> Iterator[Dict[str, Any]]: - """Run ctags on file path with optional language. 
- - Args: - path: path to the file - lang: language for that path (optional) - - Yields: - dict: ctags' output - - """ - optional = [] - if lang: - optional = ["--language-force=%s" % lang] - - cmd = [ctags_command] + __FLAGS + optional + [path] - output = subprocess.check_output(cmd, universal_newlines=True) - - for symbol in output.split("\n"): - if not symbol: - continue - js_symbol = json.loads(symbol) - yield { - "name": js_symbol["name"], - "kind": js_symbol["kind"], - "line": js_symbol["line"], - "lang": js_symbol["language"], - } - - -DEFAULT_CONFIG: Dict[str, Any] = { - "workdir": "/tmp/swh/indexer.ctags", - "tools": { - "name": "universal-ctags", - "version": "~git7859817b", - "configuration": { - "command_line": """ctags --fields=+lnz --sort=no --links=no """ - """--output-format=json """ - }, - }, - "languages": {}, -} - - -class CtagsIndexer(ContentIndexer[ContentCtagsRow]): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.config = merge_configs(DEFAULT_CONFIG, self.config) - self.working_directory = self.config["workdir"] - self.language_map = self.config["languages"] - - def filter(self, ids): - """Filter out known sha1s and return only missing ones.""" - yield from self.idx_storage.content_ctags_missing( - ( - { - "id": sha1, - "indexer_configuration_id": self.tool["id"], - } - for sha1 in ids - ) - ) - - def index( - self, id: Sha1, data: Optional[bytes] = None, **kwargs - ) -> List[ContentCtagsRow]: - """Index sha1s' content and store result. 
- - Args: - id (bytes): content's identifier - data (bytes): raw content in bytes - - Returns: - dict: a dict representing a content_mimetype with keys: - - - **id** (bytes): content's identifier (sha1) - - **ctags** ([dict]): ctags list of symbols - - """ - assert isinstance(id, bytes) - assert data is not None - - lang = compute_language(data, log=self.log)["lang"] - - if not lang: - return [] - - ctags_lang = self.language_map.get(lang) - - if not ctags_lang: - return [] - - ctags = [] - - filename = hashutil.hash_to_hex(id) - with write_to_temp( - filename=filename, data=data, working_directory=self.working_directory - ) as content_path: - for ctag_kwargs in run_ctags(content_path, lang=ctags_lang): - ctags.append( - ContentCtagsRow( - id=id, - indexer_configuration_id=self.tool["id"], - **ctag_kwargs, - ) - ) - - return ctags - - def persist_index_computations( - self, results: List[ContentCtagsRow] - ) -> Dict[str, int]: - """Persist the results in storage. - - Args: - results: list of ctags returned by index() - - """ - return self.idx_storage.content_ctags_add(results) diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -25,7 +25,6 @@ from .interface import PagedResult, Sha1 from .metrics import process_metrics, send_metric, timed from .model import ( - ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMetadataRow, @@ -354,66 +353,6 @@ count = db.content_language_add_from_temp(cur) return {"content_language:add": count} - @timed - @db_transaction() - def content_ctags_missing( - self, ctags: Iterable[Dict], db=None, cur=None - ) -> List[Tuple[Sha1, int]]: - return [obj[0] for obj in db.content_ctags_missing_from_list(ctags, cur)] - - @timed - @db_transaction() - def content_ctags_get( - self, ids: Iterable[Sha1], db=None, cur=None - ) -> List[ContentCtagsRow]: - return [ - ContentCtagsRow.from_dict( - 
converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) - ) - for c in db.content_ctags_get_from_list(ids, cur) - ] - - @timed - @process_metrics - @db_transaction() - def content_ctags_add( - self, - ctags: List[ContentCtagsRow], - db=None, - cur=None, - ) -> Dict[str, int]: - check_id_duplicates(ctags) - ctags.sort(key=lambda m: m.id) - self.journal_writer.write_additions("content_ctags", ctags) - - db.mktemp_content_ctags(cur) - db.copy_to( - [ctag.to_dict() for ctag in ctags], - tblname="tmp_content_ctags", - columns=["id", "name", "kind", "line", "lang", "indexer_configuration_id"], - cur=cur, - ) - - count = db.content_ctags_add_from_temp(cur) - return {"content_ctags:add": count} - - @timed - @db_transaction() - def content_ctags_search( - self, - expression: str, - limit: int = 10, - last_sha1: Optional[Sha1] = None, - db=None, - cur=None, - ) -> List[ContentCtagsRow]: - return [ - ContentCtagsRow.from_dict( - converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) - ) - for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur) - ] - @timed @db_transaction() def content_fossology_license_get( diff --git a/swh/indexer/storage/converters.py b/swh/indexer/storage/converters.py --- a/swh/indexer/storage/converters.py +++ b/swh/indexer/storage/converters.py @@ -1,87 +1,11 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -def ctags_to_db(ctags): - """Convert a ctags entry into a ready ctags entry. 
- - Args: - ctags (dict): ctags entry with the following keys: - - - id (bytes): content's identifier - - tool_id (int): tool id used to compute ctags - - ctags ([dict]): List of dictionary with the following keys: - - - name (str): symbol's name - - kind (str): symbol's kind - - line (int): symbol's line in the content - - language (str): language - - Returns: - list: list of ctags entries as dicts with the following keys: - - - id (bytes): content's identifier - - name (str): symbol's name - - kind (str): symbol's kind - - language (str): language for that content - - tool_id (int): tool id used to compute ctags - - """ - id = ctags["id"] - tool_id = ctags["indexer_configuration_id"] - for ctag in ctags["ctags"]: - yield { - "id": id, - "name": ctag["name"], - "kind": ctag["kind"], - "line": ctag["line"], - "lang": ctag["lang"], - "indexer_configuration_id": tool_id, - } - - -def db_to_ctags(ctag): - """Convert a ctags entry into a ready ctags entry. - - Args: - ctags (dict): ctags entry with the following keys: - - - id (bytes): content's identifier - - ctags ([dict]): List of dictionary with the following keys: - - name (str): symbol's name - - kind (str): symbol's kind - - line (int): symbol's line in the content - - language (str): language - - Returns: - list: list of ctags ready entry (dict with the following keys): - - - id (bytes): content's identifier - - name (str): symbol's name - - kind (str): symbol's kind - - language (str): language for that content - - tool (dict): tool used to compute the ctags - - """ - return { - "id": ctag["id"], - "name": ctag["name"], - "kind": ctag["kind"], - "line": ctag["line"], - "lang": ctag["lang"], - "tool": { - "id": ctag["tool_id"], - "name": ctag["tool_name"], - "version": ctag["tool_version"], - "configuration": ctag["tool_configuration"], - }, - } - - def db_to_mimetype(mimetype): - """Convert a ctags entry into a ready ctags output.""" + """Convert a mimetype entry into a ready mimetype output.""" return { "id": 
mimetype["id"], "encoding": mimetype["encoding"], diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -7,7 +7,6 @@ from swh.core.db import BaseDb from swh.core.db.db_utils import execute_values_generator, stored_procedure -from swh.model import hashutil from .interface import Sha1 @@ -213,75 +212,6 @@ "content_language", ids, self.content_language_cols, cur=cur ) - content_ctags_hash_keys = ["id", "indexer_configuration_id"] - - def content_ctags_missing_from_list(self, ctags, cur=None): - """List missing ctags.""" - yield from self._missing_from_list( - "content_ctags", ctags, self.content_ctags_hash_keys, cur=cur - ) - - content_ctags_cols = [ - "id", - "name", - "kind", - "line", - "lang", - "tool_id", - "tool_name", - "tool_version", - "tool_configuration", - ] - - @stored_procedure("swh_mktemp_content_ctags") - def mktemp_content_ctags(self, cur=None): - pass - - def content_ctags_add_from_temp(self, cur=None): - cur = self._cursor(cur) - cur.execute("select * from swh_content_ctags_add()") - return cur.fetchone()[0] - - def content_ctags_get_from_list(self, ids, cur=None): - cur = self._cursor(cur) - keys = map(self._convert_key, self.content_ctags_cols) - yield from execute_values_generator( - cur, - """ - select %s - from (values %%s) as t(id) - inner join content_ctags c - on c.id=t.id - inner join indexer_configuration i - on c.indexer_configuration_id=i.id - order by line - """ - % ", ".join(keys), - ((_id,) for _id in ids), - ) - - def content_ctags_search(self, expression, last_sha1, limit, cur=None): - cur = self._cursor(cur) - if not last_sha1: - query = """SELECT %s - FROM swh_content_ctags_search(%%s, %%s)""" % ( - ",".join(self.content_ctags_cols) - ) - cur.execute(query, (expression, limit)) - else: - if last_sha1 and isinstance(last_sha1, bytes): - last_sha1 = "\\x%s" % hashutil.hash_to_hex(last_sha1) - elif last_sha1: - last_sha1 = "\\x%s" % last_sha1 - - query = 
"""SELECT %s - FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( - ",".join(self.content_ctags_cols) - ) - cur.execute(query, (expression, limit, last_sha1)) - - yield from cur - content_fossology_license_cols = [ "id", "tool_id", diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -25,7 +25,7 @@ from swh.core.collections import SortedList from swh.model.hashutil import hash_to_bytes, hash_to_hex -from swh.model.model import SHA1_SIZE, Sha1Git +from swh.model.model import SHA1_SIZE from swh.storage.utils import get_partition_bounds_bytes from . import MAPPING_NAMES, check_id_duplicates @@ -33,7 +33,6 @@ from .interface import PagedResult, Sha1 from .model import ( BaseRow, - ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMetadataRow, @@ -248,7 +247,6 @@ args = (self._tools, self.journal_writer) self._mimetypes = SubStorage(ContentMimetypeRow, *args) self._languages = SubStorage(ContentLanguageRow, *args) - self._content_ctags = SubStorage(ContentCtagsRow, *args) self._licenses = SubStorage(ContentLicenseRow, *args) self._content_metadata = SubStorage(ContentMetadataRow, *args) self._directory_intrinsic_metadata = SubStorage( @@ -300,40 +298,6 @@ added = self._languages.add(languages) return {"content_language:add": added} - def content_ctags_missing(self, ctags: Iterable[Dict]) -> List[Tuple[Sha1, int]]: - return self._content_ctags.missing(ctags) - - def content_ctags_get(self, ids: Iterable[Sha1]) -> List[ContentCtagsRow]: - return self._content_ctags.get(ids) - - def content_ctags_add(self, ctags: 
List[ContentCtagsRow]) -> Dict[str, int]: - added = self._content_ctags.add(ctags) - return {"content_ctags:add": added} - - def content_ctags_search( - self, expression: str, limit: int = 10, last_sha1: Optional[Sha1] = None - ) -> List[ContentCtagsRow]: - nb_matches = 0 - items_per_id: Dict[Tuple[Sha1Git, ToolId], List[ContentCtagsRow]] = {} - for item in sorted(self._content_ctags.get_all()): - if item.id <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): - continue - items_per_id.setdefault( - (item.id, item.indexer_configuration_id), [] - ).append(item) - - results = [] - for items in items_per_id.values(): - for item in items: - if item.name != expression: - continue - nb_matches += 1 - if nb_matches > limit: - break - results.append(item) - - return results - def content_fossology_license_get( self, ids: Iterable[Sha1] ) -> List[ContentLicenseRow]: diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py --- a/swh/indexer/storage/interface.py +++ b/swh/indexer/storage/interface.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,7 +10,6 @@ from swh.core.api import remote_api_endpoint from swh.core.api.classes import PagedResult as CorePagedResult from swh.indexer.storage.model import ( - ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMetadataRow, @@ -161,72 +160,6 @@ """ ... - @remote_api_endpoint("content/ctags/missing") - def content_ctags_missing(self, ctags: Iterable[Dict]) -> List[Tuple[Sha1, int]]: - """List ctags missing from storage. 
- - Args: - ctags (iterable): dicts with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Returns: - list of missing id for the tuple (id, - indexer_configuration_id) - - """ - ... - - @remote_api_endpoint("content/ctags") - def content_ctags_get(self, ids: Iterable[Sha1]) -> List[ContentCtagsRow]: - """Retrieve ctags per id. - - Args: - ids (iterable): sha1 checksums - - Returns: - list of language rows - - - """ - ... - - @remote_api_endpoint("content/ctags/add") - def content_ctags_add(self, ctags: List[ContentCtagsRow]) -> Dict[str, int]: - """Add ctags not present in storage - - Args: - ctags (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **ctags** ([list): List of dictionary with keys: name, kind, - line, lang - - Returns: - Dict summary of number of rows added - - """ - ... - - @remote_api_endpoint("content/ctags/search") - def content_ctags_search( - self, expression: str, limit: int = 10, last_sha1: Optional[Sha1] = None - ) -> List[ContentCtagsRow]: - """Search through content's raw ctags symbols. - - Args: - expression (str): Expression to search for - limit (int): Number of rows to return (default to 10). - last_sha1 (str): Offset from which retrieving data (default to ''). - - Returns: - rows of ctags including id, name, lang, kind, line, etc... - - """ - ... 
- @remote_api_endpoint("content/fossology_license") def content_fossology_license_get( self, ids: Iterable[Sha1] diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py --- a/swh/indexer/storage/model.py +++ b/swh/indexer/storage/model.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -83,25 +83,6 @@ lang = attr.ib(type=str) -@attr.s -class ContentCtagsRow(BaseRow): - object_type: Final = "content_ctags" - UNIQUE_KEY_FIELDS = ( - "id", - "indexer_configuration_id", - "name", - "kind", - "line", - "lang", - ) - - id = attr.ib(type=Sha1Git) - name = attr.ib(type=str) - kind = attr.ib(type=str) - line = attr.ib(type=int) - lang = attr.ib(type=str) - - @attr.s class ContentLicenseRow(BaseRow): object_type: Final = "content_fossology_license" diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py --- a/swh/indexer/tasks.py +++ b/swh/indexer/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 The Software Heritage developers +# Copyright (C) 2016-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,7 +6,6 @@ from celery import shared_task -from .ctags import CtagsIndexer from .fossology_license import FossologyLicenseIndexer, FossologyLicensePartitionIndexer from .metadata import OriginMetadataIndexer from .mimetype import MimetypeIndexer, MimetypePartitionIndexer @@ -18,11 +17,6 @@ return OriginMetadataIndexer().run(*args, **kwargs) -@shared_task(name=__name__ + ".Ctags") -def ctags(*args, **kwargs): - return CtagsIndexer().run(*args, **kwargs) - - @shared_task(name=__name__ + 
".ContentFossologyLicense") def fossology_license(*args, **kwargs): return FossologyLicenseIndexer().run(*args, **kwargs) diff --git a/swh/indexer/tests/storage/generate_data_test.py b/swh/indexer/tests/storage/generate_data_test.py --- a/swh/indexer/tests/storage/generate_data_test.py +++ b/swh/indexer/tests/storage/generate_data_test.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -81,14 +81,6 @@ TOOLS = [ - { - "tool_name": "universal-ctags", - "tool_version": "~git7859817b", - "tool_configuration": { - "command_line": "ctags --fields=+lnz --sort=no --links=no " - "--output-format=json " - }, - }, { "tool_name": "swh-metadata-translator", "tool_version": "0.0.1", diff --git a/swh/indexer/tests/storage/test_converters.py b/swh/indexer/tests/storage/test_converters.py --- a/swh/indexer/tests/storage/test_converters.py +++ b/swh/indexer/tests/storage/test_converters.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,85 +6,6 @@ from swh.indexer.storage import converters -def test_ctags_to_db() -> None: - input_ctag = { - "id": b"some-id", - "indexer_configuration_id": 100, - "ctags": [ - { - "name": "some-name", - "kind": "some-kind", - "line": 10, - "lang": "Yaml", - }, - { - "name": "main", - "kind": "function", - "line": 12, - "lang": "Yaml", - }, - ], - } - - expected_ctags = [ - { - "id": b"some-id", - "name": "some-name", - "kind": "some-kind", - "line": 10, - "lang": "Yaml", - 
"indexer_configuration_id": 100, - }, - { - "id": b"some-id", - "name": "main", - "kind": "function", - "line": 12, - "lang": "Yaml", - "indexer_configuration_id": 100, - }, - ] - - # when - actual_ctags = list(converters.ctags_to_db(input_ctag)) - - # then - assert actual_ctags == expected_ctags - - -def test_db_to_ctags() -> None: - input_ctags = { - "id": b"some-id", - "name": "some-name", - "kind": "some-kind", - "line": 10, - "lang": "Yaml", - "tool_id": 200, - "tool_name": "some-toolname", - "tool_version": "some-toolversion", - "tool_configuration": {}, - } - expected_ctags = { - "id": b"some-id", - "name": "some-name", - "kind": "some-kind", - "line": 10, - "lang": "Yaml", - "tool": { - "id": 200, - "name": "some-toolname", - "version": "some-toolversion", - "configuration": {}, - }, - } - - # when - actual_ctags = converters.db_to_ctags(input_ctags) - - # then - assert actual_ctags == expected_ctags - - def test_db_to_mimetype() -> None: input_mimetype = { "id": b"some-id", diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,7 +14,6 @@ from swh.indexer.storage.interface import IndexerStorageInterface, PagedResult from swh.indexer.storage.model import ( BaseRow, - ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, ContentMetadataRow, @@ -520,268 +519,6 @@ row_class = ContentLanguageRow -class TestIndexerStorageContentCTags(StorageETypeTester): - """Test Indexer Storage content_ctags related methods""" - - endpoint_type = "content_ctags" - tool_name = "universal-ctags" - example_data = 
[ - { - "name": "done", - "kind": "variable", - "line": 119, - "lang": "OCaml", - }, - { - "name": "done", - "kind": "variable", - "line": 100, - "lang": "Python", - }, - { - "name": "main", - "kind": "function", - "line": 119, - "lang": "Python", - }, - ] - row_class = ContentCtagsRow - - # the following tests are disabled because CTAGS behaves differently - @pytest.mark.skip - def test_add__update_in_place_duplicate(self): - pass - - @pytest.mark.skip - def test_add_deadlock(self): - pass - - def test_content_ctags_search( - self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] - ) -> None: - storage, data = swh_indexer_storage_with_data - # 1. given - tool = data.tools["universal-ctags"] - tool_id = tool["id"] - - ctags1 = [ - ContentCtagsRow( - id=data.sha1_1, - indexer_configuration_id=tool_id, - **kwargs, # type: ignore - ) - for kwargs in [ - { - "name": "hello", - "kind": "function", - "line": 133, - "lang": "Python", - }, - { - "name": "counter", - "kind": "variable", - "line": 119, - "lang": "Python", - }, - { - "name": "hello", - "kind": "variable", - "line": 210, - "lang": "Python", - }, - ] - ] - ctags1_with_tool = [ - attr.evolve(ctag, indexer_configuration_id=None, tool=tool) - for ctag in ctags1 - ] - - ctags2 = [ - ContentCtagsRow( - id=data.sha1_2, - indexer_configuration_id=tool_id, - **kwargs, # type: ignore - ) - for kwargs in [ - { - "name": "hello", - "kind": "variable", - "line": 100, - "lang": "C", - }, - { - "name": "result", - "kind": "variable", - "line": 120, - "lang": "C", - }, - ] - ] - ctags2_with_tool = [ - attr.evolve(ctag, indexer_configuration_id=None, tool=tool) - for ctag in ctags2 - ] - - storage.content_ctags_add(ctags1 + ctags2) - - # 1. when - actual_ctags = list(storage.content_ctags_search("hello", limit=1)) - - # 1. then - assert actual_ctags == [ctags1_with_tool[0]] - - # 2. when - actual_ctags = list( - storage.content_ctags_search("hello", limit=1, last_sha1=data.sha1_1) - ) - - # 2. 
then - assert actual_ctags == [ctags2_with_tool[0]] - - # 3. when - actual_ctags = list(storage.content_ctags_search("hello")) - - # 3. then - assert actual_ctags == [ - ctags1_with_tool[0], - ctags1_with_tool[2], - ctags2_with_tool[0], - ] - - # 4. when - actual_ctags = list(storage.content_ctags_search("counter")) - - # then - assert actual_ctags == [ctags1_with_tool[1]] - - # 5. when - actual_ctags = list(storage.content_ctags_search("result", limit=1)) - - # then - assert actual_ctags == [ctags2_with_tool[1]] - - def test_content_ctags_search_no_result( - self, swh_indexer_storage: IndexerStorageInterface - ) -> None: - storage = swh_indexer_storage - actual_ctags = list(storage.content_ctags_search("counter")) - - assert not actual_ctags - - def test_content_ctags_add__add_new_ctags_added( - self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] - ) -> None: - storage, data = swh_indexer_storage_with_data - - # given - tool = data.tools["universal-ctags"] - tool_id = tool["id"] - - ctag1 = ContentCtagsRow( - id=data.sha1_2, - indexer_configuration_id=tool_id, - name="done", - kind="variable", - line=100, - lang="Scheme", - ) - ctag1_with_tool = attr.evolve(ctag1, indexer_configuration_id=None, tool=tool) - - # given - storage.content_ctags_add([ctag1]) - storage.content_ctags_add([ctag1]) # conflict does nothing - - # when - actual_ctags = list(storage.content_ctags_get([data.sha1_2])) - - # then - assert actual_ctags == [ctag1_with_tool] - - # given - ctag2 = ContentCtagsRow( - id=data.sha1_2, - indexer_configuration_id=tool_id, - name="defn", - kind="function", - line=120, - lang="Scheme", - ) - ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool) - - storage.content_ctags_add([ctag2]) - - actual_ctags = list(storage.content_ctags_get([data.sha1_2])) - - assert actual_ctags == [ctag1_with_tool, ctag2_with_tool] - - def test_content_ctags_add__update_in_place( - self, swh_indexer_storage_with_data: 
Tuple[IndexerStorageInterface, Any] - ) -> None: - storage, data = swh_indexer_storage_with_data - # given - tool = data.tools["universal-ctags"] - tool_id = tool["id"] - - ctag1 = ContentCtagsRow( - id=data.sha1_2, - indexer_configuration_id=tool_id, - name="done", - kind="variable", - line=100, - lang="Scheme", - ) - ctag1_with_tool = attr.evolve(ctag1, indexer_configuration_id=None, tool=tool) - - # given - storage.content_ctags_add([ctag1]) - - # when - actual_ctags = list(storage.content_ctags_get([data.sha1_2])) - - # then - assert actual_ctags == [ctag1_with_tool] - - # given - ctag2 = ContentCtagsRow( - id=data.sha1_2, - indexer_configuration_id=tool_id, - name="defn", - kind="function", - line=120, - lang="Scheme", - ) - ctag2_with_tool = attr.evolve(ctag2, indexer_configuration_id=None, tool=tool) - - storage.content_ctags_add([ctag1, ctag2]) - - actual_ctags = list(storage.content_ctags_get([data.sha1_2])) - - assert actual_ctags == [ctag1_with_tool, ctag2_with_tool] - - def test_add_empty( - self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] - ) -> None: - (storage, data) = swh_indexer_storage_with_data - etype = self.endpoint_type - - summary = endpoint(storage, etype, "add")([]) - assert summary == {"content_ctags:add": 0} - - actual_ctags = list(endpoint(storage, etype, "get")([data.sha1_2])) - - assert actual_ctags == [] - - def test_get_unknown( - self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] - ) -> None: - (storage, data) = swh_indexer_storage_with_data - etype = self.endpoint_type - - actual_ctags = list(endpoint(storage, etype, "get")([data.sha1_2])) - - assert actual_ctags == [] - - class TestIndexerStorageContentMetadata(StorageETypeTester): """Test Indexer Storage content_metadata related methods""" diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py deleted file mode 100644 --- a/swh/indexer/tests/test_ctags.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (C) 
2017-2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import json -import unittest -from unittest.mock import patch - -import pytest - -import swh.indexer.ctags -from swh.indexer.ctags import CtagsIndexer, run_ctags -from swh.indexer.storage.model import ContentCtagsRow -from swh.indexer.tests.utils import ( - BASE_TEST_CONFIG, - OBJ_STORAGE_DATA, - RAW_CONTENT_IDS, - SHA1_TO_CTAGS, - CommonContentIndexerTest, - fill_obj_storage, - fill_storage, - filter_dict, -) -from swh.model.hashutil import hash_to_bytes - - -class BasicTest(unittest.TestCase): - @patch("swh.indexer.ctags.subprocess") - def test_run_ctags(self, mock_subprocess): - """Computing licenses from a raw content should return results""" - output0 = """ -{"name":"defun","kind":"function","line":1,"language":"scheme"} -{"name":"name","kind":"symbol","line":5,"language":"else"}""" - output1 = """ -{"name":"let","kind":"var","line":10,"language":"something"}""" - - expected_result0 = [ - {"name": "defun", "kind": "function", "line": 1, "lang": "scheme"}, - {"name": "name", "kind": "symbol", "line": 5, "lang": "else"}, - ] - - expected_result1 = [ - {"name": "let", "kind": "var", "line": 10, "lang": "something"} - ] - for path, lang, intermediary_result, expected_result in [ - (b"some/path", "lisp", output0, expected_result0), - (b"some/path/2", "markdown", output1, expected_result1), - ]: - mock_subprocess.check_output.return_value = intermediary_result - actual_result = list(run_ctags(path, lang=lang)) - self.assertEqual(actual_result, expected_result) - - -class InjectCtagsIndexer: - """Override ctags computations.""" - - def compute_ctags(self, path, lang): - """Inject fake ctags given path (sha1 identifier).""" - return {"lang": lang, **SHA1_TO_CTAGS.get(path)} - - -CONFIG = { - **BASE_TEST_CONFIG, - "tools": { - 
"name": "universal-ctags", - "version": "~git7859817b", - "configuration": { - "command_line": """ctags --fields=+lnz --sort=no """ - """ --links=no """, - "max_content_size": 1000, - }, - }, - "languages": { - "python": "python", - "haskell": "haskell", - "bar": "bar", - }, - "workdir": "/tmp", -} - - -class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase): - """Ctags indexer test scenarios: - - - Known sha1s in the input list have their data indexed - - Unknown sha1 in the input list are not indexed - - """ - - def get_indexer_results(self, ids): - yield from self.idx_storage.content_ctags_get(ids) - - def setUp(self): - super().setUp() - self.indexer = CtagsIndexer(config=CONFIG) - self.indexer.catch_exceptions = False - self.idx_storage = self.indexer.idx_storage - fill_storage(self.indexer.storage) - fill_obj_storage(self.indexer.objstorage) - - # Prepare test input - self.id0, self.id1, self.id2 = RAW_CONTENT_IDS - - tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} - - self.expected_results = [ - *[ - ContentCtagsRow( - id=self.id0, - tool=tool, - **kwargs, - ) - for kwargs in SHA1_TO_CTAGS[self.id0] - ], - *[ - ContentCtagsRow( - id=self.id1, - tool=tool, - **kwargs, - ) - for kwargs in SHA1_TO_CTAGS[self.id1] - ], - *[ - ContentCtagsRow( - id=self.id2, - tool=tool, - **kwargs, - ) - for kwargs in SHA1_TO_CTAGS[self.id2] - ], - ] - - self._set_mocks() - - def _set_mocks(self): - def find_ctags_for_content(raw_content): - for (sha1, ctags) in SHA1_TO_CTAGS.items(): - if OBJ_STORAGE_DATA[hash_to_bytes(sha1)] == raw_content: - return ctags - else: - raise ValueError( - ("%r not found in objstorage, can't mock its ctags.") % raw_content - ) - - def fake_language(raw_content, *args, **kwargs): - ctags = find_ctags_for_content(raw_content) - return {"lang": ctags[0]["lang"]} - - self._real_compute_language = swh.indexer.ctags.compute_language - swh.indexer.ctags.compute_language = fake_language - - def fake_check_output(cmd, 
*args, **kwargs): - id_ = cmd[-1].split("/")[-1] - return "\n".join( - json.dumps({"language": ctag["lang"], **ctag}) - for ctag in SHA1_TO_CTAGS[hash_to_bytes(id_)] - ) - - self._real_check_output = swh.indexer.ctags.subprocess.check_output - swh.indexer.ctags.subprocess.check_output = fake_check_output - - def tearDown(self): - swh.indexer.ctags.compute_language = self._real_compute_language - swh.indexer.ctags.subprocess.check_output = self._real_check_output - super().tearDown() - - -def test_ctags_w_no_tool(): - with pytest.raises(ValueError): - CtagsIndexer(config=filter_dict(CONFIG, "tools")) diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -241,34 +241,6 @@ } -SHA1_TO_CTAGS: Dict[bytes, List[Dict[str, Any]]] = { - RAW_CONTENT_IDS[0]: [ - { - "name": "foo", - "kind": "str", - "line": 10, - "lang": "bar", - } - ], - RAW_CONTENT_IDS[1]: [ - { - "name": "symbol", - "kind": "float", - "line": 99, - "lang": "python", - } - ], - RAW_CONTENT_IDS[2]: [ - { - "name": "let", - "kind": "int", - "line": 100, - "lang": "haskell", - } - ], -} - - DIRECTORY = Directory( entries=( DirectoryEntry( diff --git a/swh/indexer/tests/zz_celery/test_tasks.py b/swh/indexer/tests/zz_celery/test_tasks.py --- a/swh/indexer/tests/zz_celery/test_tasks.py +++ b/swh/indexer/tests/zz_celery/test_tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -22,24 +22,6 @@ assert res.result == {"status": "eventful"} -def test_task_ctags( - mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config -): - - mock_indexer = mocker.patch("swh.indexer.tasks.CtagsIndexer.run") - mock_indexer.return_value = {"status": "eventful"} - - 
res = swh_scheduler_celery_app.send_task( - "swh.indexer.tasks.Ctags", - args=["id0"], - ) - assert res - res.wait() - assert res.successful() - - assert res.result == {"status": "eventful"} - - def test_task_fossology_license( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ):