diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -25,7 +25,6 @@
 from .interface import PagedResult, Sha1
 from .metrics import process_metrics, send_metric, timed
 from .model import (
-    ContentLanguageRow,
     ContentLicenseRow,
     ContentMetadataRow,
     ContentMimetypeRow,
@@ -102,16 +101,17 @@
         data (List[dict]): List of dictionaries to be inserted

     >>> check_id_duplicates([
-    ...     ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"),
-    ...     ContentLanguageRow(id=b'foo', indexer_configuration_id=32, lang="python"),
+    ...     ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="GPL"),
+    ...     ContentLicenseRow(id=b'foo', indexer_configuration_id=32, license="GPL"),
     ... ])
     >>> check_id_duplicates([
-    ...     ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"),
-    ...     ContentLanguageRow(id=b'foo', indexer_configuration_id=42, lang="python"),
+    ...     ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="AGPL"),
+    ...     ContentLicenseRow(id=b'foo', indexer_configuration_id=42, license="AGPL"),
     ... ])
     Traceback (most recent call last):
-    ...
-    swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'indexer_configuration_id': 42}]
+      ...
+    swh.indexer.storage.exc.DuplicateId: [{'id': b'foo', 'indexer_configuration_id': 42, 'license': 'AGPL'}]
+
     """  # noqa
     counter = Counter(tuple(sorted(item.unique_key().items())) for item in data)
     duplicates = [id_ for (id_, count) in counter.items() if count >= 2]
@@ -194,7 +194,7 @@
         bound by limit.

         Args:
-            **indexer_type**: Type of data content to index (mimetype, language, etc...)
+            **indexer_type**: Type of data content to index (mimetype, etc...)
             **indexer_configuration_id**: The tool used to index data
             **partition_id**: index of the partition to fetch
             **nb_partitions**: total number of partitions to split into
@@ -303,56 +303,6 @@
             for c in db.content_mimetype_get_from_list(ids, cur)
         ]

-    @timed
-    @db_transaction()
-    def content_language_missing(
-        self, languages: Iterable[Dict], db=None, cur=None
-    ) -> List[Tuple[Sha1, int]]:
-        return [obj[0] for obj in db.content_language_missing_from_list(languages, cur)]
-
-    @timed
-    @db_transaction()
-    def content_language_get(
-        self, ids: Iterable[Sha1], db=None, cur=None
-    ) -> List[ContentLanguageRow]:
-        return [
-            ContentLanguageRow.from_dict(
-                converters.db_to_language(dict(zip(db.content_language_cols, c)))
-            )
-            for c in db.content_language_get_from_list(ids, cur)
-        ]
-
-    @timed
-    @process_metrics
-    @db_transaction()
-    def content_language_add(
-        self,
-        languages: List[ContentLanguageRow],
-        db=None,
-        cur=None,
-    ) -> Dict[str, int]:
-        check_id_duplicates(languages)
-        languages.sort(key=lambda m: m.id)
-        self.journal_writer.write_additions("content_language", languages)
-        db.mktemp_content_language(cur)
-        # empty language is mapped to 'unknown'
-        db.copy_to(
-            (
-                {
-                    "id": lang.id,
-                    "lang": lang.lang or "unknown",
-                    "indexer_configuration_id": lang.indexer_configuration_id,
-                }
-                for lang in languages
-            ),
-            "tmp_content_language",
-            ["id", "lang", "indexer_configuration_id"],
-            cur,
-        )
-
-        count = db.content_language_add_from_temp(cur)
-        return {"content_language:add": count}
-
     @timed
     @db_transaction()
     def content_fossology_license_get(
diff --git a/swh/indexer/storage/converters.py b/swh/indexer/storage/converters.py
--- a/swh/indexer/storage/converters.py
+++ b/swh/indexer/storage/converters.py
@@ -19,20 +19,6 @@
     }


-def db_to_language(language):
-    """Convert a language entry into a ready language output."""
-    return {
-        "id": language["id"],
-        "lang": language["lang"],
-        "tool": {
-            "id": language["tool_id"],
-            "name": language["tool_name"],
-            "version": language["tool_version"],
-            "configuration": language["tool_configuration"],
-        },
-    }
-
-
 def db_to_metadata(metadata):
     """Convert a metadata entry into a ready metadata output."""
     metadata["tool"] = {
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -22,8 +22,7 @@
         """Read from table the data with hash_keys that are missing.

         Args:
-            table: Table name (e.g content_mimetype, content_language,
-              etc...)
+            table: Table name (e.g content_mimetype, fossology_license, etc...)
             data: Dict of data to read from
             hash_keys: List of keys to read in the data dict.

@@ -85,8 +84,7 @@

         Expected:
             Tables content_{something} being aliased as 'c' (something
-            in {language, mimetype, ...}), table indexer_configuration
-            being aliased as 'i'.
+            in {mimetype, ...}), table indexer_configuration being aliased as 'i'.

         """
         if key == "id":
@@ -181,37 +179,6 @@
             "content_mimetype", ids, self.content_mimetype_cols, cur=cur
         )

-    content_language_hash_keys = ["id", "indexer_configuration_id"]
-
-    def content_language_missing_from_list(self, languages, cur=None):
-        """List missing languages."""
-        yield from self._missing_from_list(
-            "content_language", languages, self.content_language_hash_keys, cur=cur
-        )
-
-    content_language_cols = [
-        "id",
-        "lang",
-        "tool_id",
-        "tool_name",
-        "tool_version",
-        "tool_configuration",
-    ]
-
-    @stored_procedure("swh_mktemp_content_language")
-    def mktemp_content_language(self, cur=None):
-        pass
-
-    def content_language_add_from_temp(self, cur=None):
-        cur = self._cursor(cur)
-        cur.execute("select * from swh_content_language_add()")
-        return cur.fetchone()[0]
-
-    def content_language_get_from_list(self, ids, cur=None):
-        yield from self._get_from_list(
-            "content_language", ids, self.content_language_cols, cur=cur
-        )
-
     content_fossology_license_cols = [
         "id",
         "tool_id",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -33,7 +33,6 @@
 from .interface import PagedResult, Sha1
 from .model import (
     BaseRow,
-    ContentLanguageRow,
     ContentLicenseRow,
     ContentMetadataRow,
     ContentMimetypeRow,
@@ -154,7 +153,7 @@
         bound by limit.

         Args:
-            **indexer_type**: Type of data content to index (mimetype, language, etc...)
+            **indexer_type**: Type of data content to index (mimetype, etc...)
             **indexer_configuration_id**: The tool used to index data
             **partition_id**: index of the partition to fetch
             **nb_partitions**: total number of partitions to split into
@@ -246,7 +245,6 @@
         self.journal_writer = JournalWriter(tool_getter, journal_writer)
         args = (self._tools, self.journal_writer)
         self._mimetypes = SubStorage(ContentMimetypeRow, *args)
-        self._languages = SubStorage(ContentLanguageRow, *args)
         self._licenses = SubStorage(ContentLicenseRow, *args)
         self._content_metadata = SubStorage(ContentMetadataRow, *args)
         self._directory_intrinsic_metadata = SubStorage(
@@ -284,20 +282,6 @@
     def content_mimetype_get(self, ids: Iterable[Sha1]) -> List[ContentMimetypeRow]:
         return self._mimetypes.get(ids)

-    def content_language_missing(
-        self, languages: Iterable[Dict]
-    ) -> List[Tuple[Sha1, int]]:
-        return self._languages.missing(languages)
-
-    def content_language_get(self, ids: Iterable[Sha1]) -> List[ContentLanguageRow]:
-        return self._languages.get(ids)
-
-    def content_language_add(
-        self, languages: List[ContentLanguageRow]
-    ) -> Dict[str, int]:
-        added = self._languages.add(languages)
-        return {"content_language:add": added}
-
     def content_fossology_license_get(
         self, ids: Iterable[Sha1]
     ) -> List[ContentLicenseRow]:
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -10,7 +10,6 @@
 from swh.core.api import remote_api_endpoint
 from swh.core.api.classes import PagedResult as CorePagedResult
 from swh.indexer.storage.model import (
-    ContentLanguageRow,
     ContentLicenseRow,
     ContentMetadataRow,
     ContentMimetypeRow,
@@ -113,53 +112,6 @@
         """
         ...

-    @remote_api_endpoint("content_language/missing")
-    def content_language_missing(
-        self, languages: Iterable[Dict]
-    ) -> List[Tuple[Sha1, int]]:
-        """List languages missing from storage.
-
-        Args:
-            languages (iterable): dictionaries with keys:
-
-                - **id** (bytes): sha1 identifier
-                - **indexer_configuration_id** (int): tool used to compute
-                  the results
-
-        Returns:
-            list of tuple (id, indexer_configuration_id) missing
-
-        """
-        ...
-
-    @remote_api_endpoint("content_language")
-    def content_language_get(self, ids: Iterable[Sha1]) -> List[ContentLanguageRow]:
-        """Retrieve full content language per ids.
-
-        Args:
-            ids (iterable): sha1 identifier
-
-        Returns:
-            language row objects
-
-        """
-        ...
-
-    @remote_api_endpoint("content_language/add")
-    def content_language_add(
-        self, languages: List[ContentLanguageRow]
-    ) -> Dict[str, int]:
-        """Add languages not present in storage.
-
-        Args:
-            languages: language row objects
-
-        Returns:
-            Dict summary of number of rows added
-
-        """
-        ...
-
     @remote_api_endpoint("content/fossology_license")
     def content_fossology_license_get(
         self, ids: Iterable[Sha1]
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py
--- a/swh/indexer/storage/model.py
+++ b/swh/indexer/storage/model.py
@@ -75,14 +75,6 @@
     encoding = attr.ib(type=str)


-@attr.s
-class ContentLanguageRow(BaseRow):
-    object_type: Final = "content_language"
-
-    id = attr.ib(type=Sha1Git)
-    lang = attr.ib(type=str)
-
-
 @attr.s
 class ContentLicenseRow(BaseRow):
     object_type: Final = "content_fossology_license"
diff --git a/swh/indexer/tests/storage/test_converters.py b/swh/indexer/tests/storage/test_converters.py
--- a/swh/indexer/tests/storage/test_converters.py
+++ b/swh/indexer/tests/storage/test_converters.py
@@ -34,32 +34,6 @@
     assert actual_mimetype == expected_mimetype


-def test_db_to_language() -> None:
-    input_language = {
-        "id": b"some-id",
-        "tool_id": 20,
-        "tool_name": "some-toolname",
-        "tool_version": "some-toolversion",
-        "tool_configuration": {},
-        "lang": b"css",
-    }
-
-    expected_language = {
-        "id": b"some-id",
-        "lang": b"css",
-        "tool": {
-            "id": 20,
-            "name": "some-toolname",
-            "version": "some-toolversion",
-            "configuration": {},
-        },
-    }
-
-    actual_language = converters.db_to_language(input_language)
-
-    assert actual_language == expected_language
-
-
 def test_db_to_fossology_license() -> None:
     input_license = {
         "id": b"some-id",
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -14,7 +14,6 @@
 from swh.indexer.storage.interface import IndexerStorageInterface, PagedResult
 from swh.indexer.storage.model import (
     BaseRow,
-    ContentLanguageRow,
     ContentLicenseRow,
     ContentMetadataRow,
     ContentMimetypeRow,
@@ -503,22 +502,6 @@
             assert actual_id in expected_ids


-class TestIndexerStorageContentLanguage(StorageETypeTester):
-    """Test Indexer Storage content_language related methods"""
-
-    endpoint_type = "content_language"
-    tool_name = "pygments"
-    example_data = [
-        {
-            "lang": "haskell",
-        },
-        {
-            "lang": "common-lisp",
-        },
-    ]
-    row_class = ContentLanguageRow
-
-
 class TestIndexerStorageContentMetadata(StorageETypeTester):
     """Test Indexer Storage content_metadata related methods"""

diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -67,7 +67,7 @@


 class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
-    """Language indexer test scenarios:
+    """Fossology license indexer test scenarios:

     - Known sha1s in the input list have their data indexed
     - Unknown sha1 in the input list are not indexed
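Note on the updated doctest in swh/indexer/storage/__init__.py: the DuplicateId payload now carries a 'license' field because check_id_duplicates builds its duplicate keys from each row's unique_key(), and for ContentLicenseRow that key includes the license value. The sketch below is illustrative only and not part of the patch; it simply replays the doctest outside of a docstring, assuming swh.indexer is installed.

    # Illustrative sketch (assumption: swh.indexer installed); mirrors the doctest above.
    from swh.indexer.storage import check_id_duplicates
    from swh.indexer.storage.exc import DuplicateId
    from swh.indexer.storage.model import ContentLicenseRow

    rows = [
        ContentLicenseRow(id=b"foo", indexer_configuration_id=42, license="AGPL"),
        ContentLicenseRow(id=b"foo", indexer_configuration_id=42, license="AGPL"),
    ]
    try:
        check_id_duplicates(rows)  # same unique key twice -> DuplicateId
    except DuplicateId as exc:
        print(exc)  # [{'id': b'foo', 'indexer_configuration_id': 42, 'license': 'AGPL'}]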