diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -278,14 +278,20 @@ @timed @db_transaction() - def content_language_missing(self, languages, db=None, cur=None): + def content_language_missing( + self, languages: Iterable[Dict], db=None, cur=None + ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_language_missing_from_list(languages, cur)] @timed @db_transaction() - def content_language_get(self, ids, db=None, cur=None): + def content_language_get( + self, ids: Iterable[Sha1], db=None, cur=None + ) -> List[ContentLanguageRow]: return [ - converters.db_to_language(dict(zip(db.content_language_cols, c))) + ContentLanguageRow.from_dict( + converters.db_to_language(dict(zip(db.content_language_cols, c))) + ) for c in db.content_language_get_from_list(ids, cur) ] @@ -293,18 +299,22 @@ @process_metrics @db_transaction() def content_language_add( - self, languages: List[Dict], conflict_update: bool = False, db=None, cur=None + self, + languages: List[ContentLanguageRow], + conflict_update: bool = False, + db=None, + cur=None, ) -> Dict[str, int]: - check_id_duplicates(map(ContentLanguageRow.from_dict, languages)) - languages.sort(key=lambda m: m["id"]) + check_id_duplicates(languages) + languages.sort(key=lambda m: m.id) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ( { - "id": lang["id"], - "lang": "unknown" if not lang["lang"] else lang["lang"], - "indexer_configuration_id": lang["indexer_configuration_id"], + "id": lang.id, + "lang": lang.lang or "unknown", + "indexer_configuration_id": lang.indexer_configuration_id, } for lang in languages ), diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -288,19 +288,18 @@ def content_mimetype_get(self, ids: Iterable[Sha1]) -> List[ContentMimetypeRow]: return self._mimetypes.get(ids) - def content_language_missing(self, languages): + def content_language_missing( + self, languages: Iterable[Dict] + ) -> List[Tuple[Sha1, int]]: return self._languages.missing(languages) - def content_language_get(self, ids): - return [obj.to_dict() for obj in self._languages.get(ids)] + def content_language_get(self, ids: Iterable[Sha1]) -> List[ContentLanguageRow]: + return self._languages.get(ids) def content_language_add( - self, languages: List[Dict], conflict_update: bool = False + self, languages: List[ContentLanguageRow], conflict_update: bool = False ) -> Dict[str, int]: - check_id_types(languages) - added = self._languages.add( - map(ContentLanguageRow.from_dict, languages), conflict_update - ) + added = self._languages.add(languages, conflict_update) return {"content_language:add": added} def content_ctags_missing(self, ctags): diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py --- a/swh/indexer/storage/interface.py +++ b/swh/indexer/storage/interface.py @@ -7,7 +7,11 @@ from swh.core.api import remote_api_endpoint from swh.core.api.classes import PagedResult as CorePagedResult -from swh.indexer.storage.model import ContentLicenseRow, ContentMimetypeRow +from swh.indexer.storage.model import ( + ContentLanguageRow, + ContentLicenseRow, + ContentMimetypeRow, +) TResult = TypeVar("TResult") PagedResult = CorePagedResult[TResult, str] @@ -35,8 +39,8 @@ - **indexer_configuration_id** (int): tool used to compute the results - Yields: - tuple (id, indexer_configuration_id): missing id + Returns: + list of tuple (id, indexer_configuration_id) missing """ ... @@ -97,14 +101,16 @@ Args: ids: sha1 identifiers - Yields: + Returns: mimetype row objects """ ... @remote_api_endpoint("content_language/missing") - def content_language_missing(self, languages): + def content_language_missing( + self, languages: Iterable[Dict] + ) -> List[Tuple[Sha1, int]]: """List languages missing from storage. Args: @@ -114,41 +120,33 @@ - **indexer_configuration_id** (int): tool used to compute the results - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) + Returns: + list of tuple (id, indexer_configuration_id) missing """ ... @remote_api_endpoint("content_language") - def content_language_get(self, ids): + def content_language_get(self, ids: Iterable[Sha1]) -> List[ContentLanguageRow]: """Retrieve full content language per ids. Args: ids (iterable): sha1 identifier - Yields: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **lang** (bytes): raw content's language - - **tool** (dict): Tool used to compute the language + Returns: + language row objects """ ... @remote_api_endpoint("content_language/add") def content_language_add( - self, languages: List[Dict], conflict_update: bool = False + self, languages: List[ContentLanguageRow], conflict_update: bool = False ) -> Dict[str, int]: """Add languages not present in storage. Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **lang** (bytes): language detected + languages: language row objects conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false, the diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -12,7 +12,12 @@ from swh.indexer.storage.exc import DuplicateId, IndexerStorageArgumentException from swh.indexer.storage.interface import IndexerStorageInterface -from swh.indexer.storage.model import BaseRow, ContentLicenseRow, ContentMimetypeRow +from swh.indexer.storage.model import ( + BaseRow, + ContentLanguageRow, + ContentLicenseRow, + ContentMimetypeRow, +) from swh.model.hashutil import hash_to_bytes @@ -559,6 +564,8 @@ {"lang": "haskell",}, {"lang": "common-lisp",}, ] + row_from_dict = ContentLanguageRow.from_dict + dict_from_row = staticmethod(lambda x: x.to_dict()) # type: ignore class TestIndexerStorageContentCTags(StorageETypeTester):