diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -4,7 +4,7 @@ # See top-level LICENSE file for more information from copy import deepcopy -from typing import Any, Callable, Dict, Iterator, List, Tuple +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple from swh.core.config import merge_configs from swh.core.utils import grouper @@ -14,6 +14,7 @@ from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.origin_head import OriginHeadIndexer from swh.indexer.storage import INDEXER_CFG_KEY +from swh.indexer.storage.model import ContentMetadataRow from swh.model import hashutil REVISION_GET_BATCH_SIZE = 10 @@ -32,7 +33,7 @@ yield from f(list(group)) -class ContentMetadataIndexer(ContentIndexer[Dict]): +class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): """Content-level indexer This indexer is in charge of: @@ -52,7 +53,9 @@ ({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) ) - def index(self, id, data, log_suffix="unknown revision"): + def index( + self, id, data: Optional[bytes] = None, log_suffix="unknown revision", **kwargs + ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. Args: @@ -65,26 +68,27 @@ be returned as None """ - result = { - "id": id, - "indexer_configuration_id": self.tool["id"], - "metadata": None, - } + assert isinstance(id, bytes) + assert data is not None try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) - result["metadata"] = MAPPINGS[mapping_name](log_suffix).translate(data) + metadata = MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) - if result["metadata"] is None: + if metadata is None: return [] - return [result] + return [ + ContentMetadataRow( + id=id, indexer_configuration_id=self.tool["id"], metadata=metadata, + ) + ] def persist_index_computations( - self, results: List[Dict], policy_update: str + self, results: List[ContentMetadataRow], policy_update: str ) -> Dict[str, int]: """Persist the results in storage. @@ -247,9 +251,9 @@ ) for c in metadata_generator: # extracting metadata - sha1 = c["id"] + sha1 = c.id sha1s_in_storage.append(sha1) - local_metadata = c["metadata"] + local_metadata = c.metadata # local metadata is aggregated if local_metadata: metadata.append(local_metadata) @@ -268,7 +272,7 @@ ) # on the fly possibility: for result in c_metadata_indexer.results: - local_metadata = result["metadata"] + local_metadata = result.metadata metadata.append(local_metadata) except Exception: diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -3,7 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import abc import json import logging from typing import List @@ -11,7 +10,7 @@ from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values -class BaseMapping(metaclass=abc.ABCMeta): +class BaseMapping: """Base class for mappings to inherit from To implement a new mapping: @@ -27,14 +26,12 @@ ) @property - @abc.abstractmethod def name(self): """A name of this mapping, used as an identifier in the indexer storage.""" - pass + raise NotImplementedError(f"{self.__class__.__name__}.name") @classmethod - @abc.abstractmethod def detect_metadata_files(cls, files): """ Detects files potentially containing metadata @@ -45,11 +42,10 @@ Returns: list: list of sha1 (possibly empty) """ - pass + raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") - @abc.abstractmethod def translate(self, file_content): - pass + raise NotImplementedError(f"{self.__class__.__name__}.translate") def normalize_translation(self, metadata): return compact(metadata) @@ -59,10 +55,9 @@ """Base class for all mappings that use a single file as input.""" @property - @abc.abstractmethod def filename(self): """The .json file to extract metadata from.""" - pass + raise NotImplementedError(f"{self.__class__.__name__}.filename") @classmethod def detect_metadata_files(cls, file_entries): @@ -81,10 +76,9 @@ normalization.""" @property - @abc.abstractmethod def mapping(self): """A translation dict to map dict keys into a canonical name.""" - pass + raise NotImplementedError(f"{self.__class__.__name__}.mapping") @staticmethod def _normalize_method_name(name): diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -447,14 +447,20 @@ @timed @db_transaction() - def content_metadata_missing(self, metadata, db=None, cur=None): + def content_metadata_missing( + self, metadata: Iterable[Dict], db=None, cur=None + ) -> List[Tuple[Sha1, int]]: return [obj[0] for obj in db.content_metadata_missing_from_list(metadata, cur)] @timed @db_transaction() - def content_metadata_get(self, ids, db=None, cur=None): + def content_metadata_get( + self, ids: Iterable[Sha1], db=None, cur=None + ) -> List[ContentMetadataRow]: return [ - converters.db_to_metadata(dict(zip(db.content_metadata_cols, c))) + ContentMetadataRow.from_dict( + converters.db_to_metadata(dict(zip(db.content_metadata_cols, c))) + ) for c in db.content_metadata_get_from_list(ids, cur) ] @@ -462,15 +468,19 @@ @process_metrics @db_transaction() def content_metadata_add( - self, metadata: List[Dict], conflict_update: bool = False, db=None, cur=None + self, + metadata: List[ContentMetadataRow], + conflict_update: bool = False, + db=None, + cur=None, ) -> Dict[str, int]: - check_id_duplicates(map(ContentMetadataRow.from_dict, metadata)) - metadata.sort(key=lambda m: m["id"]) + check_id_duplicates(metadata) + metadata.sort(key=lambda m: m.id) db.mktemp_content_metadata(cur) db.copy_to( - metadata, + [m.to_dict() for m in metadata], "tmp_content_metadata", ["id", "metadata", "indexer_configuration_id"], cur, diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -361,19 +361,18 @@ indexer_configuration_id, partition_id, nb_partitions, page_token, limit ) - def content_metadata_missing(self, metadata): + def content_metadata_missing( + self, metadata: Iterable[Dict] + ) -> List[Tuple[Sha1, int]]: return self._content_metadata.missing(metadata) - def content_metadata_get(self, ids): - return [obj.to_dict() for obj in self._content_metadata.get(ids)] + def content_metadata_get(self, ids: Iterable[Sha1]) -> List[ContentMetadataRow]: + return self._content_metadata.get(ids) def content_metadata_add( - self, metadata: List[Dict], conflict_update: bool = False + self, metadata: List[ContentMetadataRow], conflict_update: bool = False ) -> Dict[str, int]: - check_id_types(metadata) - added = self._content_metadata.add( - map(ContentMetadataRow.from_dict, metadata), conflict_update - ) + added = self._content_metadata.add(metadata, conflict_update) return {"content_metadata:add": added} def revision_intrinsic_metadata_missing(self, metadata): diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py --- a/swh/indexer/storage/interface.py +++ b/swh/indexer/storage/interface.py @@ -11,6 +11,7 @@ ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, + ContentMetadataRow, ContentMimetypeRow, ) @@ -292,7 +293,9 @@ ... @remote_api_endpoint("content_metadata/missing") - def content_metadata_missing(self, metadata): + def content_metadata_missing( + self, metadata: Iterable[Dict] + ) -> List[Tuple[Sha1, int]]: """List metadata missing from storage. Args: @@ -309,7 +312,7 @@ ... @remote_api_endpoint("content_metadata") - def content_metadata_get(self, ids): + def content_metadata_get(self, ids: Iterable[Sha1]) -> List[ContentMetadataRow]: """Retrieve metadata per id. Args: @@ -327,7 +330,7 @@ @remote_api_endpoint("content_metadata/add") def content_metadata_add( - self, metadata: List[Dict], conflict_update: bool = False + self, metadata: List[ContentMetadataRow], conflict_update: bool = False ) -> Dict[str, int]: """Add metadata not present in storage. diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -18,6 +18,7 @@ ContentCtagsRow, ContentLanguageRow, ContentLicenseRow, + ContentMetadataRow, ContentMimetypeRow, ) from swh.model.hashutil import hash_to_bytes @@ -827,6 +828,8 @@ }, {"metadata": {"other": {}, "name": "test_metadata", "version": "0.0.1"},}, ] + row_from_dict = ContentMetadataRow.from_dict + dict_from_row = staticmethod(lambda x: x.to_dict()) # type: ignore class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester): diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -15,6 +15,7 @@ from swh.indexer.metadata_dictionary.maven import MavenMapping from swh.indexer.metadata_dictionary.npm import NpmMapping from swh.indexer.metadata_dictionary.ruby import GemspecMapping +from swh.indexer.storage.model import ContentMetadataRow from swh.indexer.tests.utils import DIRECTORY2, REVISION from swh.model.hashutil import hash_to_bytes from swh.model.model import Directory, DirectoryEntry, Revision @@ -143,8 +144,10 @@ results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) expected_results = [ - { - "metadata": { + ContentMetadataRow( + id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), + tool=TRANSLATOR_TOOL, + metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "codeRepository": "git+https://github.com/moranegg/metadata_test", @@ -152,10 +155,11 @@ "name": "test_metadata", "version": "0.0.1", }, - "id": hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), - }, - { - "metadata": { + ), + ContentMetadataRow( + id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), + tool=TRANSLATOR_TOOL, + metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "issueTracker": "https://github.com/npm/npm/issues", @@ -180,12 +184,11 @@ ], "url": "https://docs.npmjs.com/", }, - "id": hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"), - }, + ), ] for result in results: - del result["tool"] + del result.tool["id"] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) @@ -1111,11 +1114,11 @@ metadata_indexer.idx_storage.content_metadata_add( [ - { - "indexer_configuration_id": tool["id"], - "id": DIRECTORY2.entries[0].target, - "metadata": YARN_PARSER_METADATA, - } + ContentMetadataRow( + id=DIRECTORY2.entries[0].target, + indexer_configuration_id=tool["id"], + metadata=YARN_PARSER_METADATA, + ) ] ) @@ -1172,11 +1175,11 @@ metadata_indexer.idx_storage.content_metadata_add( [ - { - "indexer_configuration_id": tool["id"], - "id": DIRECTORY2.entries[0].target, - "metadata": YARN_PARSER_METADATA, - } + ContentMetadataRow( + id=DIRECTORY2.entries[0].target, + indexer_configuration_id=tool["id"], + metadata=YARN_PARSER_METADATA, + ) ] )