diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -15,7 +15,6 @@
 from swh.core.config import load_from_envvar, merge_configs
 from swh.indexer.storage import INDEXER_CFG_KEY, PagedResult, Sha1, get_indexer_storage
 from swh.indexer.storage.interface import IndexerStorageInterface
-from swh.indexer.storage.model import BaseRow
 from swh.model import hashutil
 from swh.model.model import Revision
 from swh.objstorage.exc import ObjNotFoundError
@@ -60,7 +59,7 @@
 
 
 # TODO: should be bound=Optional[BaseRow] when all endpoints move away from dicts
-TResult = TypeVar("TResult", bound=Union[None, Dict, BaseRow])
+TResult = TypeVar("TResult")
 
 
 class BaseIndexer(Generic[TResult], metaclass=abc.ABCMeta):
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -14,8 +14,9 @@
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.storage import INDEXER_CFG_KEY
-from swh.indexer.storage.model import ContentMetadataRow
+from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow
 from swh.model import hashutil
+from swh.model.model import Revision
 
 REVISION_GET_BATCH_SIZE = 10
 ORIGIN_GET_BATCH_SIZE = 10
@@ -115,7 +116,7 @@
     }
 
 
-class RevisionMetadataIndexer(RevisionIndexer[Dict]):
+class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]):
     """Revision-level indexer
 
     This indexer is in charge of:
@@ -145,7 +146,7 @@
             )
         )
 
-    def index(self, rev):
+    def index(self, id, data=None, **kwargs) -> List[RevisionIntrinsicMetadataRow]:
         """Index rev by processing it and organizing result.
 
         use metadata_detector to iterate on filenames
@@ -165,12 +166,12 @@
             - metadata: dict of retrieved metadata
 
         """
-        result = {
-            "id": rev.id,
-            "indexer_configuration_id": self.tool["id"],
-            "mappings": None,
-            "metadata": None,
-        }
+        rev = id
+        assert isinstance(rev, Revision)
+        assert data is None
+        # Defaults, so the row below can still be built if indexing raises
+        mappings: List[Any] = []
+        metadata: Dict = {}
 
         try:
             root_dir = rev.directory
@@ -179,20 +177,25 @@
             # If the root is just a single directory, recurse into it
             # eg. PyPI packages, GNU tarballs
             subdir = dir_ls[0]["target"]
-            dir_ls = self.storage.directory_ls(subdir, recursive=False)
+            dir_ls = list(self.storage.directory_ls(subdir, recursive=False))
             files = [entry for entry in dir_ls if entry["type"] == "file"]
             detected_files = detect_metadata(files)
             (mappings, metadata) = self.translate_revision_intrinsic_metadata(
                 detected_files, log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id),
             )
-            result["mappings"] = mappings
-            result["metadata"] = metadata
         except Exception as e:
             self.log.exception("Problem when indexing rev: %r", e)
-        return [result]
+        return [
+            RevisionIntrinsicMetadataRow(
+                id=rev.id,
+                indexer_configuration_id=self.tool["id"],
+                mappings=mappings,
+                metadata=metadata,
+            )
+        ]
 
     def persist_index_computations(
-        self, results: List[Dict], policy_update: str
+        self, results: List[RevisionIntrinsicMetadataRow], policy_update: str
     ) -> Dict[str, int]:
         """Persist the results in storage.
@@ -214,7 +217,7 @@
 
     def translate_revision_intrinsic_metadata(
         self, detected_files: Dict[str, List[Any]], log_suffix: str
-    ) -> Tuple[List[Any], List[Any]]:
+    ) -> Tuple[List[Any], Any]:
         """
         Determine plan of action to translate metadata
         when containing one or multiple detected files:
@@ -282,7 +285,7 @@
         return (used_mappings, metadata)
 
 
-class OriginMetadataIndexer(OriginIndexer[Dict]):
+class OriginMetadataIndexer(OriginIndexer[Tuple[Dict, RevisionIntrinsicMetadataRow]]):
     USE_TOOLS = False
 
     def __init__(self, config=None, **kwargs) -> None:
@@ -323,35 +326,42 @@
             for rev_metadata in self.revision_metadata_indexer.index(rev):
                 # There is at most one rev_metadata
                 orig_metadata = {
-                    "from_revision": rev_metadata["id"],
+                    "from_revision": rev_metadata.id,
                     "id": origin.url,
-                    "metadata": rev_metadata["metadata"],
-                    "mappings": rev_metadata["mappings"],
-                    "indexer_configuration_id": rev_metadata[
-                        "indexer_configuration_id"
-                    ],
+                    "metadata": rev_metadata.metadata,
+                    "mappings": rev_metadata.mappings,
+                    "indexer_configuration_id": rev_metadata.indexer_configuration_id,
                 }
                 results.append((orig_metadata, rev_metadata))
         return results
 
     def persist_index_computations(
-        self, results: List[Dict], policy_update: str
+        self,
+        results: List[Tuple[Dict, RevisionIntrinsicMetadataRow]],
+        policy_update: str,
     ) -> Dict[str, int]:
         conflict_update = policy_update == "update-dups"
 
         # Deduplicate revisions
-        rev_metadata: List[Any] = []
-        orig_metadata: List[Any] = []
-        revs_to_delete: List[Any] = []
-        origs_to_delete: List[Any] = []
+        rev_metadata: List[RevisionIntrinsicMetadataRow] = []
+        orig_metadata: List[Dict] = []
+        revs_to_delete: List[Dict] = []
+        origs_to_delete: List[Dict] = []
         summary: Dict = {}
         for (orig_item, rev_item) in results:
-            assert rev_item["metadata"] == orig_item["metadata"]
-            if not rev_item["metadata"] or rev_item["metadata"].keys() <= {"@context"}:
+            assert rev_item.metadata == orig_item["metadata"]
+            if not rev_item.metadata or rev_item.metadata.keys() <= {"@context"}:
                 # If we didn't find any metadata, don't store a DB record
                 # (and delete existing ones, if any)
                 if rev_item not in revs_to_delete:
-                    revs_to_delete.append(rev_item)
+                    revs_to_delete.append(
+                        {
+                            "id": rev_item.id,
+                            "indexer_configuration_id": (
+                                rev_item.indexer_configuration_id
+                            ),
+                        }
+                    )
                 if orig_item not in origs_to_delete:
                     origs_to_delete.append(orig_item)
             else:
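
For context, a minimal sketch (not part of the patch) of how callers are expected to consume the new row type; the id and tool values below are placeholders, and only the `RevisionIntrinsicMetadataRow` API visible in this diff (constructor, attribute access, `to_dict`/`from_dict`) is assumed:

```python
# Placeholder values throughout; this only illustrates the dict -> attrs-row
# migration that the metadata.py hunks above perform.
from swh.indexer.storage.model import RevisionIntrinsicMetadataRow

row = RevisionIntrinsicMetadataRow(
    id=b"\x01" * 20,              # sha1_git of the indexed revision (made up)
    indexer_configuration_id=42,  # id of the tool that produced the metadata
    mappings=["npm"],
    metadata={"name": "example"},
)

# Callers such as OriginMetadataIndexer now use attribute access
# (row.id, row.metadata, ...) instead of dict subscripts:
assert row.mappings == ["npm"]

# The dict form is still used at storage boundaries (e.g. db.copy_to),
# and from_dict(to_dict(...)) should round-trip:
assert row.to_dict()["metadata"] == {"name": "example"}
assert RevisionIntrinsicMetadataRow.from_dict(row.to_dict()) == row
```
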
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -492,7 +492,9 @@
 
     @timed
     @db_transaction()
-    def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None):
+    def revision_intrinsic_metadata_missing(
+        self, metadata: Iterable[Dict], db=None, cur=None
+    ) -> List[Tuple[Sha1, int]]:
         return [
             obj[0]
             for obj in db.revision_intrinsic_metadata_missing_from_list(metadata, cur)
@@ -500,9 +502,15 @@
         ]
 
     @timed
     @db_transaction()
-    def revision_intrinsic_metadata_get(self, ids, db=None, cur=None):
+    def revision_intrinsic_metadata_get(
+        self, ids: Iterable[Sha1], db=None, cur=None
+    ) -> List[RevisionIntrinsicMetadataRow]:
         return [
-            converters.db_to_metadata(dict(zip(db.revision_intrinsic_metadata_cols, c)))
+            RevisionIntrinsicMetadataRow.from_dict(
+                converters.db_to_metadata(
+                    dict(zip(db.revision_intrinsic_metadata_cols, c))
+                )
+            )
             for c in db.revision_intrinsic_metadata_get_from_list(ids, cur)
         ]
 
@@ -510,15 +518,19 @@
     @timed
     @process_metrics
     @db_transaction()
     def revision_intrinsic_metadata_add(
-        self, metadata: List[Dict], conflict_update: bool = False, db=None, cur=None
+        self,
+        metadata: List[RevisionIntrinsicMetadataRow],
+        conflict_update: bool = False,
+        db=None,
+        cur=None,
     ) -> Dict[str, int]:
-        check_id_duplicates(map(RevisionIntrinsicMetadataRow.from_dict, metadata))
-        metadata.sort(key=lambda m: m["id"])
+        check_id_duplicates(metadata)
+        metadata.sort(key=lambda m: m.id)
 
         db.mktemp_revision_intrinsic_metadata(cur)
 
         db.copy_to(
-            metadata,
+            [m.to_dict() for m in metadata],
             "tmp_revision_intrinsic_metadata",
             ["id", "metadata", "mappings", "indexer_configuration_id"],
             cur,
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -375,19 +375,22 @@
         added = self._content_metadata.add(metadata, conflict_update)
         return {"content_metadata:add": added}
 
-    def revision_intrinsic_metadata_missing(self, metadata):
+    def revision_intrinsic_metadata_missing(
+        self, metadata: Iterable[Dict]
+    ) -> List[Tuple[Sha1, int]]:
         return self._revision_intrinsic_metadata.missing(metadata)
 
-    def revision_intrinsic_metadata_get(self, ids):
-        return [obj.to_dict() for obj in self._revision_intrinsic_metadata.get(ids)]
+    def revision_intrinsic_metadata_get(
+        self, ids: Iterable[Sha1]
+    ) -> List[RevisionIntrinsicMetadataRow]:
+        return self._revision_intrinsic_metadata.get(ids)
 
     def revision_intrinsic_metadata_add(
-        self, metadata: List[Dict], conflict_update: bool = False
+        self,
+        metadata: List[RevisionIntrinsicMetadataRow],
+        conflict_update: bool = False,
     ) -> Dict[str, int]:
-        check_id_types(metadata)
-        added = self._revision_intrinsic_metadata.add(
-            map(RevisionIntrinsicMetadataRow.from_dict, metadata), conflict_update
-        )
+        added = self._revision_intrinsic_metadata.add(metadata, conflict_update)
         return {"revision_intrinsic_metadata:add": added}
 
     def revision_intrinsic_metadata_delete(self, entries: List[Dict]) -> Dict:
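
As a quick sanity sketch of the reworked endpoints on the in-memory backend (not part of the patch): it assumes a working swh.indexer install, that `IndexerStorage()` can be built with no arguments, and that `indexer_configuration_add` keeps its pre-existing dict-based registration shape.

```python
from swh.indexer.storage.in_memory import IndexerStorage
from swh.indexer.storage.model import RevisionIntrinsicMetadataRow

idx_storage = IndexerStorage()
# Register a tool first; the storage assigns its id (dict-based, unchanged API):
tool = idx_storage.indexer_configuration_add(
    [{"tool_name": "example-tool", "tool_version": "0.0.1", "tool_configuration": {}}]
)[0]

rev_id = b"\x02" * 20  # placeholder sha1_git
summary = idx_storage.revision_intrinsic_metadata_add(
    [
        RevisionIntrinsicMetadataRow(
            id=rev_id,
            indexer_configuration_id=tool["id"],
            mappings=["npm"],
            metadata={"name": "example"},
        )
    ]
)
assert summary == {"revision_intrinsic_metadata:add": 1}

# After this diff, rows (not dicts) come back from the getter:
rows = idx_storage.revision_intrinsic_metadata_get([rev_id])
assert rows[0].metadata == {"name": "example"}
```
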
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -13,6 +13,7 @@
     ContentLicenseRow,
     ContentMetadataRow,
     ContentMimetypeRow,
+    RevisionIntrinsicMetadataRow,
 )
 
 TResult = TypeVar("TResult")
@@ -350,7 +351,9 @@
         ...
 
     @remote_api_endpoint("revision_intrinsic_metadata/missing")
-    def revision_intrinsic_metadata_missing(self, metadata):
+    def revision_intrinsic_metadata_missing(
+        self, metadata: Iterable[Dict]
+    ) -> List[Tuple[Sha1, int]]:
         """List metadata missing from storage.
 
         Args:
@@ -360,45 +363,37 @@
             metadata (iterable): dictionaries with keys:
 
                 - **id** (bytes): sha1_git revision identifier
                 - **indexer_configuration_id** (int): tool used to compute
                   the results
 
-        Yields:
+        Returns:
             missing ids
 
         """
         ...
 
     @remote_api_endpoint("revision_intrinsic_metadata")
-    def revision_intrinsic_metadata_get(self, ids):
+    def revision_intrinsic_metadata_get(
+        self, ids: Iterable[Sha1]
+    ) -> List[RevisionIntrinsicMetadataRow]:
         """Retrieve revision metadata per id.
 
         Args:
             ids (iterable): sha1 checksums
 
-        Yields:
-            : dictionaries with the following keys:
-
-            - **id** (bytes)
-            - **metadata** (str): associated metadata
-            - **tool** (dict): tool used to compute metadata
-            - **mappings** (List[str]): list of mappings used to translate
-              these metadata
+        Returns:
+            RevisionIntrinsicMetadataRow objects
 
         """
         ...
 
     @remote_api_endpoint("revision_intrinsic_metadata/add")
     def revision_intrinsic_metadata_add(
-        self, metadata: List[Dict], conflict_update: bool = False
+        self,
+        metadata: List[RevisionIntrinsicMetadataRow],
+        conflict_update: bool = False,
    ) -> Dict[str, int]:
         """Add metadata not present in storage.
 
         Args:
-            metadata (iterable): dictionaries with keys:
-
-                - **id**: sha1_git of revision
-                - **metadata**: arbitrary dict
-                - **indexer_configuration_id**: tool used to compute metadata
-                - **mappings** (List[str]): list of mappings used to translate
-                  these metadata
+            metadata: RevisionIntrinsicMetadataRow objects
 
             conflict_update: Flag to determine if we want to overwrite (true)
               or skip duplicates (false, the default)
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -20,6 +20,7 @@
     ContentLicenseRow,
     ContentMetadataRow,
     ContentMimetypeRow,
+    RevisionIntrinsicMetadataRow,
 )
 from swh.model.hashutil import hash_to_bytes
 
@@ -857,6 +858,8 @@
             "mappings": ["mapping2"],
         },
     ]
+    row_from_dict = RevisionIntrinsicMetadataRow.from_dict
+    dict_from_row = staticmethod(lambda x: x.to_dict())  # type: ignore
 
     def test_revision_intrinsic_metadata_delete(
         self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
@@ -866,11 +869,11 @@
         tool = data.tools[self.tool_name]
         query = [data.sha1_2, data.sha1_1]
-        data1 = {
-            "id": data.sha1_2,
-            **self.example_data[0],
-            "indexer_configuration_id": tool["id"],
-        }
+        data1 = RevisionIntrinsicMetadataRow(
+            id=data.sha1_2,
+            indexer_configuration_id=tool["id"],
+            **self.example_data[0],  # type: ignore
+        )
 
         # when
         summary = endpoint(storage, etype, "add")([data1])
@@ -1139,12 +1142,12 @@
             "version": None,
             "name": None,
         }
-        metadata_rev = {
-            "id": data.revision_id_2,
-            "metadata": metadata,
-            "mappings": ["mapping1"],
-            "indexer_configuration_id": tool_id,
-        }
+        metadata_rev = RevisionIntrinsicMetadataRow(
+            id=data.revision_id_2,
+            metadata=metadata,
+            mappings=["mapping1"],
+            indexer_configuration_id=tool_id,
+        )
         metadata_origin = {
             "id": data.origin_url_1,
             "metadata": metadata,
@@ -1185,12 +1188,12 @@
             "version": None,
             "name": None,
         }
-        metadata_rev = {
-            "id": data.revision_id_2,
-            "metadata": metadata,
-            "mappings": ["mapping1"],
-            "indexer_configuration_id": tool_id,
-        }
+        metadata_rev = RevisionIntrinsicMetadataRow(
+            id=data.revision_id_2,
+            indexer_configuration_id=tool_id,
+            metadata=metadata,
+            mappings=["mapping1"],
+        )
         metadata_origin = {
             "id": data.origin_url_1,
             "metadata": metadata,
@@ -1239,12 +1242,12 @@
             "version": None,
             "name": None,
         }
-        metadata_rev_v1 = {
-            "id": data.revision_id_1,
-            "metadata": metadata_v1.copy(),
-            "mappings": [],
-            "indexer_configuration_id": tool_id,
-        }
+        metadata_rev_v1 = RevisionIntrinsicMetadataRow(
+            id=data.revision_id_1,
+            metadata=metadata_v1.copy(),
+            mappings=[],
+            indexer_configuration_id=tool_id,
+        )
         metadata_origin_v1 = {
             "id": data.origin_url_1,
             "metadata": metadata_v1.copy(),
@@ -1279,9 +1282,8 @@
         metadata_v2.update(
             {"name": "test_metadata", "author": "MG",}
         )
-        metadata_rev_v2 = metadata_rev_v1.copy()
+        metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
         metadata_origin_v2 = metadata_origin_v1.copy()
-        metadata_rev_v2["metadata"] = metadata_v2
         metadata_origin_v2["metadata"] = metadata_v2
 
         storage.revision_intrinsic_metadata_add([metadata_rev_v2])
@@ -1306,12 +1308,12 @@
             "version": None,
             "name": None,
         }
-        metadata_rev_v1 = {
-            "id": data.revision_id_2,
-            "metadata": metadata_v1,
-            "mappings": [],
-            "indexer_configuration_id": tool_id,
-        }
+        metadata_rev_v1 = RevisionIntrinsicMetadataRow(
+            id=data.revision_id_2,
+            metadata=metadata_v1,
+            mappings=[],
+            indexer_configuration_id=tool_id,
+        )
         metadata_origin_v1 = {
             "id": data.origin_url_1,
             "metadata": metadata_v1.copy(),
@@ -1346,9 +1348,8 @@
         metadata_v2.update(
             {"name": "test_update_duplicated_metadata", "author": "MG",}
         )
-        metadata_rev_v2 = metadata_rev_v1.copy()
+        metadata_rev_v2 = attr.evolve(metadata_rev_v1, metadata=metadata_v2)
         metadata_origin_v2 = metadata_origin_v1.copy()
-        metadata_rev_v2["metadata"] = metadata_v2
         metadata_origin_v2 = {
             "id": data.origin_url_1,
             "metadata": metadata_v2.copy(),
@@ -1397,12 +1398,12 @@
             "mappings": [],
         }
 
-        metadata_rev_v1 = {
-            "id": data.revision_id_2,
-            "metadata": {"version": None, "name": None,},
-            "mappings": [],
-            "indexer_configuration_id": tool_id,
-        }
+        metadata_rev_v1 = RevisionIntrinsicMetadataRow(
+            id=data.revision_id_2,
+            metadata={"version": None, "name": None,},
+            mappings=[],
+            indexer_configuration_id=tool_id,
+        )
 
         data_v1 = [
             {
@@ -1490,12 +1491,12 @@
             "developmentStatus": None,
             "name": None,
         }
-        metadata_rev = {
-            "id": data.revision_id_2,
-            "metadata": metadata,
-            "mappings": ["mapping1"],
-            "indexer_configuration_id": tool_id,
-        }
+        metadata_rev = RevisionIntrinsicMetadataRow(
+            id=data.revision_id_2,
+            metadata=metadata,
+            mappings=["mapping1"],
+            indexer_configuration_id=tool_id,
+        )
         metadata_origin = {
             "id": data.origin_url_1,
             "metadata": metadata,
@@ -1520,12 +1521,12 @@
         metadata1 = {
             "author": "John Doe",
         }
-        metadata1_rev = {
-            "id": data.revision_id_1,
-            "metadata": metadata1,
-            "mappings": [],
-            "indexer_configuration_id": tool_id,
-        }
+        metadata1_rev = RevisionIntrinsicMetadataRow(
+            id=data.revision_id_1,
+            metadata=metadata1,
+            mappings=[],
+            indexer_configuration_id=tool_id,
+        )
         metadata1_origin = {
             "id": data.origin_url_1,
             "metadata": metadata1,
@@ -1536,12 +1537,12 @@
         metadata2 = {
             "author": "Jane Doe",
         }
-        metadata2_rev = {
-            "id": data.revision_id_2,
-            "metadata": metadata2,
-            "mappings": [],
-            "indexer_configuration_id": tool_id,
-        }
+        metadata2_rev = RevisionIntrinsicMetadataRow(
+            id=data.revision_id_2,
+            metadata=metadata2,
+            mappings=[],
+            indexer_configuration_id=tool_id,
+        )
         metadata2_origin = {
             "id": data.origin_url_2,
             "metadata": metadata2,
@@ -1577,12 +1578,12 @@
         # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words
         # for small values of nb_words).
metadata1 = {"author": ["Random Person", "John Doe", "Jane Doe",]} - metadata1_rev = { - "id": data.revision_id_1, - "metadata": metadata1, - "mappings": [], - "indexer_configuration_id": tool_id, - } + metadata1_rev = RevisionIntrinsicMetadataRow( + id=data.revision_id_1, + metadata=metadata1, + mappings=[], + indexer_configuration_id=tool_id, + ) metadata1_origin = { "id": data.origin_url_1, "metadata": metadata1, @@ -1591,12 +1592,12 @@ "from_revision": data.revision_id_1, } metadata2 = {"author": ["Random Person", "Jane Doe",]} - metadata2_rev = { - "id": data.revision_id_2, - "metadata": metadata2, - "mappings": [], - "indexer_configuration_id": tool_id, - } + metadata2_rev = RevisionIntrinsicMetadataRow( + id=data.revision_id_2, + metadata=metadata2, + mappings=[], + indexer_configuration_id=tool_id, + ) metadata2_origin = { "id": data.origin_url_2, "metadata": metadata2, @@ -1636,12 +1637,12 @@ "@context": "foo", "author": "John Doe", } - metadata1_rev = { - "id": data.revision_id_1, - "metadata": metadata1, - "mappings": ["npm"], - "indexer_configuration_id": tool1_id, - } + metadata1_rev = RevisionIntrinsicMetadataRow( + id=data.revision_id_1, + metadata=metadata1, + mappings=["npm"], + indexer_configuration_id=tool1_id, + ) metadata1_origin = { "id": data.origin_url_1, "metadata": metadata1, @@ -1653,12 +1654,12 @@ "@context": "foo", "author": "Jane Doe", } - metadata2_rev = { - "id": data.revision_id_2, - "metadata": metadata2, - "mappings": ["npm", "gemspec"], - "indexer_configuration_id": tool2_id, - } + metadata2_rev = RevisionIntrinsicMetadataRow( + id=data.revision_id_2, + metadata=metadata2, + mappings=["npm", "gemspec"], + indexer_configuration_id=tool2_id, + ) metadata2_origin = { "id": data.origin_url_2, "metadata": metadata2, @@ -1669,12 +1670,12 @@ metadata3 = { "@context": "foo", } - metadata3_rev = { - "id": data.revision_id_3, - "metadata": metadata3, - "mappings": ["npm", "gemspec"], - "indexer_configuration_id": tool2_id, - } + metadata3_rev = RevisionIntrinsicMetadataRow( + id=data.revision_id_3, + metadata=metadata3, + mappings=["npm", "gemspec"], + indexer_configuration_id=tool2_id, + ) metadata3_origin = { "id": data.origin_url_3, "metadata": metadata3, @@ -1810,7 +1811,7 @@ } -class TestIndexerStorageIndexerCondifuration: +class TestIndexerStorageIndexerConfiguration: def test_indexer_configuration_add( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any] ) -> None: diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -6,12 +6,15 @@ from functools import reduce import re import tempfile +from typing import Any, Dict, List from unittest.mock import patch from click.testing import CliRunner from confluent_kafka import Consumer, Producer from swh.indexer.cli import indexer_cli_group +from swh.indexer.storage.interface import IndexerStorageInterface +from swh.indexer.storage.model import RevisionIntrinsicMetadataRow from swh.journal.serializers import value_to_kafka from swh.model.hashutil import hash_to_bytes @@ -27,8 +30,8 @@ """ -def fill_idx_storage(idx_storage, nb_rows): - tools = [ +def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]: + tools: List[Dict[str, Any]] = [ {"tool_name": "tool %d" % i, "tool_version": "0.0.1", "tool_configuration": {},} for i in range(2) ] @@ -45,12 +48,12 @@ for origin_id in range(nb_rows) ] revision_metadata = [ - { - "id": hash_to_bytes("abcd{:0>4}".format(origin_id)), - 
"indexer_configuration_id": tools[origin_id % 2]["id"], - "metadata": {"name": "origin %d" % origin_id}, - "mappings": ["mapping%d" % (origin_id % 10)], - } + RevisionIntrinsicMetadataRow( + id=hash_to_bytes("abcd{:0>4}".format(origin_id)), + indexer_configuration_id=tools[origin_id % 2]["id"], + metadata={"name": "origin %d" % origin_id}, + mappings=["mapping%d" % (origin_id % 10)], + ) for origin_id in range(nb_rows) ] diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -15,7 +15,7 @@ from swh.indexer.metadata_dictionary.maven import MavenMapping from swh.indexer.metadata_dictionary.npm import NpmMapping from swh.indexer.metadata_dictionary.ruby import GemspecMapping -from swh.indexer.storage.model import ContentMetadataRow +from swh.indexer.storage.model import ContentMetadataRow, RevisionIntrinsicMetadataRow from swh.indexer.tests.utils import DIRECTORY2, REVISION from swh.model.hashutil import hash_to_bytes from swh.model.model import Directory, DirectoryEntry, Revision @@ -1129,16 +1129,16 @@ ) expected_results = [ - { - "id": rev.id, - "tool": TRANSLATOR_TOOL, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], - } + RevisionIntrinsicMetadataRow( + id=rev.id, + tool=TRANSLATOR_TOOL, + metadata=YARN_PARSER_METADATA, + mappings=["npm"], + ) ] for result in results: - del result["tool"]["id"] + del result.tool["id"] # then self.assertEqual(results, expected_results) @@ -1190,16 +1190,16 @@ ) expected_results = [ - { - "id": new_rev.id, - "tool": TRANSLATOR_TOOL, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], - } + RevisionIntrinsicMetadataRow( + id=new_rev.id, + tool=TRANSLATOR_TOOL, + metadata=YARN_PARSER_METADATA, + mappings=["npm"], + ) ] for result in results: - del result["tool"]["id"] + del result.tool["id"] # then self.assertEqual(results, expected_results) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -6,43 +6,56 @@ from unittest.mock import patch from swh.indexer.metadata import OriginMetadataIndexer +from swh.indexer.storage.interface import IndexerStorageInterface +from swh.indexer.storage.model import RevisionIntrinsicMetadataRow from swh.model.model import Origin +from swh.storage.interface import StorageInterface from .test_metadata import REVISION_METADATA_CONFIG from .utils import REVISION, YARN_PARSER_METADATA -def test_origin_metadata_indexer(idx_storage, storage, obj_storage): +def test_origin_metadata_indexer( + idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage +) -> None: indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) origin = "https://github.com/librariesio/yarn-parser" indexer.run([origin]) - rev_id = REVISION.id - rev_metadata = { - "id": rev_id, - "metadata": YARN_PARSER_METADATA, - "mappings": ["npm"], + tool = { + "name": "swh-metadata-translator", + "version": "0.0.2", + "configuration": {"context": "NpmMapping", "type": "local"}, } + + rev_id = REVISION.id + rev_metadata = RevisionIntrinsicMetadataRow( + id=rev_id, tool=tool, metadata=YARN_PARSER_METADATA, mappings=["npm"], + ) origin_metadata = { "id": origin, + "tool": tool, "from_revision": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"], } - results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) - for result in results: - del 
result["tool"] - assert results == [rev_metadata] + rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) + for rev_result in rev_results: + assert rev_result.tool + del rev_result.tool["id"] + assert rev_results == [rev_metadata] - results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) - for result in results: - del result["tool"] - assert results == [origin_metadata] + orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) + for orig_result in orig_results: + del orig_result["tool"]["id"] + assert orig_results == [origin_metadata] -def test_origin_metadata_indexer_duplicate_origin(idx_storage, storage, obj_storage): +def test_origin_metadata_indexer_duplicate_origin( + idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage +) -> None: indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage @@ -59,7 +72,9 @@ assert len(results) == 1 -def test_origin_metadata_indexer_missing_head(idx_storage, storage, obj_storage): +def test_origin_metadata_indexer_missing_head( + idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage +) -> None: storage.origin_add([Origin(url="https://example.com")]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) @@ -72,8 +87,8 @@ def test_origin_metadata_indexer_partial_missing_head( - idx_storage, storage, obj_storage -): + idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage +) -> None: origin1 = "https://example.com" origin2 = "https://github.com/librariesio/yarn-parser" @@ -83,19 +98,22 @@ rev_id = REVISION.id - results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) - for result in results: - del result["tool"] - assert results == [ - {"id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"],} - ] - - results = list( + rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) + assert rev_results == [ + RevisionIntrinsicMetadataRow( + id=rev_id, + metadata=YARN_PARSER_METADATA, + mappings=["npm"], + tool=rev_results[0].tool, + ) + ] + + orig_results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) - for result in results: - del result["tool"] - assert results == [ + for orig_result in orig_results: + del orig_result["tool"] + assert orig_results == [ { "id": origin2, "from_revision": rev_id, @@ -105,10 +123,13 @@ ] -def test_origin_metadata_indexer_duplicate_revision(idx_storage, storage, obj_storage): +def test_origin_metadata_indexer_duplicate_revision( + idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage +) -> None: indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage + indexer.catch_exceptions = False origin1 = "https://github.com/librariesio/yarn-parser" origin2 = "https://github.com/librariesio/yarn-parser.git" indexer.run([origin1, origin2]) @@ -124,7 +145,9 @@ assert len(results) == 2 -def test_origin_metadata_indexer_no_metadata_file(idx_storage, storage, obj_storage): +def test_origin_metadata_indexer_no_metadata_file( + idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage +) -> None: indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) origin = "https://github.com/librariesio/yarn-parser" @@ -140,7 +163,9 @@ assert results == [] -def test_origin_metadata_indexer_no_metadata(idx_storage, storage, obj_storage): 
+def test_origin_metadata_indexer_no_metadata(
+    idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage
+) -> None:
     indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
 
     origin = "https://github.com/librariesio/yarn-parser"
@@ -160,7 +185,9 @@
     assert results == []
 
 
-def test_origin_metadata_indexer_error(idx_storage, storage, obj_storage):
+def test_origin_metadata_indexer_error(
+    idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage
+) -> None:
     indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
 
     origin = "https://github.com/librariesio/yarn-parser"
@@ -180,7 +207,9 @@
     assert results == []
 
 
-def test_origin_metadata_indexer_delete_metadata(idx_storage, storage, obj_storage):
+def test_origin_metadata_indexer_delete_metadata(
+    idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage
+) -> None:
     indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
 
     origin = "https://github.com/librariesio/yarn-parser"
@@ -204,7 +233,9 @@
     assert results == []
 
 
-def test_origin_metadata_indexer_unknown_origin(idx_storage, storage, obj_storage):
+def test_origin_metadata_indexer_unknown_origin(
+    idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage
+) -> None:
 
     indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
     result = indexer.index_list(["https://unknown.org/foo"])
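
One loose end worth noting: origin intrinsic metadata stays dict-based in this diff, so callers bridge rows and dicts the way `OriginMetadataIndexer.index_origin_head` does above. A sketch of that bridge pattern (not part of the patch; the origin URL and values are placeholders):

```python
from swh.indexer.storage.model import RevisionIntrinsicMetadataRow

rev_row = RevisionIntrinsicMetadataRow(
    id=b"\x05" * 20,  # placeholder sha1_git
    indexer_configuration_id=1,
    mappings=["npm"],
    metadata={"name": "example"},
)

# Build the origin-level record (still a dict) from the revision-level row,
# mirroring the orig_metadata construction in metadata.py:
orig_metadata = {
    "id": "https://example.org/project",  # origin URL (placeholder)
    "from_revision": rev_row.id,
    "metadata": rev_row.metadata,
    "mappings": rev_row.mappings,
    "indexer_configuration_id": rev_row.indexer_configuration_id,
}

# The invariant that persist_index_computations asserts on each pair:
assert orig_metadata["metadata"] == rev_row.metadata
```
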