diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -14,6 +14,7 @@
     Optional,
     Tuple,
     TypeVar,
+    cast,
 )
 
 import sentry_sdk
@@ -24,6 +25,7 @@
 from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
+from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
 from swh.indexer.origin_head import get_head_swhid
 from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
 from swh.indexer.storage.model import (
@@ -202,27 +204,34 @@
             - metadata: dict of retrieved metadata
 
         """
+        dir_: List[DirectoryLsEntry]
         if data is None:
-            dir_ = list(self.storage.directory_ls(id, recursive=False))
+            dir_ = cast(
+                List[DirectoryLsEntry],
+                list(self.storage.directory_ls(id, recursive=False)),
+            )
         else:
             assert isinstance(data, Directory)
-            dir_ = data.to_dict()
+            dir_ = data.to_dict()["entries"]
 
         try:
             if [entry["type"] for entry in dir_] == ["dir"]:
                 # If the root is just a single directory, recurse into it
                 # eg. PyPI packages, GNU tarballs
                 subdir = dir_[0]["target"]
-                dir_ = list(self.storage.directory_ls(subdir, recursive=False))
+                dir_ = cast(
+                    List[DirectoryLsEntry],
+                    list(self.storage.directory_ls(subdir, recursive=False)),
+                )
             files = [entry for entry in dir_ if entry["type"] == "file"]
-            detected_files = detect_metadata(files)
             (mappings, metadata) = self.translate_directory_intrinsic_metadata(
-                detected_files,
+                files,
                 log_suffix="directory=%s" % hashutil.hash_to_hex(id),
             )
         except Exception as e:
             self.log.exception("Problem when indexing dir: %r", e)
             sentry_sdk.capture_exception()
+            return []
         return [
             DirectoryIntrinsicMetadataRow(
                 id=id,
@@ -250,22 +259,21 @@
         return self.idx_storage.directory_intrinsic_metadata_add(results)
 
     def translate_directory_intrinsic_metadata(
-        self, detected_files: Dict[str, List[Any]], log_suffix: str
+        self, files: List[DirectoryLsEntry], log_suffix: str
     ) -> Tuple[List[Any], Any]:
         """
         Determine plan of action to translate metadata when containing
         one or multiple detected files:
 
         Args:
-            detected_files: dictionary mapping context names (e.g.,
-              "npm", "authors") to list of sha1
+            files: list of file entries, as returned by
+              :meth:`swh.storage.interface.StorageInterface.directory_ls`
 
         Returns:
             (List[str], dict): list of mappings used and dict with
             translated metadata according to the CodeMeta vocabulary
 
         """
-        used_mappings = [MAPPINGS[context].name for context in detected_files]
         metadata = []
         tool = {
             "name": "swh-metadata-translator",
@@ -277,15 +285,15 @@
         # -> translate each content
         config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]}
         config["tools"] = [tool]
-        for context in detected_files.keys():
+        all_detected_files = detect_metadata(files)
+        used_mappings = [MAPPINGS[context].name for context in all_detected_files]
+        for (mapping_name, detected_files) in all_detected_files.items():
             cfg = deepcopy(config)
-            cfg["tools"][0]["configuration"]["context"] = context
+            cfg["tools"][0]["configuration"]["context"] = mapping_name
             c_metadata_indexer = ContentMetadataIndexer(config=cfg)
             # sha1s that are in content_metadata table
             sha1s_in_storage = []
-            metadata_generator = self.idx_storage.content_metadata_get(
-                detected_files[context]
-            )
+            metadata_generator = self.idx_storage.content_metadata_get(detected_files)
             for c in metadata_generator:
                 # extracting metadata
                 sha1 = c.id
@@ -296,7 +304,7 @@
                 metadata.append(local_metadata)
 
             sha1s_filtered = [
-                item for item in detected_files[context] if item not in sha1s_in_storage
+                item for item in detected_files if item not in sha1s_in_storage
             ]
 
             if sha1s_filtered:
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py
--- a/swh/indexer/metadata_detector.py
+++ b/swh/indexer/metadata_detector.py
@@ -1,12 +1,16 @@
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from typing import Dict, List
+
 from swh.indexer.metadata_dictionary import MAPPINGS
+from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
+from swh.indexer.storage.interface import Sha1
 
 
-def detect_metadata(files):
+def detect_metadata(files: List[DirectoryLsEntry]) -> Dict[str, List[Sha1]]:
     """
     Detects files potentially containing metadata
 
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -7,7 +7,17 @@
 import logging
 from typing import Any, Dict, List, Optional
 
+from typing_extensions import TypedDict
+
 from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values
+from swh.indexer.storage.interface import Sha1
+
+
+class DirectoryLsEntry(TypedDict):
+    target: Sha1
+    sha1: Sha1
+    name: bytes
+    type: str
 
 
 class BaseMapping:
@@ -32,15 +42,9 @@
         raise NotImplementedError(f"{self.__class__.__name__}.name")
 
     @classmethod
-    def detect_metadata_files(cls, files: List[Dict[str, str]]) -> List[str]:
+    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
         """
-        Detects files potentially containing metadata
-
-        Args:
-            file_entries (list): list of files
-
-        Returns:
-            list: list of sha1 (possibly empty)
+        Returns the sha1 hashes of files which can be translated by this mapping
         """
         raise NotImplementedError(f"{cls.__name__}.detect_metadata_files")
 
@@ -60,7 +64,7 @@
         raise NotImplementedError(f"{self.__class__.__name__}.filename")
 
     @classmethod
-    def detect_metadata_files(cls, file_entries: List[Dict[str, str]]) -> List[str]:
+    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
         for entry in file_entries:
             if entry["name"].lower() == cls.filename:
                 return [entry["sha1"]]
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
--- a/swh/indexer/metadata_dictionary/ruby.py
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -6,8 +6,11 @@
 import ast
 import itertools
 import re
+from typing import List
 
 from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
+from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
+from swh.indexer.storage.interface import Sha1
 
 from .base import DictMapping
 
@@ -28,7 +31,7 @@
     _re_spec_entry = re.compile(r"\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)")
 
     @classmethod
-    def detect_metadata_files(cls, file_entries):
+    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
         for entry in file_entries:
             if entry["name"].endswith(b".gemspec"):
                 return [entry["sha1"]]