Changeset View
Standalone View
swh/indexer/metadata_dictionary/base.py
# Copyright (C) 2017-2019 The Software Heritage developers | # Copyright (C) 2017-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
import logging | import logging | ||||
from typing import Any, Dict, List, Set | from typing import Any, Dict, List, Optional, Set | ||||
from typing_extensions import TypedDict | |||||
from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values | from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values | ||||
class File_entries(TypedDict): | |||||
vlorentz: should be in CamelCase, see https://www.python.org/dev/peps/pep-0008/#class-names
And should… | |||||
name: bytes | |||||
type: str | |||||
dir_id: bytes | |||||
sha1_git: Optional[bytes] | |||||
target: Optional[bytes] | |||||
length: Optional[int] | |||||
status: Optional[str] | |||||
perms: Optional[int] | |||||
sha1: bytes | |||||
sha256: Optional[bytes] | |||||
class BaseMapping: | class BaseMapping: | ||||
"""Base class for mappings to inherit from | """Base class for mappings to inherit from | ||||
To implement a new mapping: | To implement a new mapping: | ||||
- inherit this class | - inherit this class | ||||
- override translate function | - override translate function | ||||
""" | """ | ||||
def __init__(self, log_suffix: str = ""): | def __init__(self, log_suffix: str = ""): | ||||
self.log_suffix = log_suffix | self.log_suffix = log_suffix | ||||
self.log = logging.getLogger( | self.log = logging.getLogger( | ||||
"%s.%s" % (self.__class__.__module__, self.__class__.__name__) | "%s.%s" % (self.__class__.__module__, self.__class__.__name__) | ||||
) | ) | ||||
@property | @property | ||||
def name(self): | def name(self): | ||||
"""A name of this mapping, used as an identifier in the | """A name of this mapping, used as an identifier in the | ||||
indexer storage.""" | indexer storage.""" | ||||
raise NotImplementedError(f"{self.__class__.__name__}.name") | raise NotImplementedError(f"{self.__class__.__name__}.name") | ||||
@classmethod | @classmethod | ||||
def detect_metadata_files(cls, files: List) -> List[str]: | def detect_metadata_files(cls, files: List[File_entries]) -> List[bytes]: | ||||
Not Done Inline Actionsfull annotation, please vlorentz: full annotation, please | |||||
""" | """ | ||||
Detects files potentially containing metadata | Detects files potentially containing metadata | ||||
Args: | Args: | ||||
file_entries (list): list of files | file_entries (list): list of files | ||||
Returns: | Returns: | ||||
list: list of sha1 (possibly empty) | list: list of sha1 (possibly empty) | ||||
""" | """ | ||||
raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") | raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") | ||||
def translate(self, file_content: bytes) -> Any: | def translate(self, file_content: bytes) -> Optional[Dict[str, Any]]: | ||||
raise NotImplementedError(f"{self.__class__.__name__}.translate") | raise NotImplementedError(f"{self.__class__.__name__}.translate") | ||||
def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: | def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: | ||||
return compact(metadata) | return compact(metadata) | ||||
class SingleFileMapping(BaseMapping): | class SingleFileMapping(BaseMapping): | ||||
"""Base class for all mappings that use a single file as input.""" | """Base class for all mappings that use a single file as input.""" | ||||
@property | @property | ||||
def filename(self): | def filename(self): | ||||
"""The .json file to extract metadata from.""" | """The .json file to extract metadata from.""" | ||||
raise NotImplementedError(f"{self.__class__.__name__}.filename") | raise NotImplementedError(f"{self.__class__.__name__}.filename") | ||||
@classmethod | @classmethod | ||||
def detect_metadata_files(cls, file_entries: List[Dict[str, Any]]) -> List[str]: | def detect_metadata_files(cls, file_entries: List[File_entries]) -> List[bytes]: | ||||
Not Done Inline ActionsCan you do better than Any? vlorentz: Can you do better than `Any`? | |||||
Not Done Inline ActionsI could go deeper into annotations and put up Union[bytes, str, int], but Union will make the implementation of isinstance() fucntions complicated. rohan-sachan: I could go deeper into annotations and put up Union[bytes, str, int], but Union will make the… | |||||
Not Done Inline ActionsLook into TypedDict vlorentz: Look into `TypedDict` | |||||
for entry in file_entries: | for entry in file_entries: | ||||
if entry["name"].lower() == cls.filename.lower(): # type: ignore | if isinstance(entry["name"], bytes) and isinstance(cls.filename, bytes): | ||||
Not Done Inline Actionsand why this one? vlorentz: and why this one? | |||||
Not Done Inline ActionsGetting a ' ... no attribute "lower()" ' error with mypy without using #type: ignore. Will remove it in next revision by using isinstance() check instead. rohan-sachan: Getting a ' ... no attribute "lower()" ' error with mypy without using #type: ignore. Will… | |||||
Not Done Inline ActionsNo, don't use isinstance. They are supposed to be bytes. vlorentz: No, don't use isinstance. They are supposed to be bytes. | |||||
if entry["name"].lower() == cls.filename.lower(): | |||||
return [entry["sha1"]] | return [entry["sha1"]] | ||||
return [] | return [] | ||||
class DictMapping(BaseMapping): | class DictMapping(BaseMapping): | ||||
"""Base class for mappings that take as input a file that is mostly | """Base class for mappings that take as input a file that is mostly | ||||
a key-value store (eg. a shallow JSON dict).""" | a key-value store (eg. a shallow JSON dict).""" | ||||
string_fields: List[str] = [] | string_fields: List[str] = [] | ||||
"""List of fields that are simple strings, and don't need any | """List of fields that are simple strings, and don't need any | ||||
normalization.""" | normalization.""" | ||||
@property | @property | ||||
def mapping(self): | def mapping(self): | ||||
"""A translation dict to map dict keys into a canonical name.""" | """A translation dict to map dict keys into a canonical name.""" | ||||
raise NotImplementedError(f"{self.__class__.__name__}.mapping") | raise NotImplementedError(f"{self.__class__.__name__}.mapping") | ||||
@staticmethod | @staticmethod | ||||
def _normalize_method_name(name: str): | def _normalize_method_name(name: str) -> str: | ||||
return name.replace("-", "_") | return name.replace("-", "_") | ||||
@classmethod | @classmethod | ||||
def supported_terms(cls) -> Set[str]: | def supported_terms(cls) -> Set[str]: | ||||
if isinstance(cls.mapping, Dict): | |||||
Not Done Inline Actionsditto vlorentz: ditto | |||||
Not Done Inline ActionsGetting a ' ... no attribute "items()" ' error with mypy without using #type: ignore. Will remove it in next revision by using isinstance() check instead. rohan-sachan: Getting a ' ... no attribute "items()" ' error with mypy without using #type: ignore. Will… | |||||
Not Done Inline ActionsThis is not an improvement over type: ignore. With type: ignore, you disable type checking with no runtime change. When adding if isinstance(cls.mapping, Dict):, you also go around type checking, and add a runtime behavior that is not tested -- and also that should never happen. (and it's also not tested) To phrase it differently: we use mypy to make sure we never get an expected type, but instead your changes replace type error exceptions with silent fallbacks. vlorentz: This is not an improvement over `type: ignore`.
With `type: ignore`, you disable type checking… | |||||
return { | return { | ||||
term | term | ||||
for (key, term) in cls.mapping.items() # type: ignore | for (key, term) in cls.mapping.items() | ||||
if key in cls.string_fields | if key in cls.string_fields | ||||
or hasattr(cls, "translate_" + cls._normalize_method_name(key)) | or hasattr(cls, "translate_" + cls._normalize_method_name(key)) | ||||
or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) | or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) | ||||
} | } | ||||
return set() | |||||
def _translate_dict(self, content_dict: Dict, *, normalize=True) -> Dict: | def _translate_dict( | ||||
self, content_dict: Dict[str, Any], *, normalize=True | |||||
) -> Dict[str, Any]: | |||||
Not Done Inline ActionsFull annotation if possible vlorentz: Full annotation if possible | |||||
""" | """ | ||||
Translates content by parsing content from a dict object | Translates content by parsing content from a dict object | ||||
and translating with the appropriate mapping | and translating with the appropriate mapping | ||||
Args: | Args: | ||||
content_dict (dict): content dict to translate | content_dict (dict): content dict to translate | ||||
Returns: | Returns: | ||||
Show All 39 Lines | ) -> Dict[str, Any]: | ||||
return self.normalize_translation(translated_metadata) | return self.normalize_translation(translated_metadata) | ||||
else: | else: | ||||
return translated_metadata | return translated_metadata | ||||
class JsonMapping(DictMapping, SingleFileMapping): | class JsonMapping(DictMapping, SingleFileMapping): | ||||
"""Base class for all mappings that use a JSON file as input.""" | """Base class for all mappings that use a JSON file as input.""" | ||||
def translate(self, raw_content: bytes) -> Any: | def translate(self, raw_content_bytes: bytes) -> Optional[Dict[str, Any]]: | ||||
""" | """ | ||||
Translates content by parsing content from a bytestring containing | Translates content by parsing content from a bytestring containing | ||||
json data and translating with the appropriate mapping | json data and translating with the appropriate mapping | ||||
Args: | Args: | ||||
raw_content (bytes): raw content to translate | raw_content (bytes): raw content to translate | ||||
Returns: | Returns: | ||||
dict: translated metadata in json-friendly form needed for | dict: translated metadata in json-friendly form needed for | ||||
the indexer | the indexer | ||||
""" | """ | ||||
try: | try: | ||||
raw_content = raw_content.decode() # type: ignore | raw_content = raw_content_bytes.decode() | ||||
except UnicodeDecodeError: | except UnicodeDecodeError: | ||||
self.log.warning("Error unidecoding from %s", self.log_suffix) | self.log.warning("Error unidecoding from %s", self.log_suffix) | ||||
return | return None | ||||
try: | try: | ||||
content_dict = json.loads(raw_content) | content_dict = json.loads(raw_content) | ||||
except json.JSONDecodeError: | except json.JSONDecodeError: | ||||
self.log.warning("Error unjsoning from %s", self.log_suffix) | self.log.warning("Error unjsoning from %s", self.log_suffix) | ||||
return | return None | ||||
if isinstance(content_dict, dict): | if isinstance(content_dict, dict): | ||||
return self._translate_dict(content_dict) | return self._translate_dict(content_dict) | ||||
else: | |||||
return None |
should be in CamelCase, see https://www.python.org/dev/peps/pep-0008/#class-names
And should be singular, because it represents a single entry.