Changeset View
Standalone View
swh/indexer/metadata_dictionary/base.py
# Copyright (C) 2017-2019 The Software Heritage developers | # Copyright (C) 2017-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
import logging | import logging | ||||
from typing import List | from typing import Any, Dict, List, Set | ||||
from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values | from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values | ||||
vlorentz: should be in CamelCase, see https://www.python.org/dev/peps/pep-0008/#class-names
And should… | |||||
class BaseMapping: | class BaseMapping: | ||||
"""Base class for mappings to inherit from | """Base class for mappings to inherit from | ||||
To implement a new mapping: | To implement a new mapping: | ||||
- inherit this class | - inherit this class | ||||
- override translate function | - override translate function | ||||
""" | """ | ||||
def __init__(self, log_suffix=""): | def __init__(self, log_suffix: str = ""): | ||||
self.log_suffix = log_suffix | self.log_suffix = log_suffix | ||||
self.log = logging.getLogger( | self.log = logging.getLogger( | ||||
"%s.%s" % (self.__class__.__module__, self.__class__.__name__) | "%s.%s" % (self.__class__.__module__, self.__class__.__name__) | ||||
) | ) | ||||
@property | @property | ||||
def name(self): | def name(self): | ||||
"""A name of this mapping, used as an identifier in the | """A name of this mapping, used as an identifier in the | ||||
indexer storage.""" | indexer storage.""" | ||||
raise NotImplementedError(f"{self.__class__.__name__}.name") | raise NotImplementedError(f"{self.__class__.__name__}.name") | ||||
@classmethod | @classmethod | ||||
def detect_metadata_files(cls, files): | def detect_metadata_files(cls, files: List) -> List[str]: | ||||
vlorentzAuthorUnsubmitted Not Done Inline Actionsfull annotation, please vlorentz: full annotation, please | |||||
""" | """ | ||||
Detects files potentially containing metadata | Detects files potentially containing metadata | ||||
Args: | Args: | ||||
file_entries (list): list of files | file_entries (list): list of files | ||||
Returns: | Returns: | ||||
list: list of sha1 (possibly empty) | list: list of sha1 (possibly empty) | ||||
""" | """ | ||||
raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") | raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") | ||||
def translate(self, file_content): | def translate(self, file_content: bytes) -> Any: | ||||
raise NotImplementedError(f"{self.__class__.__name__}.translate") | raise NotImplementedError(f"{self.__class__.__name__}.translate") | ||||
def normalize_translation(self, metadata): | def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: | ||||
return compact(metadata) | return compact(metadata) | ||||
class SingleFileMapping(BaseMapping): | class SingleFileMapping(BaseMapping): | ||||
"""Base class for all mappings that use a single file as input.""" | """Base class for all mappings that use a single file as input.""" | ||||
@property | @property | ||||
def filename(self): | def filename(self): | ||||
"""The .json file to extract metadata from.""" | """The .json file to extract metadata from.""" | ||||
raise NotImplementedError(f"{self.__class__.__name__}.filename") | raise NotImplementedError(f"{self.__class__.__name__}.filename") | ||||
@classmethod | @classmethod | ||||
def detect_metadata_files(cls, file_entries): | def detect_metadata_files(cls, file_entries: List[Dict[str, Any]]) -> List[str]: | ||||
vlorentzAuthorUnsubmitted Not Done Inline ActionsCan you do better than Any? vlorentz: Can you do better than `Any`? | |||||
rohan-sachanUnsubmitted Not Done Inline ActionsI could go deeper into annotations and put up Union[bytes, str, int], but Union will make the implementation of isinstance() fucntions complicated. rohan-sachan: I could go deeper into annotations and put up Union[bytes, str, int], but Union will make the… | |||||
vlorentzAuthorUnsubmitted Not Done Inline ActionsLook into TypedDict vlorentz: Look into `TypedDict` | |||||
for entry in file_entries: | for entry in file_entries: | ||||
if entry["name"].lower() == cls.filename.lower(): | if entry["name"].lower() == cls.filename.lower(): # type: ignore | ||||
vlorentzAuthorUnsubmitted Not Done Inline Actionsand why this one? vlorentz: and why this one? | |||||
rohan-sachanUnsubmitted Not Done Inline ActionsGetting a ' ... no attribute "lower()" ' error with mypy without using #type: ignore. Will remove it in next revision by using isinstance() check instead. rohan-sachan: Getting a ' ... no attribute "lower()" ' error with mypy without using #type: ignore. Will… | |||||
vlorentzAuthorUnsubmitted Not Done Inline ActionsNo, don't use isinstance. They are supposed to be bytes. vlorentz: No, don't use isinstance. They are supposed to be bytes. | |||||
return [entry["sha1"]] | return [entry["sha1"]] | ||||
return [] | return [] | ||||
class DictMapping(BaseMapping): | class DictMapping(BaseMapping): | ||||
"""Base class for mappings that take as input a file that is mostly | """Base class for mappings that take as input a file that is mostly | ||||
a key-value store (eg. a shallow JSON dict).""" | a key-value store (eg. a shallow JSON dict).""" | ||||
string_fields = [] # type: List[str] | string_fields: List[str] = [] | ||||
"""List of fields that are simple strings, and don't need any | """List of fields that are simple strings, and don't need any | ||||
normalization.""" | normalization.""" | ||||
@property | @property | ||||
def mapping(self): | def mapping(self): | ||||
"""A translation dict to map dict keys into a canonical name.""" | """A translation dict to map dict keys into a canonical name.""" | ||||
raise NotImplementedError(f"{self.__class__.__name__}.mapping") | raise NotImplementedError(f"{self.__class__.__name__}.mapping") | ||||
@staticmethod | @staticmethod | ||||
def _normalize_method_name(name): | def _normalize_method_name(name: str): | ||||
return name.replace("-", "_") | return name.replace("-", "_") | ||||
@classmethod | @classmethod | ||||
def supported_terms(cls): | def supported_terms(cls) -> Set[str]: | ||||
return { | return { | ||||
term | term | ||||
for (key, term) in cls.mapping.items() | for (key, term) in cls.mapping.items() # type: ignore | ||||
vlorentzAuthorUnsubmitted Not Done Inline Actionsditto vlorentz: ditto | |||||
rohan-sachanUnsubmitted Not Done Inline ActionsGetting a ' ... no attribute "items()" ' error with mypy without using #type: ignore. Will remove it in next revision by using isinstance() check instead. rohan-sachan: Getting a ' ... no attribute "items()" ' error with mypy without using #type: ignore. Will… | |||||
vlorentzAuthorUnsubmitted Not Done Inline ActionsThis is not an improvement over type: ignore. With type: ignore, you disable type checking with no runtime change. When adding if isinstance(cls.mapping, Dict):, you also go around type checking, and add a runtime behavior that is not tested -- and also that should never happen. (and it's also not tested) To phrase it differently: we use mypy to make sure we never get an expected type, but instead your changes replace type error exceptions with silent fallbacks. vlorentz: This is not an improvement over `type: ignore`.
With `type: ignore`, you disable type checking… | |||||
if key in cls.string_fields | if key in cls.string_fields | ||||
or hasattr(cls, "translate_" + cls._normalize_method_name(key)) | or hasattr(cls, "translate_" + cls._normalize_method_name(key)) | ||||
or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) | or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) | ||||
} | } | ||||
def _translate_dict(self, content_dict, *, normalize=True): | def _translate_dict(self, content_dict: Dict, *, normalize=True) -> Dict: | ||||
vlorentzAuthorUnsubmitted Not Done Inline ActionsFull annotation if possible vlorentz: Full annotation if possible | |||||
""" | """ | ||||
Translates content by parsing content from a dict object | Translates content by parsing content from a dict object | ||||
and translating with the appropriate mapping | and translating with the appropriate mapping | ||||
Args: | Args: | ||||
content_dict (dict): content dict to translate | content_dict (dict): content dict to translate | ||||
Returns: | Returns: | ||||
Show All 39 Lines | def _translate_dict(self, content_dict: Dict, *, normalize=True) -> Dict: | ||||
return self.normalize_translation(translated_metadata) | return self.normalize_translation(translated_metadata) | ||||
else: | else: | ||||
return translated_metadata | return translated_metadata | ||||
class JsonMapping(DictMapping, SingleFileMapping): | class JsonMapping(DictMapping, SingleFileMapping): | ||||
"""Base class for all mappings that use a JSON file as input.""" | """Base class for all mappings that use a JSON file as input.""" | ||||
def translate(self, raw_content): | def translate(self, raw_content: bytes) -> Any: | ||||
""" | """ | ||||
Translates content by parsing content from a bytestring containing | Translates content by parsing content from a bytestring containing | ||||
json data and translating with the appropriate mapping | json data and translating with the appropriate mapping | ||||
Args: | Args: | ||||
raw_content (bytes): raw content to translate | raw_content (bytes): raw content to translate | ||||
Returns: | Returns: | ||||
dict: translated metadata in json-friendly form needed for | dict: translated metadata in json-friendly form needed for | ||||
the indexer | the indexer | ||||
""" | """ | ||||
try: | try: | ||||
raw_content = raw_content.decode() | raw_content = raw_content.decode() # type: ignore | ||||
except UnicodeDecodeError: | except UnicodeDecodeError: | ||||
self.log.warning("Error unidecoding from %s", self.log_suffix) | self.log.warning("Error unidecoding from %s", self.log_suffix) | ||||
return | return | ||||
try: | try: | ||||
content_dict = json.loads(raw_content) | content_dict = json.loads(raw_content) | ||||
except json.JSONDecodeError: | except json.JSONDecodeError: | ||||
self.log.warning("Error unjsoning from %s", self.log_suffix) | self.log.warning("Error unjsoning from %s", self.log_suffix) | ||||
return | return | ||||
if isinstance(content_dict, dict): | if isinstance(content_dict, dict): | ||||
return self._translate_dict(content_dict) | return self._translate_dict(content_dict) |
should be in CamelCase, see https://www.python.org/dev/peps/pep-0008/#class-names
And should be singular, because it represents a single entry.