diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -134,10 +134,10 @@ .. code-block:: python - from .base import DictMapping, SingleFileMapping + from .base import DictMapping, SingleFileIntrinsicMapping from swh.indexer.codemeta import CROSSWALK_TABLE - class MyMapping(DictMapping, SingleFileMapping): + class MyMapping(DictMapping, SingleFileIntrinsicMapping): """Dedicated class for ...""" name = 'my-mapping' filename = b'the-filename' @@ -145,6 +145,8 @@ .. _CodeMeta crosswalks: https://github.com/codemeta/codemeta/tree/master/crosswalks +And reference it from :const:`swh.indexer.metadata_dictionary.INTRINSIC_MAPPINGS`. + Then, add a ``string_fields`` attribute, that is the list of all keys whose values are simple text values. For instance, to `translate Python PKG-INFO`_, it's: diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -31,7 +31,7 @@ OriginIndexer, ) from swh.indexer.metadata_detector import detect_metadata -from swh.indexer.metadata_dictionary import MAPPINGS +from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.origin_head import get_head_swhid from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 @@ -116,7 +116,7 @@ metadata_items = [] mappings = [] - for (mapping_name, mapping) in MAPPINGS.items(): + for (mapping_name, mapping) in EXTRINSIC_MAPPINGS.items(): if data.format in mapping.extrinsic_metadata_formats(): metadata_item = mapping().translate(data.metadata) if metadata_item is not None: @@ -210,7 +210,7 @@ try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) - metadata = MAPPINGS[mapping_name](log_suffix).translate(data) + metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " @@ -364,7 +364,9 @@ config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]} config["tools"] = [tool] all_detected_files = detect_metadata(files) - used_mappings = [MAPPINGS[context].name for context in all_detected_files] + used_mappings = [ + INTRINSIC_MAPPINGS[context].name for context in all_detected_files + ] for (mapping_name, detected_files) in all_detected_files.items(): cfg = deepcopy(config) cfg["tools"][0]["configuration"]["context"] = mapping_name diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -5,7 +5,7 @@ from typing import Dict, List -from swh.indexer.metadata_dictionary import MAPPINGS +from swh.indexer.metadata_dictionary import INTRINSIC_MAPPINGS from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.storage.interface import Sha1 @@ -21,7 +21,7 @@ dict: {mapping_filenames[name]:f['sha1']} (may be empty) """ results = {} - for (mapping_name, mapping) in MAPPINGS.items(): + for (mapping_name, mapping) in INTRINSIC_MAPPINGS.items(): matches = mapping.detect_metadata_files(files) if matches: results[mapping_name] = matches diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py --- a/swh/indexer/metadata_dictionary/__init__.py +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -4,22 +4,30 @@ # See top-level LICENSE file for more information import collections +from typing import Dict, Type import click from . import cff, codemeta, composer, github, maven, npm, python, ruby +from .base import BaseExtrinsicMapping, BaseIntrinsicMapping, BaseMapping -MAPPINGS = { +INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = { "CffMapping": cff.CffMapping, "CodemetaMapping": codemeta.CodemetaMapping, "GemspecMapping": ruby.GemspecMapping, - "GitHubMapping": github.GitHubMapping, "MavenMapping": maven.MavenMapping, "NpmMapping": npm.NpmMapping, "PythonPkginfoMapping": python.PythonPkginfoMapping, "ComposerMapping": composer.ComposerMapping, } +EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = { + "GitHubMapping": github.GitHubMapping, +} + + +MAPPINGS: Dict[str, Type[BaseMapping]] = {**INTRINSIC_MAPPINGS, **EXTRINSIC_MAPPINGS} + def list_terms(): """Returns a dictionary with all supported CodeMeta terms as keys, diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -41,13 +41,8 @@ class BaseMapping: - """Base class for mappings to inherit from - - To implement a new mapping: - - - inherit this class - - override translate function - """ + """Base class for :class:`BaseExtrinsicMapping` and :class:`BaseIntrinsicMapping`, + not to be inherited directly.""" def __init__(self, log_suffix=""): self.log_suffix = log_suffix @@ -61,12 +56,23 @@ indexer storage.""" raise NotImplementedError(f"{self.__class__.__name__}.name") - @classmethod - def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: - """ - Returns the sha1 hashes of files which can be translated by this mapping - """ - raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") + def translate(self, file_content: bytes) -> Optional[Dict]: + """Translates metadata, from the content of a file or of a RawExtrinsicMetadata + object.""" + raise NotImplementedError(f"{self.__class__.__name__}.translate") + + def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: + return compact(metadata) + + +class BaseExtrinsicMapping(BaseMapping): + """Base class for extrinsic-metadata mappings to inherit from + + To implement a new mapping: + + - inherit this class + - override translate function + """ @classmethod def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: @@ -76,15 +82,25 @@ """ raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats") - def translate(self, file_content: bytes) -> Optional[Dict]: - """Translates intrinsic metadata, from the content of a file.""" - raise NotImplementedError(f"{self.__class__.__name__}.translate") - def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: - return compact(metadata) +class BaseIntrinsicMapping(BaseMapping): + """Base class for intrinsic-metadata mappings to inherit from + + To implement a new mapping: + + - inherit this class + - override translate function + """ + + @classmethod + def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: + """ + Returns the sha1 hashes of files which can be translated by this mapping + """ + raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") -class SingleFileMapping(BaseMapping): +class SingleFileIntrinsicMapping(BaseIntrinsicMapping): """Base class for all intrinsic metadata mappings that use a single file as input.""" @property @@ -99,11 +115,6 @@ return [entry["sha1"]] return [] - @classmethod - def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: - # this class is only used by intrinsic metadata mappings - return () - class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -4,7 +4,7 @@ from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI -from .base import DictMapping, SingleFileMapping +from .base import DictMapping, SingleFileIntrinsicMapping class SafeLoader(yaml.SafeLoader): @@ -14,7 +14,7 @@ } -class CffMapping(DictMapping, SingleFileMapping): +class CffMapping(DictMapping, SingleFileIntrinsicMapping): """Dedicated class for Citation (CITATION.cff) mapping and translation""" name = "cff" diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -8,10 +8,10 @@ from swh.indexer.codemeta import CODEMETA_TERMS, expand -from .base import SingleFileMapping +from .base import SingleFileIntrinsicMapping -class CodemetaMapping(SingleFileMapping): +class CodemetaMapping(SingleFileIntrinsicMapping): """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py --- a/swh/indexer/metadata_dictionary/composer.py +++ b/swh/indexer/metadata_dictionary/composer.py @@ -7,7 +7,7 @@ from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable -from .base import JsonMapping, SingleFileMapping +from .base import JsonMapping, SingleFileIntrinsicMapping COMPOSER_TABLE_PATH = os.path.join(_DATA_DIR, "composer.csv") @@ -15,7 +15,7 @@ (CODEMETA_TERMS, COMPOSER_TABLE) = _read_crosstable(fd) -class ComposerMapping(JsonMapping, SingleFileMapping): +class ComposerMapping(JsonMapping, SingleFileIntrinsicMapping): """Dedicated class for Packagist(composer.json) mapping and translation""" name = "composer" diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ -2,20 +2,20 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import json -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, Tuple from swh.indexer.codemeta import ACTIVITYSTREAMS_URI, CROSSWALK_TABLE, FORGEFED_URI -from swh.indexer.storage.interface import Sha1 -from .base import DirectoryLsEntry, JsonMapping, produce_terms +from .base import BaseExtrinsicMapping, JsonMapping, produce_terms def _prettyprint(d): print(json.dumps(d, indent=4)) -class GitHubMapping(JsonMapping): +class GitHubMapping(BaseExtrinsicMapping, JsonMapping): name = "github" mapping = CROSSWALK_TABLE["GitHub"] string_fields = [ @@ -28,10 +28,6 @@ "issues_url", ] - @classmethod - def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: - return [] - @classmethod def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: return ("application/vnd.github.v3+json",) diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -11,10 +11,10 @@ from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI -from .base import DictMapping, SingleFileMapping +from .base import DictMapping, SingleFileIntrinsicMapping -class MavenMapping(DictMapping, SingleFileMapping): +class MavenMapping(DictMapping, SingleFileIntrinsicMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -7,10 +7,10 @@ from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI -from .base import JsonMapping, SingleFileMapping +from .base import JsonMapping, SingleFileIntrinsicMapping -class NpmMapping(JsonMapping, SingleFileMapping): +class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): """ dedicated class for NPM (package.json) mapping and translation """ diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py --- a/swh/indexer/metadata_dictionary/python.py +++ b/swh/indexer/metadata_dictionary/python.py @@ -9,7 +9,7 @@ from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI -from .base import DictMapping, SingleFileMapping +from .base import DictMapping, SingleFileIntrinsicMapping _normalize_pkginfo_key = str.lower @@ -22,7 +22,7 @@ return self.header_factory(name, value) -class PythonPkginfoMapping(DictMapping, SingleFileMapping): +class PythonPkginfoMapping(DictMapping, SingleFileIntrinsicMapping): """Dedicated class for Python's PKG-INFO mapping and translation. https://www.python.org/dev/peps/pep-0314/""" diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py --- a/swh/indexer/metadata_dictionary/ruby.py +++ b/swh/indexer/metadata_dictionary/ruby.py @@ -6,13 +6,13 @@ import ast import itertools import re -from typing import List, Tuple +from typing import List from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.storage.interface import Sha1 -from .base import DictMapping +from .base import BaseIntrinsicMapping, DictMapping def name_to_person(name): @@ -22,7 +22,7 @@ } -class GemspecMapping(DictMapping): +class GemspecMapping(BaseIntrinsicMapping, DictMapping): name = "gemspec" mapping = CROSSWALK_TABLE["Ruby Gem"] string_fields = ["name", "version", "description", "summary", "email"] @@ -37,11 +37,6 @@ return [entry["sha1"]] return [] - @classmethod - def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: - # this class is only used by intrinsic metadata mappings - return () - def translate(self, raw_content): try: raw_content = raw_content.decode()