Page MenuHomeSoftware Heritage

D8072.id29169.diff
No OneTemporary

D8072.id29169.diff

diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst
--- a/docs/metadata-workflow.rst
+++ b/docs/metadata-workflow.rst
@@ -134,10 +134,10 @@
.. code-block:: python
- from .base import DictMapping, SingleFileMapping
+ from .base import DictMapping, SingleFileIntrinsicMapping
from swh.indexer.codemeta import CROSSWALK_TABLE
- class MyMapping(DictMapping, SingleFileMapping):
+ class MyMapping(DictMapping, SingleFileIntrinsicMapping):
"""Dedicated class for ..."""
name = 'my-mapping'
filename = b'the-filename'
@@ -145,6 +145,8 @@
.. _CodeMeta crosswalks: https://github.com/codemeta/codemeta/tree/master/crosswalks
+Once the class is defined, add it to :const:`swh.indexer.metadata_dictionary.INTRINSIC_MAPPINGS`.
+
Then, add a ``string_fields`` attribute, that is the list of all keys whose
values are simple text values. For instance, to
`translate Python PKG-INFO`_, it's:
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -31,7 +31,7 @@
OriginIndexer,
)
from swh.indexer.metadata_detector import detect_metadata
-from swh.indexer.metadata_dictionary import MAPPINGS
+from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
from swh.indexer.origin_head import get_head_swhid
from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
@@ -116,7 +116,7 @@
metadata_items = []
mappings = []
- for (mapping_name, mapping) in MAPPINGS.items():
+ for (mapping_name, mapping) in EXTRINSIC_MAPPINGS.items():
if data.format in mapping.extrinsic_metadata_formats():
metadata_item = mapping().translate(data.metadata)
if metadata_item is not None:
@@ -210,7 +210,7 @@
try:
mapping_name = self.tool["tool_configuration"]["context"]
log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id)
- metadata = MAPPINGS[mapping_name](log_suffix).translate(data)
+ metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data)
except Exception:
self.log.exception(
"Problem during metadata translation "
@@ -364,7 +364,9 @@
config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]}
config["tools"] = [tool]
all_detected_files = detect_metadata(files)
- used_mappings = [MAPPINGS[context].name for context in all_detected_files]
+ used_mappings = [
+ INTRINSIC_MAPPINGS[context].name for context in all_detected_files
+ ]
for (mapping_name, detected_files) in all_detected_files.items():
cfg = deepcopy(config)
cfg["tools"][0]["configuration"]["context"] = mapping_name
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py
--- a/swh/indexer/metadata_detector.py
+++ b/swh/indexer/metadata_detector.py
@@ -5,7 +5,7 @@
from typing import Dict, List
-from swh.indexer.metadata_dictionary import MAPPINGS
+from swh.indexer.metadata_dictionary import INTRINSIC_MAPPINGS
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
from swh.indexer.storage.interface import Sha1
@@ -21,7 +21,7 @@
dict: {mapping_filenames[name]:f['sha1']} (may be empty)
"""
results = {}
- for (mapping_name, mapping) in MAPPINGS.items():
+ for (mapping_name, mapping) in INTRINSIC_MAPPINGS.items():
matches = mapping.detect_metadata_files(files)
if matches:
results[mapping_name] = matches
diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py
--- a/swh/indexer/metadata_dictionary/__init__.py
+++ b/swh/indexer/metadata_dictionary/__init__.py
@@ -4,22 +4,30 @@
# See top-level LICENSE file for more information
import collections
+from typing import Dict, Type
import click
from . import cff, codemeta, composer, github, maven, npm, python, ruby
+from .base import BaseExtrinsicMapping, BaseIntrinsicMapping, BaseMapping
-MAPPINGS = {
+INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = {
"CffMapping": cff.CffMapping,
"CodemetaMapping": codemeta.CodemetaMapping,
"GemspecMapping": ruby.GemspecMapping,
- "GitHubMapping": github.GitHubMapping,
"MavenMapping": maven.MavenMapping,
"NpmMapping": npm.NpmMapping,
"PythonPkginfoMapping": python.PythonPkginfoMapping,
"ComposerMapping": composer.ComposerMapping,
}
+EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = {
+ "GitHubMapping": github.GitHubMapping,
+}
+
+
+MAPPINGS: Dict[str, Type[BaseMapping]] = {**INTRINSIC_MAPPINGS, **EXTRINSIC_MAPPINGS}
+
def list_terms():
"""Returns a dictionary with all supported CodeMeta terms as keys,
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -41,13 +41,8 @@
class BaseMapping:
- """Base class for mappings to inherit from
-
- To implement a new mapping:
-
- - inherit this class
- - override translate function
- """
+ """Base class for :class:`BaseExtrinsicMapping` and :class:`BaseIntrinsicMapping`,
+ not to be inherited directly."""
def __init__(self, log_suffix=""):
self.log_suffix = log_suffix
@@ -61,12 +56,23 @@
indexer storage."""
raise NotImplementedError(f"{self.__class__.__name__}.name")
- @classmethod
- def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
- """
- Returns the sha1 hashes of files which can be translated by this mapping
- """
- raise NotImplementedError(f"{cls.__name__}.detect_metadata_files")
+ def translate(self, file_content: bytes) -> Optional[Dict]:
+ """Translates metadata, from the content of a file or of a RawExtrinsicMetadata
+ object."""
+ raise NotImplementedError(f"{self.__class__.__name__}.translate")
+
+ def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
+ return compact(metadata)
+
+
+class BaseExtrinsicMapping(BaseMapping):
+ """Base class for extrinsic-metadata mappings to inherit from
+
+ To implement a new mapping:
+
+ - inherit this class
+ - override the translate and extrinsic_metadata_formats functions
+ """
@classmethod
def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
@@ -76,15 +82,25 @@
"""
raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats")
- def translate(self, file_content: bytes) -> Optional[Dict]:
- """Translates intrinsic metadata, from the content of a file."""
- raise NotImplementedError(f"{self.__class__.__name__}.translate")
- def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
- return compact(metadata)
+class BaseIntrinsicMapping(BaseMapping):
+ """Base class for intrinsic-metadata mappings to inherit from
+
+ To implement a new mapping:
+
+ - inherit this class
+ - override the translate and detect_metadata_files functions
+ """
+
+ @classmethod
+ def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
+ """
+ Returns the sha1 hashes of files which can be translated by this mapping
+ """
+ raise NotImplementedError(f"{cls.__name__}.detect_metadata_files")
-class SingleFileMapping(BaseMapping):
+class SingleFileIntrinsicMapping(BaseIntrinsicMapping):
"""Base class for all intrinsic metadata mappings that use a single file as input."""
@property
@@ -99,11 +115,6 @@
return [entry["sha1"]]
return []
- @classmethod
- def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
- # this class is only used by intrinsic metadata mappings
- return ()
-
class DictMapping(BaseMapping):
"""Base class for mappings that take as input a file that is mostly
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -4,7 +4,7 @@
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
-from .base import DictMapping, SingleFileMapping
+from .base import DictMapping, SingleFileIntrinsicMapping
class SafeLoader(yaml.SafeLoader):
@@ -14,7 +14,7 @@
}
-class CffMapping(DictMapping, SingleFileMapping):
+class CffMapping(DictMapping, SingleFileIntrinsicMapping):
"""Dedicated class for Citation (CITATION.cff) mapping and translation"""
name = "cff"
diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py
--- a/swh/indexer/metadata_dictionary/codemeta.py
+++ b/swh/indexer/metadata_dictionary/codemeta.py
@@ -8,10 +8,10 @@
from swh.indexer.codemeta import CODEMETA_TERMS, expand
-from .base import SingleFileMapping
+from .base import SingleFileIntrinsicMapping
-class CodemetaMapping(SingleFileMapping):
+class CodemetaMapping(SingleFileIntrinsicMapping):
"""
dedicated class for CodeMeta (codemeta.json) mapping and translation
"""
diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py
--- a/swh/indexer/metadata_dictionary/composer.py
+++ b/swh/indexer/metadata_dictionary/composer.py
@@ -7,7 +7,7 @@
from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable
-from .base import JsonMapping, SingleFileMapping
+from .base import JsonMapping, SingleFileIntrinsicMapping
COMPOSER_TABLE_PATH = os.path.join(_DATA_DIR, "composer.csv")
@@ -15,7 +15,7 @@
(CODEMETA_TERMS, COMPOSER_TABLE) = _read_crosstable(fd)
-class ComposerMapping(JsonMapping, SingleFileMapping):
+class ComposerMapping(JsonMapping, SingleFileIntrinsicMapping):
"""Dedicated class for Packagist(composer.json) mapping and translation"""
name = "composer"
diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
--- a/swh/indexer/metadata_dictionary/github.py
+++ b/swh/indexer/metadata_dictionary/github.py
@@ -8,14 +8,14 @@
from swh.indexer.codemeta import ACTIVITYSTREAMS_URI, CROSSWALK_TABLE, FORGEFED_URI
from swh.indexer.storage.interface import Sha1
-from .base import DirectoryLsEntry, JsonMapping, produce_terms
+from .base import BaseExtrinsicMapping, DirectoryLsEntry, JsonMapping, produce_terms
def _prettyprint(d):
print(json.dumps(d, indent=4))
-class GitHubMapping(JsonMapping):
+class GitHubMapping(BaseExtrinsicMapping, JsonMapping):
name = "github"
mapping = CROSSWALK_TABLE["GitHub"]
string_fields = [
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -11,10 +11,10 @@
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
-from .base import DictMapping, SingleFileMapping
+from .base import DictMapping, SingleFileIntrinsicMapping
-class MavenMapping(DictMapping, SingleFileMapping):
+class MavenMapping(DictMapping, SingleFileIntrinsicMapping):
"""
dedicated class for Maven (pom.xml) mapping and translation
"""
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -7,10 +7,10 @@
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
-from .base import JsonMapping, SingleFileMapping
+from .base import JsonMapping, SingleFileIntrinsicMapping
-class NpmMapping(JsonMapping, SingleFileMapping):
+class NpmMapping(JsonMapping, SingleFileIntrinsicMapping):
"""
dedicated class for NPM (package.json) mapping and translation
"""
diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py
--- a/swh/indexer/metadata_dictionary/python.py
+++ b/swh/indexer/metadata_dictionary/python.py
@@ -9,7 +9,7 @@
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
-from .base import DictMapping, SingleFileMapping
+from .base import DictMapping, SingleFileIntrinsicMapping
_normalize_pkginfo_key = str.lower
@@ -22,7 +22,7 @@
return self.header_factory(name, value)
-class PythonPkginfoMapping(DictMapping, SingleFileMapping):
+class PythonPkginfoMapping(DictMapping, SingleFileIntrinsicMapping):
"""Dedicated class for Python's PKG-INFO mapping and translation.
https://www.python.org/dev/peps/pep-0314/"""
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
--- a/swh/indexer/metadata_dictionary/ruby.py
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -12,7 +12,7 @@
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
from swh.indexer.storage.interface import Sha1
-from .base import DictMapping
+from .base import BaseIntrinsicMapping, DictMapping
def name_to_person(name):
@@ -22,7 +22,7 @@
}
-class GemspecMapping(DictMapping):
+class GemspecMapping(BaseIntrinsicMapping, DictMapping):
name = "gemspec"
mapping = CROSSWALK_TABLE["Ruby Gem"]
string_fields = ["name", "version", "description", "summary", "email"]

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 1:59 PM (11 w, 16 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216140

Event Timeline