diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -6,8 +6,10 @@
 import json
 import logging
 from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
+import xml.parsers.expat
 
 from typing_extensions import TypedDict
+import xmltodict
 import yaml
 
 from swh.indexer.codemeta import compact, merge_values
@@ -159,7 +161,7 @@
 
         return simple_terms | complex_terms
 
-    def _translate_dict(self, content_dict: Dict) -> Dict[str, str]:
+    def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]:
         """
         Translates content by parsing content from a dict object
         and translating with the appropriate mapping
@@ -249,6 +251,42 @@
         return None
 
 
+class XmlMapping(DictMapping):
+    """Base class for all mappings that use XML data as input."""
+
+    def translate(self, raw_content: bytes) -> Optional[Dict]:
+        """
+        Translates content by parsing content from a bytestring containing
+        XML data and translating with the appropriate mapping
+
+        Args:
+            raw_content (bytes): raw content to translate
+
+        Returns:
+            dict: translated metadata in json-friendly form needed for
+            the indexer, or None if the content is not parseable XML
+
+        """
+        try:
+            d = xmltodict.parse(raw_content)
+        except xml.parsers.expat.ExpatError:
+            self.log.warning("Error parsing XML from %s", self.log_suffix)
+            return None
+        except UnicodeDecodeError:
+            self.log.warning("Error unidecoding XML from %s", self.log_suffix)
+            return None
+        except (LookupError, ValueError):
+            # unknown encoding or multi-byte encoding
+            self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
+            return None
+        if not isinstance(d, dict):
+            # identify the input the same way the warnings above do, instead
+            # of dumping the whole raw document into the log
+            self.log.warning("Skipping ill-formed XML content from %s", self.log_suffix)
+            return None
+        return self._translate_dict(d)
+
+
 class SafeLoader(yaml.SafeLoader):
     yaml_implicit_resolvers = {
         k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
diff --git a/swh/indexer/metadata_dictionary/maven.py 
b/swh/indexer/metadata_dictionary/maven.py
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -4,18 +4,15 @@
 # See top-level LICENSE file for more information
 
 import os
-from typing import Any, Dict, Optional
-import xml.parsers.expat
-
-import xmltodict
+from typing import Any, Dict
 
 from swh.indexer.codemeta import CROSSWALK_TABLE
 from swh.indexer.namespaces import SCHEMA
 
-from .base import DictMapping, SingleFileIntrinsicMapping
+from .base import SingleFileIntrinsicMapping, XmlMapping
 
 
-class MavenMapping(DictMapping, SingleFileIntrinsicMapping):
+class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
     """
     dedicated class for Maven (pom.xml) mapping and translation
     """
@@ -25,26 +22,29 @@
     mapping = CROSSWALK_TABLE["Java (Maven)"]
     string_fields = ["name", "version", "description", "email"]
 
-    def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
-        try:
-            d = xmltodict.parse(content).get("project") or {}
-        except xml.parsers.expat.ExpatError:
-            self.log.warning("Error parsing XML from %s", self.log_suffix)
-            return None
-        except UnicodeDecodeError:
-            self.log.warning("Error unidecoding XML from %s", self.log_suffix)
-            return None
-        except (LookupError, ValueError):
-            # unknown encoding or multi-byte encoding
-            self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
-            return None
-        if not isinstance(d, dict):
-            self.log.warning("Skipping ill-formed XML content: %s", content)
-            return None
-        return self._translate_dict(d)
-
     _default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
 
+    def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Translates the root ``project`` element of a parsed pom.xml
+
+        Args:
+            d: whole XML document, as parsed by xmltodict
+
+        Returns:
+            dict: inherited translation of the ``project`` element; a
+            missing, empty or ill-formed element is translated like an
+            empty one
+
+        """
+        project = d.get("project") or {}
+        if not isinstance(project, dict):
+            # e.g. <project>text</project> is parsed as a str, but the
+            # inherited translation is annotated as taking a dict
+            self.log.warning("Skipping ill-formed XML content from %s", self.log_suffix)
+            project = {}
+        return super()._translate_dict(project)
+
     def extra_translation(self, translated_metadata, d):
         repositories = self.parse_repositories(d)
         if repositories:
diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
--- a/swh/indexer/metadata_dictionary/nuget.py
+++ 
b/swh/indexer/metadata_dictionary/nuget.py
@@ -5,15 +5,13 @@
 
 import os.path
 import re
-from typing import Any, Dict, List, Optional
-
-import xmltodict
+from typing import Any, Dict, List
 
 from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
 from swh.indexer.namespaces import SCHEMA
 from swh.indexer.storage.interface import Sha1
 
-from .base import BaseIntrinsicMapping, DictMapping, DirectoryLsEntry
+from .base import BaseIntrinsicMapping, DirectoryLsEntry, XmlMapping
 
 NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")
 
@@ -21,7 +19,7 @@
     (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd)
 
 
-class NuGetMapping(DictMapping, BaseIntrinsicMapping):
+class NuGetMapping(XmlMapping, BaseIntrinsicMapping):
     """
     dedicated class for NuGet (.nuspec) mapping and translation
     """
@@ -50,13 +48,31 @@
                 return [entry["sha1"]]
         return []
 
-    def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
-        d = xmltodict.parse(content).get("package", {}).get("metadata", {})
-        if not isinstance(d, dict):
-            self.log.warning("Skipping ill-formed XML content: %s", content)
-            return None
-
-        return self._translate_dict(d)
+    def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Translates the ``package/metadata`` element of a parsed .nuspec
+
+        Args:
+            d: whole XML document, as parsed by xmltodict
+
+        Returns:
+            dict: inherited translation of the ``metadata`` element; a
+            missing, empty or ill-formed element is translated like an
+            empty one
+
+        """
+        metadata: Any = d
+        for tag in ("package", "metadata"):
+            # an empty element is parsed as None, so chaining
+            # .get(tag, {}) would raise AttributeError on the next lookup
+            metadata = metadata.get(tag) or {}
+            if not isinstance(metadata, dict):
+                self.log.warning(
+                    "Skipping ill-formed XML content from %s", self.log_suffix
+                )
+                metadata = {}
+                break
+        return super()._translate_dict(metadata)
 
     def normalize_projectUrl(self, s):
         if isinstance(s, str):