Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary/codemeta.py
# Copyright (C) 2018-2022 The Software Heritage developers | # Copyright (C) 2018-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import collections | import collections | ||||
import json | import json | ||||
import re | import re | ||||
from typing import Any, Dict, List, Optional, Tuple | from typing import Any, Dict, List, Optional, Tuple, Union | ||||
import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||
import xmltodict | import xmltodict | ||||
from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand | from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand | ||||
from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping | from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping | ||||
Show All 38 Lines | def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: | ||||
"sword-v2-atom-codemeta", | "sword-v2-atom-codemeta", | ||||
"sword-v2-atom-codemeta-v2", | "sword-v2-atom-codemeta-v2", | ||||
) | ) | ||||
@classmethod | @classmethod | ||||
def supported_terms(cls) -> List[str]: | def supported_terms(cls) -> List[str]: | ||||
return [term for term in CODEMETA_TERMS if not term.startswith("@")] | return [term for term in CODEMETA_TERMS if not term.startswith("@")] | ||||
def xml_to_jsonld(self, e: ET.Element) -> Dict[str, Any]: | def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: | ||||
doc: Dict[str, List[Dict[str, Any]]] = collections.defaultdict(list) | doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) | ||||
for child in e: | for child in e: | ||||
m = _TAG_RE.match(child.tag) | m = _TAG_RE.match(child.tag) | ||||
assert m, f"Tag with no namespace: {child}" | assert m, f"Tag with no namespace: {child}" | ||||
namespace = m.group("namespace") | namespace = m.group("namespace") | ||||
localname = m.group("localname") | localname = m.group("localname") | ||||
if namespace == ATOM_URI and localname in ("title", "name"): | if namespace == ATOM_URI and localname in ("title", "name"): | ||||
# Convert Atom to Codemeta name; in case codemeta:name | # Convert Atom to Codemeta name; in case codemeta:name | ||||
# is not provided or different | # is not provided or different | ||||
doc["name"].append(self.xml_to_jsonld(child)) | doc["name"].append(self.xml_to_jsonld(child)) | ||||
elif namespace == ATOM_URI and localname in ("author", "email"): | elif namespace == ATOM_URI and localname in ("author", "email"): | ||||
# ditto for these author properties (note that author name is also | # ditto for these author properties (note that author name is also | ||||
# covered by the previous test) | # covered by the previous test) | ||||
doc[localname].append(self.xml_to_jsonld(child)) | doc[localname].append(self.xml_to_jsonld(child)) | ||||
elif namespace in _IGNORED_NAMESPACES: | elif namespace in _IGNORED_NAMESPACES: | ||||
# SWORD-specific namespace that is not interesting to translate | # SWORD-specific namespace that is not interesting to translate | ||||
pass | pass | ||||
elif namespace.lower() == CODEMETA_CONTEXT_URL: | elif namespace.lower() == CODEMETA_CONTEXT_URL: | ||||
# It is a term defined by the context; write it as-is and JSON-LD | # It is a term defined by the context; write it as-is and JSON-LD | ||||
# expansion will convert it to a full URI based on | # expansion will convert it to a full URI based on | ||||
# "@context": CODEMETA_CONTEXT_URL | # "@context": CODEMETA_CONTEXT_URL | ||||
jsonld_child = self.xml_to_jsonld(child) | jsonld_child = self.xml_to_jsonld(child) | ||||
if localname == "type" and isinstance(jsonld_child, dict): | |||||
# With a codemeta context, this is later translated to a JSON-LD | |||||
# @type, which must be either an array of strings or a string. | |||||
if set(jsonld_child) != {"@value"}: | |||||
raise ValueError(f'Unexpected value for "type": {jsonld_child}') | |||||
jsonld_child = jsonld_child["@value"] | |||||
doc[localname].append(jsonld_child) | doc[localname].append(jsonld_child) | ||||
else: | else: | ||||
# Otherwise, we already know the URI | # Otherwise, we already know the URI | ||||
doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) | doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) | ||||
# The above needed doc values to be lists to work; now we allow any type | # The above needed doc values to be lists to work; now we allow any type | ||||
# of value, as the key "@value" cannot have a list as value. | # of value, as the key "@value" cannot have a list as value. | ||||
doc_: Dict[str, Any] = doc | doc_: Dict[str, Any] = doc | ||||
text = e.text.strip() if e.text else None | text = e.text.strip() if e.text else None | ||||
if text: | if text: | ||||
# TODO: check doc is empty, and raise mixed-content error otherwise? | # TODO: check doc is empty, and raise mixed-content error otherwise? | ||||
doc_["@value"] = text | return text | ||||
return doc_ | return doc_ | ||||
def translate(self, content: bytes) -> Optional[Dict[str, Any]]: | def translate(self, content: bytes) -> Optional[Dict[str, Any]]: | ||||
# Parse XML | # Parse XML | ||||
root = ET.fromstring(content) | root = ET.fromstring(content) | ||||
# Transform to JSON-LD document | # Transform to JSON-LD document | ||||
doc = self.xml_to_jsonld(root) | doc = self.xml_to_jsonld(root) | ||||
assert isinstance(doc, dict), f"Root object is not a dict: {doc}" | |||||
# Add @context so JSON-LD expansion replaces the "codemeta:" prefix | # Add @context so JSON-LD expansion replaces the "codemeta:" prefix | ||||
# hash (which uses the context URL as namespace URI for historical | # hash (which uses the context URL as namespace URI for historical | ||||
# reasons) into properties in `http://schema.org/` and | # reasons) into properties in `http://schema.org/` and | ||||
# `https://codemeta.github.io/terms/` namespaces | # `https://codemeta.github.io/terms/` namespaces | ||||
doc["@context"] = CODEMETA_CONTEXT_URL | doc["@context"] = CODEMETA_CONTEXT_URL | ||||
# Normalize as a Codemeta document | # Normalize as a Codemeta document | ||||
return self.normalize_translation(expand(doc)) | return self.normalize_translation(expand(doc)) | ||||
Show All 33 Lines |