Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary/codemeta.py
# Copyright (C) 2018-2022 The Software Heritage developers | # Copyright (C) 2018-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import collections | import collections | ||||
import json | import json | ||||
import re | import re | ||||
from typing import Any, Dict, List, Optional, Tuple | from typing import Any, Dict, List, Optional, Tuple | ||||
import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||
import xmltodict | |||||
from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand | from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand | ||||
from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping | from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping | ||||
ATOM_URI = "http://www.w3.org/2005/Atom" | ATOM_URI = "http://www.w3.org/2005/Atom" | ||||
_TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") | _TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") | ||||
_IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) | _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) | ||||
▲ Show 20 Lines • Show All 90 Lines • ▼ Show 20 Lines | def translate(self, content: bytes) -> Optional[Dict[str, Any]]: | ||||
# `https://codemeta.github.io/terms/` namespaces | # `https://codemeta.github.io/terms/` namespaces | ||||
doc["@context"] = CODEMETA_CONTEXT_URL | doc["@context"] = CODEMETA_CONTEXT_URL | ||||
# Normalize as a Codemeta document | # Normalize as a Codemeta document | ||||
return self.normalize_translation(expand(doc)) | return self.normalize_translation(expand(doc)) | ||||
def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Compact a translated document before it is returned.

    ``translate`` calls this with the output of ``expand(doc)``; the
    document is handed to ``compact`` with ``forgefed=False``.

    NOTE(review): ``forgefed=False`` presumably leaves the ForgeFed terms
    out of the compaction context -- confirm against
    ``swh.indexer.codemeta.compact``.
    """
    compacted = compact(metadata, forgefed=False)
    return compacted
class JsonSwordCodemetaMapping(SwordCodemetaMapping):
    """
    Variant of :class:`SwordCodemetaMapping` that reads the legacy
    ``sword-v2-atom-codemeta-v2-in-json`` format and converts it back to
    ``sword-v2-atom-codemeta-v2`` XML
    """

    name = "json-sword-codemeta"

    @classmethod
    def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
        """Return the extrinsic metadata formats handled by this mapping."""
        return ("sword-v2-atom-codemeta-v2-in-json",)

    def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
        """Rebuild the Atom XML document from its JSON form and translate it.

        Args:
            content: JSON serialization of an Atom entry, without its root
                ``entry`` tag.

        Returns:
            Whatever :meth:`SwordCodemetaMapping.translate` returns for the
            rebuilt XML document.

        Raises:
            NotImplementedError: if the decoded document is not a JSON object
                whose ``@xmlns`` is the Atom namespace.
            json.JSONDecodeError: if ``content`` is not valid JSON.
        """
        # ``content`` was generated by calling ``xmltodict.parse()`` on an XML
        # document, so ``xmltodict.unparse()`` returns a document that is
        # semantically equivalent to the original, which can then be passed
        # to SwordCodemetaMapping.
        json_doc = json.loads(content)

        # A top-level value that is not a JSON object (e.g. ``null`` or a
        # list) has no ``@xmlns`` at all; reject it the same way as a wrong
        # namespace instead of crashing with AttributeError on ``.get``.
        if not isinstance(json_doc, dict) or json_doc.get("@xmlns") != ATOM_URI:
            # Technically, non-default XMLNS were allowed, but it does not seem like
            # anyone used them, so they do not need to be implemented here.
            raise NotImplementedError(f"Unexpected XMLNS set: {json_doc}")

        # Root tag was stripped by swh-deposit
        xml_doc = xmltodict.unparse({"entry": json_doc}, encoding="utf-8")

        # ``unparse()`` returns ``str`` while the parent ``translate`` is
        # declared to take ``bytes``; encode with the same encoding that is
        # written in the XML declaration.
        return super().translate(xml_doc.encode("utf-8"))