Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary/codemeta.py
# Copyright (C) 2018-2022 The Software Heritage developers | # Copyright (C) 2018-2022 The Software Heritage developers | ||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||
import collections | import collections | ||||||||
import json | import json | ||||||||
import logging | |||||||||
import re | import re | ||||||||
from typing import Any, Dict, List, Optional, Tuple, Union | from typing import Any, Dict, List, Optional, Tuple, Union | ||||||||
import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||||||
import iso8601 | import iso8601 | ||||||||
import xmltodict | import xmltodict | ||||||||
from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand | from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand | ||||||||
from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping | from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping | ||||||||
ATOM_URI = "http://www.w3.org/2005/Atom" | ATOM_URI = "http://www.w3.org/2005/Atom" | ||||||||
_TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") | _TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") | ||||||||
_IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) | _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) | ||||||||
_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$") | _DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$") | ||||||||
logger = logging.getLogger(__name__) | |||||||||
class CodemetaMapping(SingleFileIntrinsicMapping): | class CodemetaMapping(SingleFileIntrinsicMapping): | ||||||||
""" | """ | ||||||||
dedicated class for CodeMeta (codemeta.json) mapping and translation | dedicated class for CodeMeta (codemeta.json) mapping and translation | ||||||||
""" | """ | ||||||||
name = "codemeta" | name = "codemeta" | ||||||||
filename = b"codemeta.json" | filename = b"codemeta.json" | ||||||||
Show All 26 Lines | def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: | ||||||||
"sword-v2-atom-codemeta-v2", | "sword-v2-atom-codemeta-v2", | ||||||||
) | ) | ||||||||
@classmethod | @classmethod | ||||||||
def supported_terms(cls) -> List[str]: | def supported_terms(cls) -> List[str]: | ||||||||
return [term for term in CODEMETA_TERMS if not term.startswith("@")] | return [term for term in CODEMETA_TERMS if not term.startswith("@")] | ||||||||
def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: | def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: | ||||||||
# Keys are JSON-LD property names (URIs or terms). | |||||||||
# Values are either a single string (if key is "type") or list of | |||||||||
# other dicts with the same type recursively. | |||||||||
ardumontUnsubmitted Not Done Inline Actions
ardumont: | |||||||||
Done Inline Actions: oops, missed your comment vlorentz: oops, missed your comment | |||||||||
# To simplify annotations, we omit the single string case here. | |||||||||
doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) | doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list) | ||||||||
for child in e: | for child in e: | ||||||||
m = _TAG_RE.match(child.tag) | m = _TAG_RE.match(child.tag) | ||||||||
assert m, f"Tag with no namespace: {child}" | assert m, f"Tag with no namespace: {child}" | ||||||||
namespace = m.group("namespace") | namespace = m.group("namespace") | ||||||||
localname = m.group("localname") | localname = m.group("localname") | ||||||||
if namespace == ATOM_URI and localname in ("title", "name"): | if namespace == ATOM_URI and localname in ("title", "name"): | ||||||||
# Convert Atom to Codemeta name; in case codemeta:name | # Convert Atom to Codemeta name; in case codemeta:name | ||||||||
# is not provided or different | # is not provided or different | ||||||||
Show All 19 Lines | def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: | ||||||||
) | ) | ||||||||
and isinstance(jsonld_child, str) | and isinstance(jsonld_child, str) | ||||||||
and _DATE_RE.match(jsonld_child) | and _DATE_RE.match(jsonld_child) | ||||||||
): | ): | ||||||||
# Dates missing a leading zero for their day/month, used | # Dates missing a leading zero for their day/month, used | ||||||||
# to be allowed by the deposit; so we need to reformat them | # to be allowed by the deposit; so we need to reformat them | ||||||||
# to be valid ISO8601. | # to be valid ISO8601. | ||||||||
jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() | jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() | ||||||||
if localname == "id": | |||||||||
# JSON-LD only allows a single id, and they have to be strings. | |||||||||
if localname in doc: | |||||||||
Done Inline Actions: Shouldn't this spit a warning of some sort? olasd: Shouldn't this spit a warning of some sort? | |||||||||
logger.error( | |||||||||
"Duplicate <id>s in SWORD document: %r and %r", | |||||||||
doc[localname], | |||||||||
jsonld_child, | |||||||||
) | |||||||||
continue | |||||||||
elif not jsonld_child: | |||||||||
logger.error("Empty <id> value in SWORD document") | |||||||||
continue | |||||||||
elif not isinstance(jsonld_child, str): | |||||||||
logger.error( | |||||||||
"Unexpected <id> value in SWORD document: %r", jsonld_child | |||||||||
) | |||||||||
continue | |||||||||
else: | |||||||||
doc[localname] = jsonld_child # type: ignore[assignment] | |||||||||
else: | |||||||||
doc[localname].append(jsonld_child) | doc[localname].append(jsonld_child) | ||||||||
else: | else: | ||||||||
# Otherwise, we already know the URI | # Otherwise, we already know the URI | ||||||||
doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) | doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) | ||||||||
# The above needed doc values to be list to work; now we allow any type | # The above needed doc values to be list to work; now we allow any type | ||||||||
# of value as key "@value" cannot have a list as value. | # of value as key "@value" cannot have a list as value. | ||||||||
doc_: Dict[str, Any] = doc | doc_: Dict[str, Any] = doc | ||||||||
▲ Show 20 Lines • Show All 57 Lines • Show Last 20 Lines |