Differential D8778 Diff 31666 swh/indexer/metadata_dictionary/codemeta.py

Changeset View

Standalone View

swh/indexer/metadata_dictionary/codemeta.py

# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import collections
import json
import re
from typing import Any, Dict, List, Optional, Tuple, Union
import xml.etree.ElementTree as ET

import xmltodict

from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand

from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping

Show All 38 Lines	def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
"sword-v2-atom-codemeta",		"sword-v2-atom-codemeta",
"sword-v2-atom-codemeta-v2",		"sword-v2-atom-codemeta-v2",
)		)

@classmethod
def supported_terms(cls) -> List[str]:
    """Return the terms of ``CODEMETA_TERMS`` that are not JSON-LD keywords.

    Entries starting with ``"@"`` are filtered out; every other entry is
    returned, in the iteration order of ``CODEMETA_TERMS``.
    """
    return [term for term in CODEMETA_TERMS if not term.startswith("@")]

def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]:
    """Recursively convert an XML element into a JSON-LD-style value.

    Args:
        e: the XML element to convert.

    Returns:
        The stripped text of ``e`` when it has non-blank text; otherwise a
        dict mapping property names to the list of converted children
        carrying that name (an empty dict when ``e`` has no child that is
        kept).

    Raises:
        AssertionError: if a child tag does not match ``_TAG_RE``.
    """
    # Values are lists so that repeated child tags accumulate under one key.
    doc: Dict[str, List[Union[str, Dict[str, Any]]]] = collections.defaultdict(list)
    for child in e:
        m = _TAG_RE.match(child.tag)
        assert m, f"Tag with no namespace: {child}"
        namespace = m.group("namespace")
        localname = m.group("localname")
        if namespace == ATOM_URI and localname in ("title", "name"):
            # Convert Atom to Codemeta name; in case codemeta:name
            # is not provided or different
            doc["name"].append(self.xml_to_jsonld(child))
        elif namespace == ATOM_URI and localname in ("author", "email"):
            # Same for these Atom author properties; they are stored under
            # their own local name.
            doc[localname].append(self.xml_to_jsonld(child))
        elif namespace in _IGNORED_NAMESPACES:
            # SWORD-specific namespace that is not interesting to translate
            pass
        elif namespace.lower() == CODEMETA_CONTEXT_URL:
            # It is a term defined by the context; write it as-is and JSON-LD
            # expansion will convert it to a full URI based on
            # "@context": CODEMETA_CONTEXT_URL
            jsonld_child = self.xml_to_jsonld(child)
            doc[localname].append(jsonld_child)
        else:
            # Otherwise, we already know the URI
            doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child))

    # The loop above needed list values to accumulate children; from here on
    # the document is handled as a plain mapping with values of any type.
    doc_: Dict[str, Any] = doc

    text = e.text.strip() if e.text else None
    if text:
        # TODO: check doc is empty, and raise mixed-content error otherwise?
        # Note that any children collected above are discarded here.
        return text

    return doc_

def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
    """Translate an XML document into a normalized Codemeta document.

    Args:
        content: the raw bytes of the XML document.

    Returns:
        The result of ``self.normalize_translation`` applied to the
        JSON-LD expansion of the converted document.

    Raises:
        AssertionError: if the root element converts to something other
            than a dict (``xml_to_jsonld`` returns a plain string for an
            element with non-blank text).
    """
    # Parse XML
    root = ET.fromstring(content)

    # Transform to JSON-LD document
    doc = self.xml_to_jsonld(root)

    assert isinstance(doc, dict), f"Root object is not a dict: {doc}"

    # Add @context so that JSON-LD expansion replaces the "codemeta:" prefix
    # (which uses the context URL as namespace URI for historical
    # reasons) with properties in the `http://schema.org/` and
    # `https://codemeta.github.io/terms/` namespaces
    doc["@context"] = CODEMETA_CONTEXT_URL

    # Normalize as a Codemeta document
    return self.normalize_translation(expand(doc))
Show All 33 Lines