Differential D8779 Diff 31667 swh/indexer/metadata_dictionary/codemeta.py

Changeset View

Standalone View

swh/indexer/metadata_dictionary/codemeta.py

# Copyright (C) 2018-2022 The Software Heritage developers		# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

import collections		import collections
import json		import json
import re		import re
from typing import Any, Dict, List, Optional, Tuple, Union		from typing import Any, Dict, List, Optional, Tuple, Union
import xml.etree.ElementTree as ET		import xml.etree.ElementTree as ET

		import iso8601
import xmltodict		import xmltodict

from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand		from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand

from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping		from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping

ATOM_URI = "http://www.w3.org/2005/Atom"		ATOM_URI = "http://www.w3.org/2005/Atom"

_TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)")		_TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)")
_IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",)		_IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",)
		_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$")


class CodemetaMapping(SingleFileIntrinsicMapping):		class CodemetaMapping(SingleFileIntrinsicMapping):
"""		"""
dedicated class for CodeMeta (codemeta.json) mapping and translation		dedicated class for CodeMeta (codemeta.json) mapping and translation
"""		"""

name = "codemeta"		name = "codemeta"
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines	def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]:
elif namespace in _IGNORED_NAMESPACES:		elif namespace in _IGNORED_NAMESPACES:
# SWORD-specific namespace that is not interesting to translate		# SWORD-specific namespace that is not interesting to translate
pass		pass
elif namespace.lower() == CODEMETA_CONTEXT_URL:		elif namespace.lower() == CODEMETA_CONTEXT_URL:
# It is a term defined by the context; write it as-is and JSON-LD		# It is a term defined by the context; write it as-is and JSON-LD
# expansion will convert it to a full URI based on		# expansion will convert it to a full URI based on
# "@context": CODEMETA_CONTEXT_URL		# "@context": CODEMETA_CONTEXT_URL
jsonld_child = self.xml_to_jsonld(child)		jsonld_child = self.xml_to_jsonld(child)
		if (
		localname
		in (
		"dateCreated",
		"dateModified",
		"datePublished",
		)
		and isinstance(jsonld_child, str)
		[Inline comment — anlambert (Unsubmitted, Not Done)] maybe add an extra condition on string length to avoid useless reformatting? ```lang=python and len(json_child) < 10 ```
		[Inline comment — vlorentz (Author, Unsubmitted, Done)] I don't think it matters, reformatting is fast: ``` In [4]: %timeit iso8601.parse_date("2022-10-26").date().isoformat() 4.56 µs ± 47.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) ```
		and _DATE_RE.match(jsonld_child)
		):
		# Dates missing a leading zero for their day/month, used
		# to be allowed by the deposit; so we need to reformat them
		# to be valid ISO8601.
		jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat()
doc[localname].append(jsonld_child)		doc[localname].append(jsonld_child)
else:		else:
# Otherwise, we already know the URI		# Otherwise, we already know the URI
doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child))		doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child))

# The above needed doc values to be list to work; now we allow any type		# The above needed doc values to be list to work; now we allow any type
# of value as key "@value" cannot have a list as value.		# of value as key "@value" cannot have a list as value.
doc_: Dict[str, Any] = doc		doc_: Dict[str, Any] = doc
▲ Show 20 Lines • Show All 58 Lines • Show Last 20 Lines