Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary/codemeta.py
# Copyright (C) 2018-2022 The Software Heritage developers | # Copyright (C) 2018-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import collections | import collections | ||||
import json | import json | ||||
import re | import re | ||||
from typing import Any, Dict, List, Optional, Tuple, Union | from typing import Any, Dict, List, Optional, Tuple, Union | ||||
import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||
import iso8601 | |||||
import xmltodict | import xmltodict | ||||
from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand | from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand | ||||
from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping | from .base import BaseExtrinsicMapping, SingleFileIntrinsicMapping | ||||
ATOM_URI = "http://www.w3.org/2005/Atom" | ATOM_URI = "http://www.w3.org/2005/Atom" | ||||
_TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") | _TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") | ||||
_IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) | _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) | ||||
_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$") | |||||
class CodemetaMapping(SingleFileIntrinsicMapping): | class CodemetaMapping(SingleFileIntrinsicMapping): | ||||
""" | """ | ||||
dedicated class for CodeMeta (codemeta.json) mapping and translation | dedicated class for CodeMeta (codemeta.json) mapping and translation | ||||
""" | """ | ||||
name = "codemeta" | name = "codemeta" | ||||
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines | def xml_to_jsonld(self, e: ET.Element) -> Union[str, Dict[str, Any]]: | ||||
elif namespace in _IGNORED_NAMESPACES: | elif namespace in _IGNORED_NAMESPACES: | ||||
# SWORD-specific namespace that is not interesting to translate | # SWORD-specific namespace that is not interesting to translate | ||||
pass | pass | ||||
elif namespace.lower() == CODEMETA_CONTEXT_URL: | elif namespace.lower() == CODEMETA_CONTEXT_URL: | ||||
# It is a term defined by the context; write it as-is and JSON-LD | # It is a term defined by the context; write it as-is and JSON-LD | ||||
# expansion will convert it to a full URI based on | # expansion will convert it to a full URI based on | ||||
# "@context": CODEMETA_CONTEXT_URL | # "@context": CODEMETA_CONTEXT_URL | ||||
jsonld_child = self.xml_to_jsonld(child) | jsonld_child = self.xml_to_jsonld(child) | ||||
if ( | |||||
localname | |||||
in ( | |||||
"dateCreated", | |||||
"dateModified", | |||||
"datePublished", | |||||
) | |||||
and isinstance(jsonld_child, str) | |||||
anlambert: maybe add an extra condition on string length to avoid useless reformatting?
```lang=python
and… | |||||
Done Inline Actions I don't think it matters, reformatting is fast: In [4]: %timeit iso8601.parse_date("2022-10-26").date().isoformat() 4.56 µs ± 47.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) vlorentz: I don't think it matters, reformatting is fast:
```
In [4]: %timeit iso8601.parse_date("2022… | |||||
and _DATE_RE.match(jsonld_child) | |||||
): | |||||
# Dates missing a leading zero for their day/month, used | |||||
# to be allowed by the deposit; so we need to reformat them | |||||
# to be valid ISO8601. | |||||
jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() | |||||
doc[localname].append(jsonld_child) | doc[localname].append(jsonld_child) | ||||
else: | else: | ||||
# Otherwise, we already know the URI | # Otherwise, we already know the URI | ||||
doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) | doc[f"{namespace}{localname}"].append(self.xml_to_jsonld(child)) | ||||
# The above needed doc values to be list to work; now we allow any type | # The above needed doc values to be list to work; now we allow any type | ||||
# of value as key "@value" cannot have a list as value. | # of value as key "@value" cannot have a list as value. | ||||
doc_: Dict[str, Any] = doc | doc_: Dict[str, Any] = doc | ||||
▲ Show 20 Lines • Show All 58 Lines • Show Last 20 Lines |
Maybe add an extra condition on string length to avoid useless reformatting?