Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary/base.py
# Copyright (C) 2017-2022 The Software Heritage developers | # Copyright (C) 2017-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
import logging | import logging | ||||
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar | from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar | ||||
import urllib.parse | |||||
import uuid | import uuid | ||||
import xml.parsers.expat | import xml.parsers.expat | ||||
from pyld import jsonld | from pyld import jsonld | ||||
import rdflib | import rdflib | ||||
from typing_extensions import TypedDict | from typing_extensions import TypedDict | ||||
import xmltodict | import xmltodict | ||||
import yaml | import yaml | ||||
from swh.indexer.codemeta import _document_loader, compact | from swh.indexer.codemeta import _document_loader, compact | ||||
from swh.indexer.namespaces import RDF, SCHEMA | from swh.indexer.namespaces import RDF, SCHEMA | ||||
from swh.indexer.storage.interface import Sha1 | from swh.indexer.storage.interface import Sha1 | ||||
from .utils import add_url_if_valid | |||||
TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" | TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" | ||||
"""Prefix used to generate temporary URIs for root nodes being translated.""" | """Prefix used to generate temporary URIs for root nodes being translated.""" | ||||
class DirectoryLsEntry(TypedDict): | class DirectoryLsEntry(TypedDict): | ||||
target: Sha1 | target: Sha1 | ||||
sha1: Sha1 | sha1: Sha1 | ||||
name: bytes | name: bytes | ||||
▲ Show 20 Lines • Show All 249 Lines • ▼ Show 20 Lines | ) -> None: | ||||
self, "normalize_" + self._normalize_method_name(k), None | self, "normalize_" + self._normalize_method_name(k), None | ||||
) | ) | ||||
if normalization_method: | if normalization_method: | ||||
v = normalization_method(v) | v = normalization_method(v) | ||||
if v is None: | if v is None: | ||||
pass | pass | ||||
elif isinstance(v, list): | elif isinstance(v, list): | ||||
for item in reversed(v): | for item in reversed(v): | ||||
if isinstance(item, rdflib.URIRef): | |||||
add_url_if_valid(graph, root, codemeta_key, str(item)) | |||||
else: | |||||
graph.add((root, codemeta_key, item)) | graph.add((root, codemeta_key, item)) | ||||
else: | else: | ||||
if isinstance(v, rdflib.URIRef): | |||||
add_url_if_valid(graph, root, codemeta_key, str(v)) | |||||
else: | |||||
graph.add((root, codemeta_key, v)) | graph.add((root, codemeta_key, v)) | ||||
elif k in self.string_fields and isinstance(v, str): | elif k in self.string_fields and isinstance(v, str): | ||||
graph.add((root, codemeta_key, rdflib.Literal(v))) | graph.add((root, codemeta_key, rdflib.Literal(v))) | ||||
elif k in self.string_fields and isinstance(v, list): | elif k in self.string_fields and isinstance(v, list): | ||||
for item in v: | for item in v: | ||||
graph.add((root, codemeta_key, rdflib.Literal(item))) | graph.add((root, codemeta_key, rdflib.Literal(item))) | ||||
elif k in self.date_fields and isinstance(v, str): | elif k in self.date_fields and isinstance(v, str): | ||||
typed_v = rdflib.Literal(v, datatype=SCHEMA.Date) | typed_v = rdflib.Literal(v, datatype=SCHEMA.Date) | ||||
graph.add((root, codemeta_key, typed_v)) | graph.add((root, codemeta_key, typed_v)) | ||||
elif k in self.date_fields and isinstance(v, list): | elif k in self.date_fields and isinstance(v, list): | ||||
for item in v: | for item in v: | ||||
if isinstance(item, str): | if isinstance(item, str): | ||||
typed_item = rdflib.Literal(item, datatype=SCHEMA.Date) | typed_item = rdflib.Literal(item, datatype=SCHEMA.Date) | ||||
graph.add((root, codemeta_key, typed_item)) | graph.add((root, codemeta_key, typed_item)) | ||||
elif k in self.uri_fields and isinstance(v, str): | elif k in self.uri_fields and isinstance(v, str): | ||||
# Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop | add_url_if_valid(graph, root, codemeta_key, v) | ||||
# URLs that are blatantly invalid early, so PyLD does not crash. | |||||
parsed_url = urllib.parse.urlparse(v) | |||||
if parsed_url.netloc: | |||||
graph.add((root, codemeta_key, rdflib.URIRef(v))) | |||||
elif k in self.uri_fields and isinstance(v, list): | elif k in self.uri_fields and isinstance(v, list): | ||||
for item in v: | for item in v: | ||||
if isinstance(item, str): | add_url_if_valid(graph, root, codemeta_key, item) | ||||
# ditto | |||||
parsed_url = urllib.parse.urlparse(item) | |||||
if parsed_url.netloc: | |||||
graph.add((root, codemeta_key, rdflib.URIRef(item))) | |||||
else: | else: | ||||
continue | continue | ||||
self.extra_translation(graph, root, content_dict) | self.extra_translation(graph, root, content_dict) | ||||
def sanitize(self, graph: rdflib.Graph) -> None: | def sanitize(self, graph: rdflib.Graph) -> None: | ||||
# Remove triples that make PyLD crash | # Remove triples that make PyLD crash | ||||
for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): | for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): | ||||
▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines |