Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/codemeta.py
# Copyright (C) 2018 The Software Heritage developers | # Copyright (C) 2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import collections | import collections | ||||
import csv | import csv | ||||
import itertools | import itertools | ||||
import json | import json | ||||
import os.path | import os.path | ||||
import re | import re | ||||
from typing import Any, List | from typing import Any, List | ||||
from pyld import jsonld | from pyld import jsonld | ||||
import swh.indexer | import swh.indexer | ||||
from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA | |||||
_DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), "data") | _DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), "data") | ||||
CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, "codemeta", "crosswalk.csv") | CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, "codemeta", "crosswalk.csv") | ||||
CODEMETA_CONTEXT_PATH = os.path.join(_DATA_DIR, "codemeta", "codemeta.jsonld") | CODEMETA_CONTEXT_PATH = os.path.join(_DATA_DIR, "codemeta", "codemeta.jsonld") | ||||
with open(CODEMETA_CONTEXT_PATH) as fd: | with open(CODEMETA_CONTEXT_PATH) as fd: | ||||
CODEMETA_CONTEXT = json.load(fd) | CODEMETA_CONTEXT = json.load(fd) | ||||
_EMPTY_PROCESSED_CONTEXT: Any = {"mappings": {}} | _EMPTY_PROCESSED_CONTEXT: Any = {"mappings": {}} | ||||
_PROCESSED_CODEMETA_CONTEXT = jsonld.JsonLdProcessor().process_context( | _PROCESSED_CODEMETA_CONTEXT = jsonld.JsonLdProcessor().process_context( | ||||
_EMPTY_PROCESSED_CONTEXT, CODEMETA_CONTEXT, None | _EMPTY_PROCESSED_CONTEXT, CODEMETA_CONTEXT, None | ||||
) | ) | ||||
CODEMETA_CONTEXT_URL = "https://doi.org/10.5063/schema/codemeta-2.0" | CODEMETA_CONTEXT_URL = "https://doi.org/10.5063/schema/codemeta-2.0" | ||||
CODEMETA_ALTERNATE_CONTEXT_URLS = { | CODEMETA_ALTERNATE_CONTEXT_URLS = { | ||||
("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld") | ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld") | ||||
} | } | ||||
CODEMETA_URI = "https://codemeta.github.io/terms/" | |||||
SCHEMA_URI = "http://schema.org/" | |||||
FORGEFED_URI = "https://forgefed.org/ns#" | |||||
ACTIVITYSTREAMS_URI = "https://www.w3.org/ns/activitystreams#" | |||||
PROPERTY_BLACKLIST = { | PROPERTY_BLACKLIST = { | ||||
# CodeMeta properties that we cannot properly represent. | # CodeMeta properties that we cannot properly represent. | ||||
SCHEMA_URI + "softwareRequirements", | SCHEMA.softwareRequirements, | ||||
CODEMETA_URI + "softwareSuggestions", | CODEMETA.softwareSuggestions, | ||||
# Duplicate of 'author' | # Duplicate of 'author' | ||||
SCHEMA_URI + "creator", | SCHEMA.creator, | ||||
} | } | ||||
_codemeta_field_separator = re.compile(r"\s*[,/]\s*") | _codemeta_field_separator = re.compile(r"\s*[,/]\s*") | ||||
def make_absolute_uri(local_name): | def make_absolute_uri(local_name): | ||||
"""Parses codemeta.jsonld, and returns the @id of terms it defines. | """Parses codemeta.jsonld, and returns the @id of terms it defines. | ||||
>>> make_absolute_uri("name") | >>> make_absolute_uri("name") | ||||
'http://schema.org/name' | 'http://schema.org/name' | ||||
>>> make_absolute_uri("downloadUrl") | >>> make_absolute_uri("downloadUrl") | ||||
'http://schema.org/downloadUrl' | 'http://schema.org/downloadUrl' | ||||
>>> make_absolute_uri("referencePublication") | >>> make_absolute_uri("referencePublication") | ||||
'https://codemeta.github.io/terms/referencePublication' | 'https://codemeta.github.io/terms/referencePublication' | ||||
""" | """ | ||||
uri = jsonld.JsonLdProcessor.get_context_value( | uri = jsonld.JsonLdProcessor.get_context_value( | ||||
_PROCESSED_CODEMETA_CONTEXT, local_name, "@id" | _PROCESSED_CODEMETA_CONTEXT, local_name, "@id" | ||||
) | ) | ||||
assert uri.startswith(("@", CODEMETA_URI, SCHEMA_URI)), (local_name, uri) | assert uri.startswith(("@", CODEMETA._uri, SCHEMA._uri)), (local_name, uri) | ||||
return uri | return uri | ||||
def _read_crosstable(fd): | def _read_crosstable(fd): | ||||
reader = csv.reader(fd) | reader = csv.reader(fd) | ||||
try: | try: | ||||
header = next(reader) | header = next(reader) | ||||
except StopIteration: | except StopIteration: | ||||
Show All 34 Lines | def _document_loader(url, options=None): | ||||
Reads the local codemeta.jsonld file instead of fetching it | Reads the local codemeta.jsonld file instead of fetching it | ||||
from the Internet every single time.""" | from the Internet every single time.""" | ||||
if url == CODEMETA_CONTEXT_URL or url in CODEMETA_ALTERNATE_CONTEXT_URLS: | if url == CODEMETA_CONTEXT_URL or url in CODEMETA_ALTERNATE_CONTEXT_URLS: | ||||
return { | return { | ||||
"contextUrl": None, | "contextUrl": None, | ||||
"documentUrl": url, | "documentUrl": url, | ||||
"document": CODEMETA_CONTEXT, | "document": CODEMETA_CONTEXT, | ||||
} | } | ||||
elif url == CODEMETA_URI: | elif url == CODEMETA._uri: | ||||
raise Exception( | raise Exception( | ||||
"{} is CodeMeta's URI, use {} as context url".format( | "{} is CodeMeta's URI, use {} as context url".format( | ||||
CODEMETA_URI, CODEMETA_CONTEXT_URL | CODEMETA._uri, CODEMETA_CONTEXT_URL | ||||
) | ) | ||||
) | ) | ||||
else: | else: | ||||
raise Exception(url) | raise Exception(url) | ||||
def compact(doc, forgefed: bool): | def compact(doc, forgefed: bool): | ||||
"""Same as `pyld.jsonld.compact`, but in the context of CodeMeta. | """Same as `pyld.jsonld.compact`, but in the context of CodeMeta. | ||||
Args: | Args: | ||||
forgefed: Whether to add ForgeFed and ActivityStreams as compact URIs. | forgefed: Whether to add ForgeFed and ActivityStreams as compact URIs. | ||||
This is typically used for extrinsic metadata documents, which frequently | This is typically used for extrinsic metadata documents, which frequently | ||||
use properties from these namespaces. | use properties from these namespaces. | ||||
""" | """ | ||||
contexts: List[Any] = [CODEMETA_CONTEXT_URL] | contexts: List[Any] = [CODEMETA_CONTEXT_URL] | ||||
if forgefed: | if forgefed: | ||||
contexts.append({"as": ACTIVITYSTREAMS_URI, "forge": FORGEFED_URI}) | contexts.append({"as": ACTIVITYSTREAMS._uri, "forge": FORGEFED._uri}) | ||||
return jsonld.compact(doc, contexts, options={"documentLoader": _document_loader}) | return jsonld.compact(doc, contexts, options={"documentLoader": _document_loader}) | ||||
def expand(doc): | def expand(doc): | ||||
"""Same as `pyld.jsonld.expand`, but in the context of CodeMeta.""" | """Same as `pyld.jsonld.expand`, but in the context of CodeMeta.""" | ||||
return jsonld.expand(doc, options={"documentLoader": _document_loader}) | return jsonld.expand(doc, options={"documentLoader": _document_loader}) | ||||
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines | for document in documents: | ||||
if key == "@id": | if key == "@id": | ||||
# @id does not get expanded to a list | # @id does not get expanded to a list | ||||
value = values | value = values | ||||
# Only one @id is allowed, move it to sameAs | # Only one @id is allowed, move it to sameAs | ||||
if "@id" not in merged_document: | if "@id" not in merged_document: | ||||
merged_document["@id"] = value | merged_document["@id"] = value | ||||
elif value != merged_document["@id"]: | elif value != merged_document["@id"]: | ||||
if value not in merged_document[SCHEMA_URI + "sameAs"]: | if value not in merged_document[SCHEMA.sameAs]: | ||||
merged_document[SCHEMA_URI + "sameAs"].append(value) | merged_document[SCHEMA.sameAs].append(value) | ||||
else: | else: | ||||
for value in values: | for value in values: | ||||
if isinstance(value, dict) and set(value) == {"@list"}: | if isinstance(value, dict) and set(value) == {"@list"}: | ||||
# Value is of the form {'@list': [item1, item2]} | # Value is of the form {'@list': [item1, item2]} | ||||
# instead of the usual [item1, item2]. | # instead of the usual [item1, item2]. | ||||
# We need to merge the inner lists (and mostly | # We need to merge the inner lists (and mostly | ||||
# preserve order). | # preserve order). | ||||
merged_value = merged_document.setdefault(key, {"@list": []}) | merged_value = merged_document.setdefault(key, {"@list": []}) | ||||
Show All 13 Lines |