diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -14,6 +14,7 @@ from pyld import jsonld import swh.indexer +from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA _DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), "data") @@ -34,18 +35,14 @@ CODEMETA_ALTERNATE_CONTEXT_URLS = { ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld") } -CODEMETA_URI = "https://codemeta.github.io/terms/" -SCHEMA_URI = "http://schema.org/" -FORGEFED_URI = "https://forgefed.org/ns#" -ACTIVITYSTREAMS_URI = "https://www.w3.org/ns/activitystreams#" PROPERTY_BLACKLIST = { # CodeMeta properties that we cannot properly represent. - SCHEMA_URI + "softwareRequirements", - CODEMETA_URI + "softwareSuggestions", + SCHEMA.softwareRequirements, + CODEMETA.softwareSuggestions, # Duplicate of 'author' - SCHEMA_URI + "creator", + SCHEMA.creator, } _codemeta_field_separator = re.compile(r"\s*[,/]\s*") @@ -64,7 +61,7 @@ uri = jsonld.JsonLdProcessor.get_context_value( _PROCESSED_CODEMETA_CONTEXT, local_name, "@id" ) - assert uri.startswith(("@", CODEMETA_URI, SCHEMA_URI)), (local_name, uri) + assert uri.startswith(("@", CODEMETA._uri, SCHEMA._uri)), (local_name, uri) return uri @@ -115,10 +112,10 @@ "documentUrl": url, "document": CODEMETA_CONTEXT, } - elif url == CODEMETA_URI: + elif url == CODEMETA._uri: raise Exception( "{} is CodeMeta's URI, use {} as context url".format( - CODEMETA_URI, CODEMETA_CONTEXT_URL + CODEMETA._uri, CODEMETA_CONTEXT_URL ) ) else: @@ -135,7 +132,7 @@ """ contexts: List[Any] = [CODEMETA_CONTEXT_URL] if forgefed: - contexts.append({"as": ACTIVITYSTREAMS_URI, "forge": FORGEFED_URI}) + contexts.append({"as": ACTIVITYSTREAMS._uri, "forge": FORGEFED._uri}) return jsonld.compact(doc, contexts, options={"documentLoader": _document_loader}) @@ -195,8 +192,8 @@ if "@id" not in merged_document: merged_document["@id"] = value elif value != 
merged_document["@id"]: - if value not in merged_document[SCHEMA_URI + "sameAs"]: - merged_document[SCHEMA_URI + "sameAs"].append(value) + if value not in merged_document[SCHEMA.sameAs]: + merged_document[SCHEMA.sameAs].append(value) else: for value in values: if isinstance(value, dict) and set(value) == {"@list"}: diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -10,7 +10,8 @@ from typing_extensions import TypedDict import yaml -from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values +from swh.indexer.codemeta import compact, merge_values +from swh.indexer.namespaces import SCHEMA from swh.indexer.storage.interface import Sha1 @@ -26,16 +27,14 @@ ) -def produce_terms( - namespace: str, terms: List[str] -) -> Callable[[TTranslateCallable], TTranslateCallable]: +def produce_terms(*uris: str) -> Callable[[TTranslateCallable], TTranslateCallable]: """Returns a decorator that marks the decorated function as adding the given terms to the ``translated_metadata`` dict""" def decorator(f: TTranslateCallable) -> TTranslateCallable: if not hasattr(f, "produced_terms"): f.produced_terms = [] # type: ignore - f.produced_terms.extend(namespace + term for term in terms) # type: ignore + f.produced_terms.extend(uris) # type: ignore return f return decorator @@ -175,7 +174,7 @@ the indexer """ - translated_metadata = {"@type": SCHEMA_URI + "SoftwareSourceCode"} + translated_metadata = {"@type": SCHEMA.SoftwareSourceCode} for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -1,6 +1,7 @@ from typing import Dict, List, Optional, Union -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI 
+from swh.indexer.codemeta import CROSSWALK_TABLE +from swh.indexer.namespaces import SCHEMA from .base import YamlMapping @@ -17,19 +18,19 @@ result = [] for author in d: author_data: Dict[str, Optional[Union[str, Dict]]] = { - "@type": SCHEMA_URI + "Person" + "@type": SCHEMA.Person } if "orcid" in author and isinstance(author["orcid"], str): author_data["@id"] = author["orcid"] if "affiliation" in author and isinstance(author["affiliation"], str): - author_data[SCHEMA_URI + "affiliation"] = { - "@type": SCHEMA_URI + "Organization", - SCHEMA_URI + "name": author["affiliation"], + author_data[SCHEMA.affiliation] = { + "@type": SCHEMA.Organization, + SCHEMA.name: author["affiliation"], } if "family-names" in author and isinstance(author["family-names"], str): - author_data[SCHEMA_URI + "familyName"] = author["family-names"] + author_data[SCHEMA.familyName] = author["family-names"] if "given-names" in author and isinstance(author["given-names"], str): - author_data[SCHEMA_URI + "givenName"] = author["given-names"] + author_data[SCHEMA.givenName] = author["given-names"] result.append(author_data) @@ -50,4 +51,4 @@ def normalize_date_released(self, s: str) -> Dict[str, str]: if isinstance(s, str): - return {"@value": s, "@type": SCHEMA_URI + "Date"} + return {"@value": s, "@type": SCHEMA.Date} diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py --- a/swh/indexer/metadata_dictionary/composer.py +++ b/swh/indexer/metadata_dictionary/composer.py @@ -5,7 +5,8 @@ import os.path -from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.namespaces import SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping @@ -43,13 +44,13 @@ def normalize_authors(self, author_list): authors = [] for author in author_list: - author_obj = {"@type": SCHEMA_URI + "Person"} + author_obj = {"@type": SCHEMA.Person} if isinstance(author, 
dict): if isinstance(author.get("name", None), str): - author_obj[SCHEMA_URI + "name"] = author.get("name", None) + author_obj[SCHEMA.name] = author.get("name", None) if isinstance(author.get("email", None), str): - author_obj[SCHEMA_URI + "email"] = author.get("email", None) + author_obj[SCHEMA.email] = author.get("email", None) authors.append(author_obj) diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py --- a/swh/indexer/metadata_dictionary/dart.py +++ b/swh/indexer/metadata_dictionary/dart.py @@ -6,7 +6,8 @@ import os.path import re -from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.namespaces import SCHEMA from .base import YamlMapping @@ -18,8 +19,8 @@ def name_to_person(name): return { - "@type": SCHEMA_URI + "Person", - SCHEMA_URI + "name": name, + "@type": SCHEMA.Person, + SCHEMA.name: name, } @@ -50,17 +51,17 @@ def normalize_author(self, s): name_email_regex = "(?P.*?)( <(?P.*)>)" - author = {"@type": SCHEMA_URI + "Person"} + author = {"@type": SCHEMA.Person} if isinstance(s, str): match = re.search(name_email_regex, s) if match: name = match.group("name") email = match.group("email") - author[SCHEMA_URI + "email"] = email + author[SCHEMA.email] = email else: name = s - author[SCHEMA_URI + "name"] = name + author[SCHEMA.name] = name return {"@list": [author]} diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ -6,7 +6,8 @@ import json from typing import Any, Dict, Tuple -from swh.indexer.codemeta import ACTIVITYSTREAMS_URI, CROSSWALK_TABLE, FORGEFED_URI +from swh.indexer.codemeta import CROSSWALK_TABLE +from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED from .base import BaseExtrinsicMapping, JsonMapping, produce_terms @@ -34,11 +35,10 @@ def 
_translate_dict(self, content_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]: d = super()._translate_dict(content_dict, **kwargs) - d["type"] = FORGEFED_URI + "Repository" + d["type"] = FORGEFED.Repository return d - @produce_terms(FORGEFED_URI, ["forks"]) - @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"]) + @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) def translate_forks_count( self, translated_metadata: Dict[str, Any], v: Any ) -> None: @@ -57,15 +57,14 @@ } """ if isinstance(v, int): - translated_metadata.setdefault(FORGEFED_URI + "forks", []).append( + translated_metadata.setdefault(FORGEFED.forks, []).append( { - "@type": ACTIVITYSTREAMS_URI + "OrderedCollection", - ACTIVITYSTREAMS_URI + "totalItems": v, + "@type": ACTIVITYSTREAMS.OrderedCollection, + ACTIVITYSTREAMS.totalItems: v, } ) - @produce_terms(ACTIVITYSTREAMS_URI, ["likes"]) - @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"]) + @produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems) def translate_stargazers_count( self, translated_metadata: Dict[str, Any], v: Any ) -> None: @@ -84,15 +83,14 @@ } """ if isinstance(v, int): - translated_metadata.setdefault(ACTIVITYSTREAMS_URI + "likes", []).append( + translated_metadata.setdefault(ACTIVITYSTREAMS.likes, []).append( { - "@type": ACTIVITYSTREAMS_URI + "Collection", - ACTIVITYSTREAMS_URI + "totalItems": v, + "@type": ACTIVITYSTREAMS.Collection, + ACTIVITYSTREAMS.totalItems: v, } ) - @produce_terms(ACTIVITYSTREAMS_URI, ["followers"]) - @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"]) + @produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems) def translate_watchers_count( self, translated_metadata: Dict[str, Any], v: Any ) -> None: @@ -111,12 +109,10 @@ } """ if isinstance(v, int): - translated_metadata.setdefault( - ACTIVITYSTREAMS_URI + "followers", [] - ).append( + translated_metadata.setdefault(ACTIVITYSTREAMS.followers, []).append( { - "@type": ACTIVITYSTREAMS_URI + "Collection", - ACTIVITYSTREAMS_URI + 
"totalItems": v, + "@type": ACTIVITYSTREAMS.Collection, + ACTIVITYSTREAMS.totalItems: v, } ) diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -9,7 +9,8 @@ import xmltodict -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from swh.indexer.codemeta import CROSSWALK_TABLE +from swh.indexer.namespaces import SCHEMA from .base import DictMapping, SingleFileIntrinsicMapping @@ -41,8 +42,8 @@ self.log.warning("Skipping ill-formed XML content: %s", content) return None metadata = self._translate_dict(d, normalize=False) - metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d) - metadata[SCHEMA_URI + "license"] = self.parse_licenses(d) + metadata[SCHEMA.codeRepository] = self.parse_repositories(d) + metadata[SCHEMA.license] = self.parse_licenses(d) return self.normalize_translation(metadata) _default_repository = {"url": "https://repo.maven.apache.org/maven2/"} diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -6,7 +6,8 @@ import re import urllib.parse -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from swh.indexer.codemeta import CROSSWALK_TABLE +from swh.indexer.namespaces import SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping @@ -120,7 +121,7 @@ 'http://schema.org/email': 'john.doe@example.org', 'http://schema.org/name': 'John Doe'}]} """ # noqa - author = {"@type": SCHEMA_URI + "Person"} + author = {"@type": SCHEMA.Person} if isinstance(d, dict): name = d.get("name", None) email = d.get("email", None) @@ -136,15 +137,15 @@ return None if name and isinstance(name, str): - author[SCHEMA_URI + "name"] = name + author[SCHEMA.name] = name if email and isinstance(email, str): - author[SCHEMA_URI + "email"] = email + author[SCHEMA.email] = 
email if url and isinstance(url, str): # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop # URLs that are blatantly invalid early, so PyLD does not crash. parsed_url = urllib.parse.urlparse(url) if parsed_url.netloc: - author[SCHEMA_URI + "url"] = {"@id": url} + author[SCHEMA.url] = {"@id": url} return {"@list": [author]} diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py --- a/swh/indexer/metadata_dictionary/nuget.py +++ b/swh/indexer/metadata_dictionary/nuget.py @@ -9,7 +9,8 @@ import xmltodict -from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable +from swh.indexer.codemeta import _DATA_DIR, _read_crosstable +from swh.indexer.namespaces import SCHEMA from swh.indexer.storage.interface import Sha1 from .base import DictMapping, DirectoryLsEntry, SingleFileIntrinsicMapping @@ -93,8 +94,7 @@ if isinstance(s, str): author_names = [a.strip() for a in s.split(",")] authors = [ - {"@type": SCHEMA_URI + "Person", SCHEMA_URI + "name": name} - for name in author_names + {"@type": SCHEMA.Person, SCHEMA.name: name} for name in author_names ] return {"@list": authors} diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py --- a/swh/indexer/metadata_dictionary/python.py +++ b/swh/indexer/metadata_dictionary/python.py @@ -7,7 +7,8 @@ import email.policy import itertools -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from swh.indexer.codemeta import CROSSWALK_TABLE +from swh.indexer.namespaces import SCHEMA from .base import DictMapping, SingleFileIntrinsicMapping @@ -52,15 +53,13 @@ if value != "UNKNOWN": d.setdefault(key, []).append(value) metadata = self._translate_dict(d, normalize=False) - if SCHEMA_URI + "author" in metadata or SCHEMA_URI + "email" in metadata: - metadata[SCHEMA_URI + "author"] = { + if SCHEMA.author in metadata or SCHEMA.email in metadata: + metadata[SCHEMA.author] = { "@list": [ { - "@type": SCHEMA_URI + 
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


class _Namespace:
    """Handy class to get terms within a namespace by accessing them as attributes.

    ``ns.term`` evaluates to the namespace URI with ``term`` appended, e.g.
    ``_Namespace("http://schema.org/").name == "http://schema.org/name"``.
    This is similar to rdflib's namespaces.

    Names starting with an underscore are never treated as terms: looking up
    a missing one raises :exc:`AttributeError`, like on any regular object.
    This keeps protocol probes made by the standard library (``copy``,
    ``pickle``, ``hasattr`` checks for ``__deepcopy__``/``__setstate__``, ...)
    working. The namespace URI itself is available as ``_uri``.
    """

    def __init__(self, uri: str):
        """Create a namespace rooted at ``uri``.

        Raises:
            ValueError: if ``uri`` does not end with ``#`` or ``/``.
        """
        if not uri.endswith(("#", "/")):
            # Sanity check, to make sure it doesn't end with an alphanumerical
            # character, which is very likely to be invalid.
            raise ValueError(f"Invalid trailing character for namespace URI: {uri}")
        self._uri = uri

    def __getattr__(self, term: str) -> str:
        """Return the full URI of ``term`` within this namespace.

        Only called when regular attribute lookup fails, so ``_uri`` itself
        does not normally go through this method.
        """
        if term.startswith("_"):
            # Dunder/private lookups are protocol probes, not vocabulary terms.
            # Answering them with a string makes callers try to call a ``str``,
            # and when ``_uri`` is not set yet (instance being rebuilt by
            # copy/pickle) the ``self._uri`` access below would re-enter this
            # method and recurse without bound.
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {term!r}"
            )
        return self._uri + term


# Namespaces of the vocabularies used by the indexer.
SCHEMA = _Namespace("http://schema.org/")
CODEMETA = _Namespace("https://codemeta.github.io/terms/")
FORGEFED = _Namespace("https://forgefed.org/ns#")
ACTIVITYSTREAMS = _Namespace("https://www.w3.org/ns/activitystreams#")