Page Menu | Home | Software Heritage

D8263.diff
No One | Temporary

D8263.diff

diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -14,6 +14,7 @@
from pyld import jsonld
import swh.indexer
+from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA
_DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), "data")
@@ -34,18 +35,14 @@
CODEMETA_ALTERNATE_CONTEXT_URLS = {
("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
}
-CODEMETA_URI = "https://codemeta.github.io/terms/"
-SCHEMA_URI = "http://schema.org/"
-FORGEFED_URI = "https://forgefed.org/ns#"
-ACTIVITYSTREAMS_URI = "https://www.w3.org/ns/activitystreams#"
PROPERTY_BLACKLIST = {
# CodeMeta properties that we cannot properly represent.
- SCHEMA_URI + "softwareRequirements",
- CODEMETA_URI + "softwareSuggestions",
+ SCHEMA.softwareRequirements,
+ CODEMETA.softwareSuggestions,
# Duplicate of 'author'
- SCHEMA_URI + "creator",
+ SCHEMA.creator,
}
_codemeta_field_separator = re.compile(r"\s*[,/]\s*")
@@ -64,7 +61,7 @@
uri = jsonld.JsonLdProcessor.get_context_value(
_PROCESSED_CODEMETA_CONTEXT, local_name, "@id"
)
- assert uri.startswith(("@", CODEMETA_URI, SCHEMA_URI)), (local_name, uri)
+ assert uri.startswith(("@", CODEMETA._uri, SCHEMA._uri)), (local_name, uri)
return uri
@@ -115,10 +112,10 @@
"documentUrl": url,
"document": CODEMETA_CONTEXT,
}
- elif url == CODEMETA_URI:
+ elif url == CODEMETA._uri:
raise Exception(
"{} is CodeMeta's URI, use {} as context url".format(
- CODEMETA_URI, CODEMETA_CONTEXT_URL
+ CODEMETA._uri, CODEMETA_CONTEXT_URL
)
)
else:
@@ -135,7 +132,7 @@
"""
contexts: List[Any] = [CODEMETA_CONTEXT_URL]
if forgefed:
- contexts.append({"as": ACTIVITYSTREAMS_URI, "forge": FORGEFED_URI})
+ contexts.append({"as": ACTIVITYSTREAMS._uri, "forge": FORGEFED._uri})
return jsonld.compact(doc, contexts, options={"documentLoader": _document_loader})
@@ -195,8 +192,8 @@
if "@id" not in merged_document:
merged_document["@id"] = value
elif value != merged_document["@id"]:
- if value not in merged_document[SCHEMA_URI + "sameAs"]:
- merged_document[SCHEMA_URI + "sameAs"].append(value)
+ if value not in merged_document[SCHEMA.sameAs]:
+ merged_document[SCHEMA.sameAs].append(value)
else:
for value in values:
if isinstance(value, dict) and set(value) == {"@list"}:
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -10,7 +10,8 @@
from typing_extensions import TypedDict
import yaml
-from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values
+from swh.indexer.codemeta import compact, merge_values
+from swh.indexer.namespaces import SCHEMA
from swh.indexer.storage.interface import Sha1
@@ -26,16 +27,14 @@
)
-def produce_terms(
- namespace: str, terms: List[str]
-) -> Callable[[TTranslateCallable], TTranslateCallable]:
+def produce_terms(*uris: str) -> Callable[[TTranslateCallable], TTranslateCallable]:
"""Returns a decorator that marks the decorated function as adding
the given terms to the ``translated_metadata`` dict"""
def decorator(f: TTranslateCallable) -> TTranslateCallable:
if not hasattr(f, "produced_terms"):
f.produced_terms = [] # type: ignore
- f.produced_terms.extend(namespace + term for term in terms) # type: ignore
+ f.produced_terms.extend(uris) # type: ignore
return f
return decorator
@@ -175,7 +174,7 @@
the indexer
"""
- translated_metadata = {"@type": SCHEMA_URI + "SoftwareSourceCode"}
+ translated_metadata = {"@type": SCHEMA.SoftwareSourceCode}
for k, v in content_dict.items():
# First, check if there is a specific translation
# method for this key
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -1,6 +1,7 @@
from typing import Dict, List, Optional, Union
-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
+from swh.indexer.codemeta import CROSSWALK_TABLE
+from swh.indexer.namespaces import SCHEMA
from .base import YamlMapping
@@ -17,19 +18,19 @@
result = []
for author in d:
author_data: Dict[str, Optional[Union[str, Dict]]] = {
- "@type": SCHEMA_URI + "Person"
+ "@type": SCHEMA.Person
}
if "orcid" in author and isinstance(author["orcid"], str):
author_data["@id"] = author["orcid"]
if "affiliation" in author and isinstance(author["affiliation"], str):
- author_data[SCHEMA_URI + "affiliation"] = {
- "@type": SCHEMA_URI + "Organization",
- SCHEMA_URI + "name": author["affiliation"],
+ author_data[SCHEMA.affiliation] = {
+ "@type": SCHEMA.Organization,
+ SCHEMA.name: author["affiliation"],
}
if "family-names" in author and isinstance(author["family-names"], str):
- author_data[SCHEMA_URI + "familyName"] = author["family-names"]
+ author_data[SCHEMA.familyName] = author["family-names"]
if "given-names" in author and isinstance(author["given-names"], str):
- author_data[SCHEMA_URI + "givenName"] = author["given-names"]
+ author_data[SCHEMA.givenName] = author["given-names"]
result.append(author_data)
@@ -50,4 +51,4 @@
def normalize_date_released(self, s: str) -> Dict[str, str]:
if isinstance(s, str):
- return {"@value": s, "@type": SCHEMA_URI + "Date"}
+ return {"@value": s, "@type": SCHEMA.Date}
diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py
--- a/swh/indexer/metadata_dictionary/composer.py
+++ b/swh/indexer/metadata_dictionary/composer.py
@@ -5,7 +5,8 @@
import os.path
-from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable
+from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
+from swh.indexer.namespaces import SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
@@ -43,13 +44,13 @@
def normalize_authors(self, author_list):
authors = []
for author in author_list:
- author_obj = {"@type": SCHEMA_URI + "Person"}
+ author_obj = {"@type": SCHEMA.Person}
if isinstance(author, dict):
if isinstance(author.get("name", None), str):
- author_obj[SCHEMA_URI + "name"] = author.get("name", None)
+ author_obj[SCHEMA.name] = author.get("name", None)
if isinstance(author.get("email", None), str):
- author_obj[SCHEMA_URI + "email"] = author.get("email", None)
+ author_obj[SCHEMA.email] = author.get("email", None)
authors.append(author_obj)
diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py
--- a/swh/indexer/metadata_dictionary/dart.py
+++ b/swh/indexer/metadata_dictionary/dart.py
@@ -6,7 +6,8 @@
import os.path
import re
-from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable
+from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
+from swh.indexer.namespaces import SCHEMA
from .base import YamlMapping
@@ -18,8 +19,8 @@
def name_to_person(name):
return {
- "@type": SCHEMA_URI + "Person",
- SCHEMA_URI + "name": name,
+ "@type": SCHEMA.Person,
+ SCHEMA.name: name,
}
@@ -50,17 +51,17 @@
def normalize_author(self, s):
name_email_regex = "(?P<name>.*?)( <(?P<email>.*)>)"
- author = {"@type": SCHEMA_URI + "Person"}
+ author = {"@type": SCHEMA.Person}
if isinstance(s, str):
match = re.search(name_email_regex, s)
if match:
name = match.group("name")
email = match.group("email")
- author[SCHEMA_URI + "email"] = email
+ author[SCHEMA.email] = email
else:
name = s
- author[SCHEMA_URI + "name"] = name
+ author[SCHEMA.name] = name
return {"@list": [author]}
diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
--- a/swh/indexer/metadata_dictionary/github.py
+++ b/swh/indexer/metadata_dictionary/github.py
@@ -6,7 +6,8 @@
import json
from typing import Any, Dict, Tuple
-from swh.indexer.codemeta import ACTIVITYSTREAMS_URI, CROSSWALK_TABLE, FORGEFED_URI
+from swh.indexer.codemeta import CROSSWALK_TABLE
+from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED
from .base import BaseExtrinsicMapping, JsonMapping, produce_terms
@@ -34,11 +35,10 @@
def _translate_dict(self, content_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]:
d = super()._translate_dict(content_dict, **kwargs)
- d["type"] = FORGEFED_URI + "Repository"
+ d["type"] = FORGEFED.Repository
return d
- @produce_terms(FORGEFED_URI, ["forks"])
- @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"])
+ @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems)
def translate_forks_count(
self, translated_metadata: Dict[str, Any], v: Any
) -> None:
@@ -57,15 +57,14 @@
}
"""
if isinstance(v, int):
- translated_metadata.setdefault(FORGEFED_URI + "forks", []).append(
+ translated_metadata.setdefault(FORGEFED.forks, []).append(
{
- "@type": ACTIVITYSTREAMS_URI + "OrderedCollection",
- ACTIVITYSTREAMS_URI + "totalItems": v,
+ "@type": ACTIVITYSTREAMS.OrderedCollection,
+ ACTIVITYSTREAMS.totalItems: v,
}
)
- @produce_terms(ACTIVITYSTREAMS_URI, ["likes"])
- @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"])
+ @produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems)
def translate_stargazers_count(
self, translated_metadata: Dict[str, Any], v: Any
) -> None:
@@ -84,15 +83,14 @@
}
"""
if isinstance(v, int):
- translated_metadata.setdefault(ACTIVITYSTREAMS_URI + "likes", []).append(
+ translated_metadata.setdefault(ACTIVITYSTREAMS.likes, []).append(
{
- "@type": ACTIVITYSTREAMS_URI + "Collection",
- ACTIVITYSTREAMS_URI + "totalItems": v,
+ "@type": ACTIVITYSTREAMS.Collection,
+ ACTIVITYSTREAMS.totalItems: v,
}
)
- @produce_terms(ACTIVITYSTREAMS_URI, ["followers"])
- @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"])
+ @produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems)
def translate_watchers_count(
self, translated_metadata: Dict[str, Any], v: Any
) -> None:
@@ -111,12 +109,10 @@
}
"""
if isinstance(v, int):
- translated_metadata.setdefault(
- ACTIVITYSTREAMS_URI + "followers", []
- ).append(
+ translated_metadata.setdefault(ACTIVITYSTREAMS.followers, []).append(
{
- "@type": ACTIVITYSTREAMS_URI + "Collection",
- ACTIVITYSTREAMS_URI + "totalItems": v,
+ "@type": ACTIVITYSTREAMS.Collection,
+ ACTIVITYSTREAMS.totalItems: v,
}
)
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -9,7 +9,8 @@
import xmltodict
-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
+from swh.indexer.codemeta import CROSSWALK_TABLE
+from swh.indexer.namespaces import SCHEMA
from .base import DictMapping, SingleFileIntrinsicMapping
@@ -41,8 +42,8 @@
self.log.warning("Skipping ill-formed XML content: %s", content)
return None
metadata = self._translate_dict(d, normalize=False)
- metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d)
- metadata[SCHEMA_URI + "license"] = self.parse_licenses(d)
+ metadata[SCHEMA.codeRepository] = self.parse_repositories(d)
+ metadata[SCHEMA.license] = self.parse_licenses(d)
return self.normalize_translation(metadata)
_default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -6,7 +6,8 @@
import re
import urllib.parse
-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
+from swh.indexer.codemeta import CROSSWALK_TABLE
+from swh.indexer.namespaces import SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
@@ -120,7 +121,7 @@
'http://schema.org/email': 'john.doe@example.org',
'http://schema.org/name': 'John Doe'}]}
""" # noqa
- author = {"@type": SCHEMA_URI + "Person"}
+ author = {"@type": SCHEMA.Person}
if isinstance(d, dict):
name = d.get("name", None)
email = d.get("email", None)
@@ -136,15 +137,15 @@
return None
if name and isinstance(name, str):
- author[SCHEMA_URI + "name"] = name
+ author[SCHEMA.name] = name
if email and isinstance(email, str):
- author[SCHEMA_URI + "email"] = email
+ author[SCHEMA.email] = email
if url and isinstance(url, str):
# Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
# URLs that are blatantly invalid early, so PyLD does not crash.
parsed_url = urllib.parse.urlparse(url)
if parsed_url.netloc:
- author[SCHEMA_URI + "url"] = {"@id": url}
+ author[SCHEMA.url] = {"@id": url}
return {"@list": [author]}
diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
--- a/swh/indexer/metadata_dictionary/nuget.py
+++ b/swh/indexer/metadata_dictionary/nuget.py
@@ -9,7 +9,8 @@
import xmltodict
-from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable
+from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
+from swh.indexer.namespaces import SCHEMA
from swh.indexer.storage.interface import Sha1
from .base import DictMapping, DirectoryLsEntry, SingleFileIntrinsicMapping
@@ -93,8 +94,7 @@
if isinstance(s, str):
author_names = [a.strip() for a in s.split(",")]
authors = [
- {"@type": SCHEMA_URI + "Person", SCHEMA_URI + "name": name}
- for name in author_names
+ {"@type": SCHEMA.Person, SCHEMA.name: name} for name in author_names
]
return {"@list": authors}
diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py
--- a/swh/indexer/metadata_dictionary/python.py
+++ b/swh/indexer/metadata_dictionary/python.py
@@ -7,7 +7,8 @@
import email.policy
import itertools
-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
+from swh.indexer.codemeta import CROSSWALK_TABLE
+from swh.indexer.namespaces import SCHEMA
from .base import DictMapping, SingleFileIntrinsicMapping
@@ -52,15 +53,13 @@
if value != "UNKNOWN":
d.setdefault(key, []).append(value)
metadata = self._translate_dict(d, normalize=False)
- if SCHEMA_URI + "author" in metadata or SCHEMA_URI + "email" in metadata:
- metadata[SCHEMA_URI + "author"] = {
+ if SCHEMA.author in metadata or SCHEMA.email in metadata:
+ metadata[SCHEMA.author] = {
"@list": [
{
- "@type": SCHEMA_URI + "Person",
- SCHEMA_URI
- + "name": metadata.pop(SCHEMA_URI + "author", [None])[0],
- SCHEMA_URI
- + "email": metadata.pop(SCHEMA_URI + "email", [None])[0],
+ "@type": SCHEMA.Person,
+ SCHEMA.name: metadata.pop(SCHEMA.author, [None])[0],
+ SCHEMA.email: metadata.pop(SCHEMA.email, [None])[0],
}
]
}
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
--- a/swh/indexer/metadata_dictionary/ruby.py
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -8,8 +8,9 @@
import re
from typing import List
-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
+from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
+from swh.indexer.namespaces import SCHEMA
from swh.indexer.storage.interface import Sha1
from .base import BaseIntrinsicMapping, DictMapping
@@ -17,8 +18,8 @@
def name_to_person(name):
return {
- "@type": SCHEMA_URI + "Person",
- SCHEMA_URI + "name": name,
+ "@type": SCHEMA.Person,
+ SCHEMA.name: name,
}
diff --git a/swh/indexer/namespaces.py b/swh/indexer/namespaces.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/namespaces.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+class _Namespace:
+    """Handy class to get terms within a namespace by accessing them as attributes.
+
+    This is similar to `rdflib's namespaces
+    <https://rdflib.readthedocs.io/en/stable/namespaces_and_bindings.html>`__
+
+    Terms starting with an underscore are not resolved: looking one up raises
+    :exc:`AttributeError`, as it would on any ordinary object.
+    """
+
+    def __init__(self, uri: str):
+        """Store ``uri`` as the prefix prepended to every term.
+
+        Raises:
+            ValueError: if ``uri`` does not end with ``#`` or ``/``.
+        """
+        if not uri.endswith(("#", "/")):
+            # Sanity check, to make sure it doesn't end with an alphanumerical
+            # character, which is very likely to be invalid.
+            raise ValueError(f"Invalid trailing character for namespace URI: {uri}")
+        self._uri = uri
+
+    def __getattr__(self, term: str) -> str:
+        """Return the full URI of ``term`` in this namespace.
+
+        Python only calls this when regular attribute lookup fails, so ``_uri``
+        does not go through here once ``__init__`` has set it.
+
+        Raises:
+            AttributeError: if ``term`` starts with an underscore.
+        """
+        if term.startswith("_"):
+            # Underscore-prefixed names are protocol probes (``__deepcopy__``,
+            # ``__setstate__``, ...) rather than vocabulary terms. Answering
+            # them with a string makes copy.deepcopy() try to call a ``str``,
+            # and reading ``_uri`` on an instance built without ``__init__``
+            # (e.g. while unpickling) would re-enter this method forever.
+            raise AttributeError(term)
+        return self._uri + term
+
+
+SCHEMA = _Namespace("http://schema.org/")
+CODEMETA = _Namespace("https://codemeta.github.io/terms/")
+FORGEFED = _Namespace("https://forgefed.org/ns#")
+ACTIVITYSTREAMS = _Namespace("https://www.w3.org/ns/activitystreams#")

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 4:01 AM (1 d, 22 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219242

Event Timeline