D8263.diff
No One · Temporary
Actions

Size

18 KB

Subscribers

None

D8263.diff
View Options

	diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
	--- a/swh/indexer/codemeta.py
	+++ b/swh/indexer/codemeta.py
	@@ -14,6 +14,7 @@
	from pyld import jsonld

	import swh.indexer
	+from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA

	_DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), "data")

	@@ -34,18 +35,14 @@
	CODEMETA_ALTERNATE_CONTEXT_URLS = {
	("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
	}
	-CODEMETA_URI = "https://codemeta.github.io/terms/"
	-SCHEMA_URI = "http://schema.org/"
	-FORGEFED_URI = "https://forgefed.org/ns#"
	-ACTIVITYSTREAMS_URI = "https://www.w3.org/ns/activitystreams#"


	PROPERTY_BLACKLIST = {
	# CodeMeta properties that we cannot properly represent.
	- SCHEMA_URI + "softwareRequirements",
	- CODEMETA_URI + "softwareSuggestions",
	+ SCHEMA.softwareRequirements,
	+ CODEMETA.softwareSuggestions,
	# Duplicate of 'author'
	- SCHEMA_URI + "creator",
	+ SCHEMA.creator,
	}

	_codemeta_field_separator = re.compile(r"\s[,/]\s")
	@@ -64,7 +61,7 @@
	uri = jsonld.JsonLdProcessor.get_context_value(
	_PROCESSED_CODEMETA_CONTEXT, local_name, "@id"
	)
	- assert uri.startswith(("@", CODEMETA_URI, SCHEMA_URI)), (local_name, uri)
	+ assert uri.startswith(("@", CODEMETA._uri, SCHEMA._uri)), (local_name, uri)
	return uri


	@@ -115,10 +112,10 @@
	"documentUrl": url,
	"document": CODEMETA_CONTEXT,
	}
	- elif url == CODEMETA_URI:
	+ elif url == CODEMETA._uri:
	raise Exception(
	"{} is CodeMeta's URI, use {} as context url".format(
	- CODEMETA_URI, CODEMETA_CONTEXT_URL
	+ CODEMETA._uri, CODEMETA_CONTEXT_URL
	)
	)
	else:
	@@ -135,7 +132,7 @@
	"""
	contexts: List[Any] = [CODEMETA_CONTEXT_URL]
	if forgefed:
	- contexts.append({"as": ACTIVITYSTREAMS_URI, "forge": FORGEFED_URI})
	+ contexts.append({"as": ACTIVITYSTREAMS._uri, "forge": FORGEFED._uri})
	return jsonld.compact(doc, contexts, options={"documentLoader": _document_loader})


	@@ -195,8 +192,8 @@
	if "@id" not in merged_document:
	merged_document["@id"] = value
	elif value != merged_document["@id"]:
	- if value not in merged_document[SCHEMA_URI + "sameAs"]:
	- merged_document[SCHEMA_URI + "sameAs"].append(value)
	+ if value not in merged_document[SCHEMA.sameAs]:
	+ merged_document[SCHEMA.sameAs].append(value)
	else:
	for value in values:
	if isinstance(value, dict) and set(value) == {"@list"}:
	diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
	--- a/swh/indexer/metadata_dictionary/base.py
	+++ b/swh/indexer/metadata_dictionary/base.py
	@@ -10,7 +10,8 @@
	from typing_extensions import TypedDict
	import yaml

	-from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values
	+from swh.indexer.codemeta import compact, merge_values
	+from swh.indexer.namespaces import SCHEMA
	from swh.indexer.storage.interface import Sha1


	@@ -26,16 +27,14 @@
	)


	-def produce_terms(
	- namespace: str, terms: List[str]
	-) -> Callable[[TTranslateCallable], TTranslateCallable]:
	+def produce_terms(*uris: str) -> Callable[[TTranslateCallable], TTranslateCallable]:
	"""Returns a decorator that marks the decorated function as adding
	the given terms to the ``translated_metadata`` dict"""

	def decorator(f: TTranslateCallable) -> TTranslateCallable:
	if not hasattr(f, "produced_terms"):
	f.produced_terms = [] # type: ignore
	- f.produced_terms.extend(namespace + term for term in terms) # type: ignore
	+ f.produced_terms.extend(uris) # type: ignore
	return f

	return decorator
	@@ -175,7 +174,7 @@
	the indexer

	"""
	- translated_metadata = {"@type": SCHEMA_URI + "SoftwareSourceCode"}
	+ translated_metadata = {"@type": SCHEMA.SoftwareSourceCode}
	for k, v in content_dict.items():
	# First, check if there is a specific translation
	# method for this key
	diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
	--- a/swh/indexer/metadata_dictionary/cff.py
	+++ b/swh/indexer/metadata_dictionary/cff.py
	@@ -1,6 +1,7 @@
	from typing import Dict, List, Optional, Union

	-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
	+from swh.indexer.codemeta import CROSSWALK_TABLE
	+from swh.indexer.namespaces import SCHEMA

	from .base import YamlMapping

	@@ -17,19 +18,19 @@
	result = []
	for author in d:
	author_data: Dict[str, Optional[Union[str, Dict]]] = {
	- "@type": SCHEMA_URI + "Person"
	+ "@type": SCHEMA.Person
	}
	if "orcid" in author and isinstance(author["orcid"], str):
	author_data["@id"] = author["orcid"]
	if "affiliation" in author and isinstance(author["affiliation"], str):
	- author_data[SCHEMA_URI + "affiliation"] = {
	- "@type": SCHEMA_URI + "Organization",
	- SCHEMA_URI + "name": author["affiliation"],
	+ author_data[SCHEMA.affiliation] = {
	+ "@type": SCHEMA.Organization,
	+ SCHEMA.name: author["affiliation"],
	}
	if "family-names" in author and isinstance(author["family-names"], str):
	- author_data[SCHEMA_URI + "familyName"] = author["family-names"]
	+ author_data[SCHEMA.familyName] = author["family-names"]
	if "given-names" in author and isinstance(author["given-names"], str):
	- author_data[SCHEMA_URI + "givenName"] = author["given-names"]
	+ author_data[SCHEMA.givenName] = author["given-names"]

	result.append(author_data)

	@@ -50,4 +51,4 @@

	def normalize_date_released(self, s: str) -> Dict[str, str]:
	if isinstance(s, str):
	- return {"@value": s, "@type": SCHEMA_URI + "Date"}
	+ return {"@value": s, "@type": SCHEMA.Date}
	diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py
	--- a/swh/indexer/metadata_dictionary/composer.py
	+++ b/swh/indexer/metadata_dictionary/composer.py
	@@ -5,7 +5,8 @@

	import os.path

	-from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable
	+from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
	+from swh.indexer.namespaces import SCHEMA

	from .base import JsonMapping, SingleFileIntrinsicMapping

	@@ -43,13 +44,13 @@
	def normalize_authors(self, author_list):
	authors = []
	for author in author_list:
	- author_obj = {"@type": SCHEMA_URI + "Person"}
	+ author_obj = {"@type": SCHEMA.Person}

	if isinstance(author, dict):
	if isinstance(author.get("name", None), str):
	- author_obj[SCHEMA_URI + "name"] = author.get("name", None)
	+ author_obj[SCHEMA.name] = author.get("name", None)
	if isinstance(author.get("email", None), str):
	- author_obj[SCHEMA_URI + "email"] = author.get("email", None)
	+ author_obj[SCHEMA.email] = author.get("email", None)

	authors.append(author_obj)

	diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py
	--- a/swh/indexer/metadata_dictionary/dart.py
	+++ b/swh/indexer/metadata_dictionary/dart.py
	@@ -6,7 +6,8 @@
	import os.path
	import re

	-from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable
	+from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
	+from swh.indexer.namespaces import SCHEMA

	from .base import YamlMapping

	@@ -18,8 +19,8 @@

	def name_to_person(name):
	return {
	- "@type": SCHEMA_URI + "Person",
	- SCHEMA_URI + "name": name,
	+ "@type": SCHEMA.Person,
	+ SCHEMA.name: name,
	}


	@@ -50,17 +51,17 @@

	def normalize_author(self, s):
	name_email_regex = "(?P<name>.*?)( <(?P<email>.*)>)"
	- author = {"@type": SCHEMA_URI + "Person"}
	+ author = {"@type": SCHEMA.Person}
	if isinstance(s, str):
	match = re.search(name_email_regex, s)
	if match:
	name = match.group("name")
	email = match.group("email")
	- author[SCHEMA_URI + "email"] = email
	+ author[SCHEMA.email] = email
	else:
	name = s

	- author[SCHEMA_URI + "name"] = name
	+ author[SCHEMA.name] = name

	return {"@list": [author]}

	diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
	--- a/swh/indexer/metadata_dictionary/github.py
	+++ b/swh/indexer/metadata_dictionary/github.py
	@@ -6,7 +6,8 @@
	import json
	from typing import Any, Dict, Tuple

	-from swh.indexer.codemeta import ACTIVITYSTREAMS_URI, CROSSWALK_TABLE, FORGEFED_URI
	+from swh.indexer.codemeta import CROSSWALK_TABLE
	+from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED

	from .base import BaseExtrinsicMapping, JsonMapping, produce_terms

	@@ -34,11 +35,10 @@

	def _translate_dict(self, content_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]:
	d = super()._translate_dict(content_dict, **kwargs)
	- d["type"] = FORGEFED_URI + "Repository"
	+ d["type"] = FORGEFED.Repository
	return d

	- @produce_terms(FORGEFED_URI, ["forks"])
	- @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"])
	+ @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems)
	def translate_forks_count(
	self, translated_metadata: Dict[str, Any], v: Any
	) -> None:
	@@ -57,15 +57,14 @@
	}
	"""
	if isinstance(v, int):
	- translated_metadata.setdefault(FORGEFED_URI + "forks", []).append(
	+ translated_metadata.setdefault(FORGEFED.forks, []).append(
	{
	- "@type": ACTIVITYSTREAMS_URI + "OrderedCollection",
	- ACTIVITYSTREAMS_URI + "totalItems": v,
	+ "@type": ACTIVITYSTREAMS.OrderedCollection,
	+ ACTIVITYSTREAMS.totalItems: v,
	}
	)

	- @produce_terms(ACTIVITYSTREAMS_URI, ["likes"])
	- @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"])
	+ @produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems)
	def translate_stargazers_count(
	self, translated_metadata: Dict[str, Any], v: Any
	) -> None:
	@@ -84,15 +83,14 @@
	}
	"""
	if isinstance(v, int):
	- translated_metadata.setdefault(ACTIVITYSTREAMS_URI + "likes", []).append(
	+ translated_metadata.setdefault(ACTIVITYSTREAMS.likes, []).append(
	{
	- "@type": ACTIVITYSTREAMS_URI + "Collection",
	- ACTIVITYSTREAMS_URI + "totalItems": v,
	+ "@type": ACTIVITYSTREAMS.Collection,
	+ ACTIVITYSTREAMS.totalItems: v,
	}
	)

	- @produce_terms(ACTIVITYSTREAMS_URI, ["followers"])
	- @produce_terms(ACTIVITYSTREAMS_URI, ["totalItems"])
	+ @produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems)
	def translate_watchers_count(
	self, translated_metadata: Dict[str, Any], v: Any
	) -> None:
	@@ -111,12 +109,10 @@
	}
	"""
	if isinstance(v, int):
	- translated_metadata.setdefault(
	- ACTIVITYSTREAMS_URI + "followers", []
	- ).append(
	+ translated_metadata.setdefault(ACTIVITYSTREAMS.followers, []).append(
	{
	- "@type": ACTIVITYSTREAMS_URI + "Collection",
	- ACTIVITYSTREAMS_URI + "totalItems": v,
	+ "@type": ACTIVITYSTREAMS.Collection,
	+ ACTIVITYSTREAMS.totalItems: v,
	}
	)

	diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
	--- a/swh/indexer/metadata_dictionary/maven.py
	+++ b/swh/indexer/metadata_dictionary/maven.py
	@@ -9,7 +9,8 @@

	import xmltodict

	-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
	+from swh.indexer.codemeta import CROSSWALK_TABLE
	+from swh.indexer.namespaces import SCHEMA

	from .base import DictMapping, SingleFileIntrinsicMapping

	@@ -41,8 +42,8 @@
	self.log.warning("Skipping ill-formed XML content: %s", content)
	return None
	metadata = self._translate_dict(d, normalize=False)
	- metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d)
	- metadata[SCHEMA_URI + "license"] = self.parse_licenses(d)
	+ metadata[SCHEMA.codeRepository] = self.parse_repositories(d)
	+ metadata[SCHEMA.license] = self.parse_licenses(d)
	return self.normalize_translation(metadata)

	_default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
	diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
	--- a/swh/indexer/metadata_dictionary/npm.py
	+++ b/swh/indexer/metadata_dictionary/npm.py
	@@ -6,7 +6,8 @@
	import re
	import urllib.parse

	-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
	+from swh.indexer.codemeta import CROSSWALK_TABLE
	+from swh.indexer.namespaces import SCHEMA

	from .base import JsonMapping, SingleFileIntrinsicMapping

	@@ -120,7 +121,7 @@
	'http://schema.org/email': 'john.doe@example.org',
	'http://schema.org/name': 'John Doe'}]}
	""" # noqa
	- author = {"@type": SCHEMA_URI + "Person"}
	+ author = {"@type": SCHEMA.Person}
	if isinstance(d, dict):
	name = d.get("name", None)
	email = d.get("email", None)
	@@ -136,15 +137,15 @@
	return None

	if name and isinstance(name, str):
	- author[SCHEMA_URI + "name"] = name
	+ author[SCHEMA.name] = name
	if email and isinstance(email, str):
	- author[SCHEMA_URI + "email"] = email
	+ author[SCHEMA.email] = email
	if url and isinstance(url, str):
	# Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
	# URLs that are blatantly invalid early, so PyLD does not crash.
	parsed_url = urllib.parse.urlparse(url)
	if parsed_url.netloc:
	- author[SCHEMA_URI + "url"] = {"@id": url}
	+ author[SCHEMA.url] = {"@id": url}

	return {"@list": [author]}

	diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
	--- a/swh/indexer/metadata_dictionary/nuget.py
	+++ b/swh/indexer/metadata_dictionary/nuget.py
	@@ -9,7 +9,8 @@

	import xmltodict

	-from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable
	+from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
	+from swh.indexer.namespaces import SCHEMA
	from swh.indexer.storage.interface import Sha1

	from .base import DictMapping, DirectoryLsEntry, SingleFileIntrinsicMapping
	@@ -93,8 +94,7 @@
	if isinstance(s, str):
	author_names = [a.strip() for a in s.split(",")]
	authors = [
	- {"@type": SCHEMA_URI + "Person", SCHEMA_URI + "name": name}
	- for name in author_names
	+ {"@type": SCHEMA.Person, SCHEMA.name: name} for name in author_names
	]
	return {"@list": authors}

	diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py
	--- a/swh/indexer/metadata_dictionary/python.py
	+++ b/swh/indexer/metadata_dictionary/python.py
	@@ -7,7 +7,8 @@
	import email.policy
	import itertools

	-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
	+from swh.indexer.codemeta import CROSSWALK_TABLE
	+from swh.indexer.namespaces import SCHEMA

	from .base import DictMapping, SingleFileIntrinsicMapping

	@@ -52,15 +53,13 @@
	if value != "UNKNOWN":
	d.setdefault(key, []).append(value)
	metadata = self._translate_dict(d, normalize=False)
	- if SCHEMA_URI + "author" in metadata or SCHEMA_URI + "email" in metadata:
	- metadata[SCHEMA_URI + "author"] = {
	+ if SCHEMA.author in metadata or SCHEMA.email in metadata:
	+ metadata[SCHEMA.author] = {
	"@list": [
	{
	- "@type": SCHEMA_URI + "Person",
	- SCHEMA_URI
	- + "name": metadata.pop(SCHEMA_URI + "author", [None])[0],
	- SCHEMA_URI
	- + "email": metadata.pop(SCHEMA_URI + "email", [None])[0],
	+ "@type": SCHEMA.Person,
	+ SCHEMA.name: metadata.pop(SCHEMA.author, [None])[0],
	+ SCHEMA.email: metadata.pop(SCHEMA.email, [None])[0],
	}
	]
	}
	diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
	--- a/swh/indexer/metadata_dictionary/ruby.py
	+++ b/swh/indexer/metadata_dictionary/ruby.py
	@@ -8,8 +8,9 @@
	import re
	from typing import List

	-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
	+from swh.indexer.codemeta import CROSSWALK_TABLE
	from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
	+from swh.indexer.namespaces import SCHEMA
	from swh.indexer.storage.interface import Sha1

	from .base import BaseIntrinsicMapping, DictMapping
	@@ -17,8 +18,8 @@

	def name_to_person(name):
	return {
	- "@type": SCHEMA_URI + "Person",
	- SCHEMA_URI + "name": name,
	+ "@type": SCHEMA.Person,
	+ SCHEMA.name: name,
	}


	diff --git a/swh/indexer/namespaces.py b/swh/indexer/namespaces.py
	new file mode 100644
	--- /dev/null
	+++ b/swh/indexer/namespaces.py
	@@ -0,0 +1,28 @@
	+# Copyright (C) 2022 The Software Heritage developers
	+# See the AUTHORS file at the top-level directory of this distribution
	+# License: GNU General Public License version 3, or any later version
	+# See top-level LICENSE file for more information
	+
	+
	+class _Namespace:
	+ """Handy class to get terms within a namespace by accessing them as attributes.
	+
	+ This is similar to `rdflib's namespaces
	+ <https://rdflib.readthedocs.io/en/stable/namespaces_and_bindings.html>`__
	+ """
	+
	+ def __init__(self, uri: str):
	+ if not uri.endswith(("#", "/")):
	+ # Sanity check, to make sure it doesn't end with an alphanumerical
	+ # character, which is very likely to be invalid.
	+ raise ValueError(f"Invalid trailing character for namespace URI: {uri}")
	+ self._uri = uri
	+
	+ def __getattr__(self, term: str) -> str:
	+ return self._uri + term
	+
	+
	+SCHEMA = _Namespace("http://schema.org/")
	+CODEMETA = _Namespace("https://codemeta.github.io/terms/")
	+FORGEFED = _Namespace("https://forgefed.org/ns#")
	+ACTIVITYSTREAMS = _Namespace("https://www.w3.org/ns/activitystreams#")

File Metadata

Mime Type: text/plain
Expires: Wed, Dec 18, 4:01 AM (1 d, 22 h ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3219242

D8263.diff · No One · Temporary · Actions

D8263.diffView Options

File Metadata

Event Timeline

D8263.diff
No One · Temporary
Actions

D8263.diff
View Options