Page MenuHomeSoftware Heritage

D7342.id26539.diff
No OneTemporary

D7342.id26539.diff

diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py
--- a/swh/indexer/metadata_dictionary/__init__.py
+++ b/swh/indexer/metadata_dictionary/__init__.py
@@ -1,10 +1,12 @@
import collections
+from typing import DefaultDict, Dict, Final, Set, Type
import click
from . import cff, codemeta, maven, npm, python, ruby
+from .base import BaseMapping
-MAPPINGS = {
+MAPPINGS: Final[Dict[str, Type[BaseMapping]]] = {
"CodemetaMapping": codemeta.CodemetaMapping,
"MavenMapping": maven.MavenMapping,
"NpmMapping": npm.NpmMapping,
@@ -14,7 +16,7 @@
}
-def list_terms():
+def list_terms() -> DefaultDict[str, Set[Type[BaseMapping]]]:
"""Returns a dictionary with all supported CodeMeta terms as keys,
and the mappings that support each of them as values."""
d = collections.defaultdict(set)
@@ -27,7 +29,7 @@
@click.command()
@click.argument("mapping_name")
@click.argument("file_name")
-def main(mapping_name, file_name):
+def main(mapping_name: str, file_name: str) -> None:
from pprint import pprint
with open(file_name, "rb") as fd:
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -5,11 +5,47 @@
import json
import logging
-from typing import List
+from typing import Dict, Iterable, List, Optional
+
+from typing_extensions import TypedDict
from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values
+class FileEntry(TypedDict):
+ name: bytes
+ sha1: bytes
+ sha1_git: bytes
+ target: bytes
+ length: int
+ status: str
+ type: str
+ perms: int
+ dir_id: bytes
+
+
+SchemaEntry = TypedDict("SchemaEntry", {"@id": str})
+
+Affiliation = TypedDict("Affiliation", {"@type": str, "http://schema.org/name": str})
+Author = TypedDict(
+ "Author",
+ {
+ "@type": str,
+ "@id": str,
+ "http://schema.org/name": str,
+ "http://schema.org/email": str,
+ "http://schema.org/familyName": str,
+ "http://schema.org/givenName": str,
+ "http://schema.org/affiliation": Affiliation,
+ "http://schema.org/url": SchemaEntry,
+ },
+ total=False,
+)
+Authors = TypedDict("Authors", {"@list": List[Author]})
+
+Date = TypedDict("Date", {"@value": str, "@type": str})
+
+
class BaseMapping:
"""Base class for mappings to inherit from
@@ -19,35 +55,40 @@
- override translate function
"""
- def __init__(self, log_suffix=""):
+ def __init__(self, log_suffix: str = ""):
self.log_suffix = log_suffix
self.log = logging.getLogger(
"%s.%s" % (self.__class__.__module__, self.__class__.__name__)
)
@property
- def name(self):
+ def name(self) -> str:
"""A name of this mapping, used as an identifier in the
indexer storage."""
raise NotImplementedError(f"{self.__class__.__name__}.name")
@classmethod
- def detect_metadata_files(cls, files):
+ def detect_metadata_files(cls, file_entries: List[FileEntry]) -> List[bytes]:
"""
Detects files potentially containing metadata
Args:
- file_entries (list): list of files
+ file_entries: list of files
Returns:
list: list of sha1 (possibly empty)
"""
raise NotImplementedError(f"{cls.__name__}.detect_metadata_files")
- def translate(self, file_content):
+ @classmethod
+ def supported_terms(cls) -> Iterable[str]:
+ """Returns all CodeMeta terms this mapping supports"""
+ raise NotImplementedError(f"{cls.__name__}.supported_terms")
+
+ def translate(self, file_content: bytes) -> Optional[Dict]:
raise NotImplementedError(f"{self.__class__.__name__}.translate")
- def normalize_translation(self, metadata):
+ def normalize_translation(self, metadata: Dict) -> Dict:
return compact(metadata)
@@ -55,14 +96,14 @@
"""Base class for all mappings that use a single file as input."""
@property
- def filename(self):
+ def filename(self) -> bytes:
"""The .json file to extract metadata from."""
raise NotImplementedError(f"{self.__class__.__name__}.filename")
@classmethod
- def detect_metadata_files(cls, file_entries):
+ def detect_metadata_files(cls, file_entries: List[FileEntry]) -> List[bytes]:
for entry in file_entries:
- if entry["name"].lower() == cls.filename.lower():
+ if entry["name"].lower() == cls.filename.lower(): # type: ignore
return [entry["sha1"]]
return []
@@ -71,36 +112,36 @@
"""Base class for mappings that take as input a file that is mostly
a key-value store (eg. a shallow JSON dict)."""
- string_fields = [] # type: List[str]
+ string_fields: List[str] = []
"""List of fields that are simple strings, and don't need any
normalization."""
@property
- def mapping(self):
+ def mapping(self) -> Dict[str, str]:
"""A translation dict to map dict keys into a canonical name."""
raise NotImplementedError(f"{self.__class__.__name__}.mapping")
@staticmethod
- def _normalize_method_name(name):
+ def _normalize_method_name(name: str) -> str:
return name.replace("-", "_")
@classmethod
- def supported_terms(cls):
+ def supported_terms(cls) -> Iterable[str]:
return {
term
- for (key, term) in cls.mapping.items()
+ for (key, term) in cls.mapping.items() # type: ignore
if key in cls.string_fields
or hasattr(cls, "translate_" + cls._normalize_method_name(key))
or hasattr(cls, "normalize_" + cls._normalize_method_name(key))
}
- def _translate_dict(self, content_dict, *, normalize=True):
+ def _translate_dict(self, content_dict: Dict, *, normalize: bool = True) -> Dict:
"""
Translates content by parsing content from a dict object
and translating with the appropriate mapping
Args:
- content_dict (dict): content dict to translate
+ content_dict: content dict to translate
Returns:
dict: translated metadata in json-friendly form needed for
@@ -150,13 +191,13 @@
class JsonMapping(DictMapping, SingleFileMapping):
"""Base class for all mappings that use a JSON file as input."""
- def translate(self, raw_content):
+ def translate(self, raw_content: bytes) -> Optional[Dict]:
"""
Translates content by parsing content from a bytestring containing
json data and translating with the appropriate mapping
Args:
- raw_content (bytes): raw content to translate
+ raw_content: raw content to translate
Returns:
dict: translated metadata in json-friendly form needed for
@@ -164,14 +205,15 @@
"""
try:
- raw_content = raw_content.decode()
+ content: str = raw_content.decode()
except UnicodeDecodeError:
self.log.warning("Error unidecoding from %s", self.log_suffix)
- return
+ return None
try:
- content_dict = json.loads(raw_content)
+ content_dict = json.loads(content)
except json.JSONDecodeError:
self.log.warning("Error unjsoning from %s", self.log_suffix)
- return
+ return None
if isinstance(content_dict, dict):
return self._translate_dict(content_dict)
+ return None
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -1,8 +1,10 @@
+from typing import Dict, List, Optional
+
import yaml
from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CROSSWALK_TABLE, SCHEMA_URI
-from .base import DictMapping, SingleFileMapping
+from .base import Author, Authors, Date, DictMapping, SchemaEntry, SingleFileMapping
yaml.SafeLoader.yaml_implicit_resolvers = {
k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
@@ -18,17 +20,17 @@
mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
string_fields = ["keywords", "license", "abstract", "version", "doi"]
- def translate(self, raw_content):
- raw_content = raw_content.decode()
- content_dict = yaml.load(raw_content, Loader=yaml.SafeLoader)
+ def translate(self, raw_content: bytes) -> Dict:
+ content: str = raw_content.decode()
+ content_dict = yaml.load(content, Loader=yaml.SafeLoader)
metadata = self._translate_dict(content_dict)
metadata["@context"] = CODEMETA_CONTEXT_URL
return metadata
- def normalize_authors(self, d):
- result = []
+ def normalize_authors(self, d) -> Authors:
+ result: List[Author] = []
for author in d:
author_data = {"@type": SCHEMA_URI + "Person"}
if "orcid" in author:
@@ -37,29 +39,32 @@
author_data[SCHEMA_URI + "affiliation"] = {
"@type": SCHEMA_URI + "Organization",
SCHEMA_URI + "name": author["affiliation"],
- }
+ } # type: ignore
if "family-names" in author:
author_data[SCHEMA_URI + "familyName"] = author["family-names"]
if "given-names" in author:
author_data[SCHEMA_URI + "givenName"] = author["given-names"]
- result.append(author_data)
+ result.append(author_data) # type: ignore
- result = {"@list": result}
- return result
+ return {"@list": result}
- def normalize_doi(self, s):
+ def normalize_doi(self, s) -> Optional[SchemaEntry]:
if isinstance(s, str):
return {"@id": "https://doi.org/" + s}
+ return None
- def normalize_license(self, s):
+ def normalize_license(self, s) -> Optional[SchemaEntry]:
if isinstance(s, str):
return {"@id": "https://spdx.org/licenses/" + s}
+ return None
- def normalize_repository_code(self, s):
+ def normalize_repository_code(self, s) -> Optional[SchemaEntry]:
if isinstance(s, str):
return {"@id": s}
+ return None
- def normalize_date_released(self, s):
+ def normalize_date_released(self, s) -> Optional[Date]:
if isinstance(s, str):
return {"@value": s, "@type": SCHEMA_URI + "Date"}
+ return None
diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py
--- a/swh/indexer/metadata_dictionary/codemeta.py
+++ b/swh/indexer/metadata_dictionary/codemeta.py
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import json
+from typing import Dict, List, Optional
from swh.indexer.codemeta import CODEMETA_TERMS, expand
@@ -20,10 +21,10 @@
string_fields = None
@classmethod
- def supported_terms(cls):
+ def supported_terms(cls) -> List[str]:
return [term for term in CODEMETA_TERMS if not term.startswith("@")]
- def translate(self, content):
+ def translate(self, content: bytes) -> Optional[Dict]:
try:
return self.normalize_translation(expand(json.loads(content.decode())))
except Exception:
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -4,13 +4,14 @@
# See top-level LICENSE file for more information
import os
+from typing import Dict, List, Optional
import xml.parsers.expat
import xmltodict
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
-from .base import DictMapping, SingleFileMapping
+from .base import DictMapping, SchemaEntry, SingleFileMapping
class MavenMapping(DictMapping, SingleFileMapping):
@@ -46,7 +47,7 @@
_default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
- def parse_repositories(self, d):
+ def parse_repositories(self, d: Dict) -> Optional[List[Optional[SchemaEntry]]]:
"""https://maven.apache.org/pom.html#Repositories
>>> import xmltodict
@@ -75,11 +76,11 @@
results = []
return [res for res in results if res] or None
- def parse_repository(self, d, repo):
+ def parse_repository(self, d: Dict, repo) -> Optional[SchemaEntry]:
if not isinstance(repo, dict):
- return
+ return None
if repo.get("layout", "default") != "default":
- return # TODO ?
+ return None # TODO ?
url = repo.get("url")
group_id = d.get("groupId")
artifact_id = d.get("artifactId")
@@ -90,8 +91,9 @@
):
repo = os.path.join(url, *group_id.split("."), artifact_id)
return {"@id": repo}
+ return None
- def normalize_groupId(self, id_):
+ def normalize_groupId(self, id_) -> Optional[SchemaEntry]:
"""https://maven.apache.org/pom.html#Maven_Coordinates
>>> MavenMapping().normalize_groupId('org.example')
@@ -99,8 +101,9 @@
"""
if isinstance(id_, str):
return {"@id": id_}
+ return None
- def parse_licenses(self, d):
+ def parse_licenses(self, d) -> Optional[List[SchemaEntry]]:
"""https://maven.apache.org/pom.html#Licenses
>>> import xmltodict
@@ -148,12 +151,12 @@
licenses = d.get("licenses")
if not isinstance(licenses, dict):
- return
+ return None
licenses = licenses.get("license")
if isinstance(licenses, dict):
licenses = [licenses]
elif not isinstance(licenses, list):
- return
+ return None
return [
{"@id": license["url"]}
for license in licenses
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -4,10 +4,11 @@
# See top-level LICENSE file for more information
import re
+from typing import Any, Dict, List, Optional, cast
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
-from .base import JsonMapping
+from .base import Author, Authors, JsonMapping, SchemaEntry
class NpmMapping(JsonMapping):
@@ -29,7 +30,7 @@
# 'bitbucket': 'https://bitbucket.org/',
}
- def normalize_repository(self, d):
+ def normalize_repository(self, d) -> Optional[SchemaEntry]:
"""https://docs.npmjs.com/files/package.json#repository
>>> NpmMapping().normalize_repository({
@@ -67,7 +68,7 @@
return {"@id": url}
- def normalize_bugs(self, d):
+ def normalize_bugs(self, d) -> Optional[SchemaEntry]:
"""https://docs.npmjs.com/files/package.json#bugs
>>> NpmMapping().normalize_bugs({
@@ -90,7 +91,7 @@
r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
)
- def normalize_author(self, d):
+ def normalize_author(self, d) -> Optional[Authors]:
"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors'
>>> from pprint import pprint
@@ -111,7 +112,7 @@
'http://schema.org/name': 'John Doe',
'http://schema.org/url': {'@id': 'https://example.org/~john.doe'}}]}
""" # noqa
- author = {"@type": SCHEMA_URI + "Person"}
+ author: Dict[str, Any] = {"@type": SCHEMA_URI + "Person"}
if isinstance(d, dict):
name = d.get("name", None)
email = d.get("email", None)
@@ -131,9 +132,10 @@
author[SCHEMA_URI + "email"] = email
if url and isinstance(url, str):
author[SCHEMA_URI + "url"] = {"@id": url}
- return {"@list": [author]}
+ authors = [cast(Author, author)]
+ return {"@list": authors}
- def normalize_license(self, s):
+ def normalize_license(self, s) -> Optional[SchemaEntry]:
"""https://docs.npmjs.com/files/package.json#license
>>> NpmMapping().normalize_license('MIT')
@@ -141,8 +143,9 @@
"""
if isinstance(s, str):
return {"@id": "https://spdx.org/licenses/" + s}
+ return None
- def normalize_homepage(self, s):
+ def normalize_homepage(self, s) -> Optional[SchemaEntry]:
"""https://docs.npmjs.com/files/package.json#homepage
>>> NpmMapping().normalize_homepage('https://example.org/~john.doe')
@@ -150,8 +153,9 @@
"""
if isinstance(s, str):
return {"@id": s}
+ return None
- def normalize_keywords(self, lst):
+ def normalize_keywords(self, lst) -> Optional[List[str]]:
"""https://docs.npmjs.com/files/package.json#homepage
>>> NpmMapping().normalize_keywords(['foo', 'bar'])
@@ -159,3 +163,4 @@
"""
if isinstance(lst, list):
return [x for x in lst if isinstance(x, str)]
+ return None
diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py
--- a/swh/indexer/metadata_dictionary/python.py
+++ b/swh/indexer/metadata_dictionary/python.py
@@ -6,10 +6,11 @@
import email.parser
import email.policy
import itertools
+from typing import Dict, List
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
-from .base import DictMapping, SingleFileMapping
+from .base import DictMapping, SchemaEntry, SingleFileMapping
_normalize_pkginfo_key = str.lower
@@ -44,9 +45,9 @@
_parser = email.parser.BytesHeaderParser(policy=LinebreakPreservingEmailPolicy())
- def translate(self, content):
+ def translate(self, content: bytes) -> Dict:
msg = self._parser.parsebytes(content)
- d = {}
+ d: Dict[str, List[str]] = {}
for (key, value) in msg.items():
key = _normalize_pkginfo_key(key)
if value != "UNKNOWN":
@@ -66,11 +67,11 @@
}
return self.normalize_translation(metadata)
- def normalize_home_page(self, urls):
+ def normalize_home_page(self, urls: List[str]) -> List[SchemaEntry]:
return [{"@id": url} for url in urls]
- def normalize_keywords(self, keywords):
+ def normalize_keywords(self, keywords: List[str]) -> List[str]:
return list(itertools.chain.from_iterable(s.split(" ") for s in keywords))
- def normalize_license(self, licenses):
+ def normalize_license(self, licenses: List[str]) -> List[SchemaEntry]:
return [{"@id": license} for license in licenses]
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
--- a/swh/indexer/metadata_dictionary/ruby.py
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -6,16 +6,17 @@
import ast
import itertools
import re
+from typing import Dict, List, Optional, Union
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
-from .base import DictMapping
+from .base import Author, Authors, DictMapping, FileEntry, SchemaEntry
-def name_to_person(name):
+def name_to_person(name: str) -> Author:
return {
"@type": SCHEMA_URI + "Person",
- SCHEMA_URI + "name": name,
+ SCHEMA_URI + "name": name, # type: ignore
}
@@ -28,29 +29,29 @@
_re_spec_entry = re.compile(r"\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)")
@classmethod
- def detect_metadata_files(cls, file_entries):
+ def detect_metadata_files(cls, file_entries: List[FileEntry]) -> List[bytes]:
for entry in file_entries:
if entry["name"].endswith(b".gemspec"):
return [entry["sha1"]]
return []
- def translate(self, raw_content):
+ def translate(self, raw_content: bytes) -> Optional[Dict]:
try:
- raw_content = raw_content.decode()
+ content: str = raw_content.decode()
except UnicodeDecodeError:
self.log.warning("Error unidecoding from %s", self.log_suffix)
- return
+ return None
# Skip lines before 'Gem::Specification.new'
lines = itertools.dropwhile(
- lambda x: not self._re_spec_new.match(x), raw_content.split("\n")
+ lambda x: not self._re_spec_new.match(x), content.split("\n")
)
try:
next(lines) # Consume 'Gem::Specification.new'
except StopIteration:
self.log.warning("Could not find Gem::Specification in %s", self.log_suffix)
- return
+ return None
content_dict = {}
for line in lines:
@@ -61,7 +62,7 @@
content_dict[match.group("key")] = value
return self._translate_dict(content_dict)
- def eval_ruby_expression(self, expr):
+ def eval_ruby_expression(self, expr: str) -> Optional[Union[str, List]]:
"""Very simple evaluator of Ruby expressions.
>>> GemspecMapping().eval_ruby_expression('"Foo bar"')
@@ -97,31 +98,36 @@
# of such strings).
tree = ast.parse(expr, mode="eval")
except (SyntaxError, ValueError):
- return
+ return None
if isinstance(tree, ast.Expression):
return evaluator(tree.body)
+ return None
- def normalize_homepage(self, s):
+ def normalize_homepage(self, s) -> Optional[SchemaEntry]:
if isinstance(s, str):
return {"@id": s}
+ return None
- def normalize_license(self, s):
+ def normalize_license(self, s) -> Optional[List[SchemaEntry]]:
if isinstance(s, str):
return [{"@id": "https://spdx.org/licenses/" + s}]
+ return None
- def normalize_licenses(self, licenses):
+ def normalize_licenses(self, licenses) -> Optional[List[SchemaEntry]]:
if isinstance(licenses, list):
return [
{"@id": "https://spdx.org/licenses/" + license}
for license in licenses
if isinstance(license, str)
]
+ return None
- def normalize_author(self, author):
+ def normalize_author(self, author) -> Optional[Authors]:
if isinstance(author, str):
return {"@list": [name_to_person(author)]}
+ return None
- def normalize_authors(self, authors):
+ def normalize_authors(self, authors) -> Optional[Authors]:
if isinstance(authors, list):
return {
"@list": [
@@ -130,3 +136,4 @@
if isinstance(author, str)
]
}
+ return None

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 5:27 AM (10 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223340

Event Timeline