Page MenuHomeSoftware Heritage

D8549.diff
No OneTemporary

D8549.diff

diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst
--- a/docs/metadata-workflow.rst
+++ b/docs/metadata-workflow.rst
@@ -69,7 +69,11 @@
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Intrinsic metadata is extracted from files provided with a project's source
-code, and translated using `CodeMeta`_'s `crosswalk table`_.
+code, and translated using `CodeMeta`_'s `crosswalk table`_, which is vendored
+in :file:`swh/indexer/data/codemeta/codemeta.csv`.
+Ecosystems not yet included in CodeMeta's crosswalk have their own
+:file:`swh/indexer/data/*.csv` file, with one row for each CodeMeta property,
+even when not supported by the ecosystem.
All input formats supported so far are straightforward dictionaries (eg. JSON)
or can be accessed as such (eg. XML); and the first part of the translation is
diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -9,7 +9,7 @@
import json
import os.path
import re
-from typing import Any, List
+from typing import Any, Dict, List, Set, TextIO, Tuple
from pyld import jsonld
import rdflib
@@ -66,7 +66,15 @@
return uri
-def _read_crosstable(fd):
+def read_crosstable(fd: TextIO) -> Tuple[Set[str], Dict[str, Dict[str, rdflib.URIRef]]]:
+ """
+ Given a file-like object to a `CodeMeta crosswalk table`_ (either the main
+ cross-table with all columns, or an auxiliary table with just the CodeMeta
+ column and one ecosystem-specific column); returns a set of all CodeMeta
+ terms, and a dictionary ``{ecosystem: {ecosystem_term: codemeta_term}}``
+
+ .. _CodeMeta crosswalk table: https://codemeta.github.io/crosswalk/
+ """
reader = csv.reader(fd)
try:
header = next(reader)
@@ -75,7 +83,9 @@
data_sources = set(header) - {"Parent Type", "Property", "Type", "Description"}
- codemeta_translation = {data_source: {} for data_source in data_sources}
+ codemeta_translation: Dict[str, Dict[str, rdflib.URIRef]] = {
+ data_source: {} for data_source in data_sources
+ }
terms = set()
for line in reader: # For each canonical name
@@ -101,7 +111,7 @@
with open(CROSSWALK_TABLE_PATH) as fd:
- (CODEMETA_TERMS, CROSSWALK_TABLE) = _read_crosstable(fd)
+ (CODEMETA_TERMS, CROSSWALK_TABLE) = read_crosstable(fd)
def _document_loader(url, options=None):
diff --git a/swh/indexer/data/Gitea.csv b/swh/indexer/data/Gitea.csv
--- a/swh/indexer/data/Gitea.csv
+++ b/swh/indexer/data/Gitea.csv
@@ -66,11 +66,3 @@
issueTracker,
referencePublication,
readme,
-,
-,
-,
-,
-,
-,
-,
-,
diff --git a/swh/indexer/metadata_dictionary/composer.py b/swh/indexer/metadata_dictionary/composer.py
--- a/swh/indexer/metadata_dictionary/composer.py
+++ b/swh/indexer/metadata_dictionary/composer.py
@@ -8,7 +8,7 @@
from rdflib import BNode, Graph, Literal, URIRef
-from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
+from swh.indexer.codemeta import _DATA_DIR, read_crosstable
from swh.indexer.namespaces import RDF, SCHEMA
from .base import JsonMapping, SingleFileIntrinsicMapping
@@ -20,7 +20,7 @@
COMPOSER_TABLE_PATH = os.path.join(_DATA_DIR, "composer.csv")
with open(COMPOSER_TABLE_PATH) as fd:
- (CODEMETA_TERMS, COMPOSER_TABLE) = _read_crosstable(fd)
+ (CODEMETA_TERMS, COMPOSER_TABLE) = read_crosstable(fd)
class ComposerMapping(JsonMapping, SingleFileIntrinsicMapping):
diff --git a/swh/indexer/metadata_dictionary/dart.py b/swh/indexer/metadata_dictionary/dart.py
--- a/swh/indexer/metadata_dictionary/dart.py
+++ b/swh/indexer/metadata_dictionary/dart.py
@@ -8,7 +8,7 @@
from rdflib import RDF, BNode, Graph, Literal, URIRef
-from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
+from swh.indexer.codemeta import _DATA_DIR, read_crosstable
from swh.indexer.namespaces import SCHEMA
from .base import YamlMapping
@@ -19,7 +19,7 @@
PUB_TABLE_PATH = os.path.join(_DATA_DIR, "pubspec.csv")
with open(PUB_TABLE_PATH) as fd:
- (CODEMETA_TERMS, PUB_TABLE) = _read_crosstable(fd)
+ (CODEMETA_TERMS, PUB_TABLE) = read_crosstable(fd)
def name_to_person(name):
diff --git a/swh/indexer/metadata_dictionary/gitea.py b/swh/indexer/metadata_dictionary/gitea.py
--- a/swh/indexer/metadata_dictionary/gitea.py
+++ b/swh/indexer/metadata_dictionary/gitea.py
@@ -8,7 +8,7 @@
from rdflib import RDF, BNode, Graph, Literal, URIRef
-from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
+from swh.indexer.codemeta import _DATA_DIR, read_crosstable
from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA
from .base import BaseExtrinsicMapping, JsonMapping, produce_terms
@@ -20,7 +20,7 @@
GITEA_TABLE_PATH = os.path.join(_DATA_DIR, "Gitea.csv")
with open(GITEA_TABLE_PATH) as fd:
- (CODEMETA_TERMS, GITEA_TABLE) = _read_crosstable(fd)
+ (CODEMETA_TERMS, GITEA_TABLE) = read_crosstable(fd)
class GiteaMapping(BaseExtrinsicMapping, JsonMapping):
diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
--- a/swh/indexer/metadata_dictionary/nuget.py
+++ b/swh/indexer/metadata_dictionary/nuget.py
@@ -9,7 +9,7 @@
from rdflib import RDF, BNode, Graph, Literal, URIRef
-from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
+from swh.indexer.codemeta import _DATA_DIR, read_crosstable
from swh.indexer.namespaces import SCHEMA
from swh.indexer.storage.interface import Sha1
@@ -19,7 +19,7 @@
NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")
with open(NUGET_TABLE_PATH) as fd:
- (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd)
+ (CODEMETA_TERMS, NUGET_TABLE) = read_crosstable(fd)
SPDX = URIRef("https://spdx.org/licenses/")

File Metadata

Mime Type
text/plain
Expires
Wed, Jul 2, 10:42 AM (2 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218045

Event Timeline