diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -69,7 +69,11 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Intrinsic metadata is extracted from files provided with a project's source -code, and translated using `CodeMeta`_'s `crosswalk table`_. +code, and translated using `CodeMeta`_'s `crosswalk table`_, which is vendored +in :file:`swh/indexer/data/codemeta/codemeta.csv`. +Ecosystems not yet included in CodeMeta's crosswalk have their own +:file:`swh/indexer/data/*.csv` file, with one row for each CodeMeta property, +even when not supported by the ecosystem. All input formats supported so far are straightforward dictionaries (eg. JSON) or can be accessed as such (eg. XML); and the first part of the translation is diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -9,7 +9,7 @@ import json import os.path import re -from typing import Any, List +from typing import Any, Dict, List, Set, TextIO, Tuple from pyld import jsonld import rdflib @@ -66,7 +66,15 @@ return uri -def _read_crosstable(fd): +def read_crosstable(fd: TextIO) -> Tuple[Set[str], Dict[str, Dict[str, rdflib.URIRef]]]: + """ + Given a file-like object to a `CodeMeta crosswalk table` (either the main + cross-table with all columns, or an auxiliary table with just the CodeMeta + column and one ecosystem-specific table); returns a list of all CodeMeta + terms, and a dictionary ``{ecosystem: {ecosystem_term: codemeta_term}}`` + + .. _CodeMeta crosswalk table: