Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/codemeta.py
Show All 21 Lines | |||||
CODEMETA_CONTEXT_URL = 'https://doi.org/10.5063/schema/codemeta-2.0' | CODEMETA_CONTEXT_URL = 'https://doi.org/10.5063/schema/codemeta-2.0' | ||||
CODEMETA_URI = 'https://codemeta.github.io/terms/' | CODEMETA_URI = 'https://codemeta.github.io/terms/' | ||||
SCHEMA_URI = 'http://schema.org/' | SCHEMA_URI = 'http://schema.org/' | ||||
PROPERTY_BLACKLIST = { | PROPERTY_BLACKLIST = { | ||||
# CodeMeta properties that we cannot properly represent. | # CodeMeta properties that we cannot properly represent. | ||||
CODEMETA_URI + 'softwareRequirements', | SCHEMA_URI + 'softwareRequirements', | ||||
CODEMETA_URI + 'softwareSuggestions', | CODEMETA_URI + 'softwareSuggestions', | ||||
# Duplicate of 'author' | # Duplicate of 'author' | ||||
CODEMETA_URI + 'creator', | SCHEMA_URI + 'creator', | ||||
} | } | ||||
def make_absolute_uri(local_name):
    """Return the absolute URI for a term of the CodeMeta context.

    ``local_name`` is looked up in ``CODEMETA_CONTEXT['@context']``:

    * if its definition is a string, that string is returned unchanged;
    * if its definition is a dict, its ``'@id'`` value is split on ``':'``
      into a prefix and a name, and the name is appended to ``SCHEMA_URI``
      (prefix ``schema``) or ``CODEMETA_URI`` (prefix ``codemeta``).

    Raises:
        AssertionError: if the prefix is neither ``schema`` nor
            ``codemeta``, or if the definition is neither a string nor
            a dict.
        ValueError: if the ``'@id'`` value does not split into exactly
            two parts on ``':'``.
    """
    definition = CODEMETA_CONTEXT['@context'][local_name]
    if isinstance(definition, str):
        return definition
    if isinstance(definition, dict):
        # Use a separate name for the unprefixed term instead of
        # rebinding the ``local_name`` parameter.
        (prefix, term) = definition['@id'].split(':')
        if prefix == 'schema':
            return SCHEMA_URI + term
        if prefix == 'codemeta':
            return CODEMETA_URI + term
        # Raise explicitly rather than ``assert False``: assert statements
        # are removed under ``python -O``, which would let execution fall
        # through to an unbound result.
        raise AssertionError(prefix)
    # Same reasoning: without an explicit raise, ``-O`` would make this
    # branch silently return None.
    raise AssertionError(definition)
def _read_crosstable(fd): | def _read_crosstable(fd): | ||||
reader = csv.reader(fd) | reader = csv.reader(fd) | ||||
try: | try: | ||||
header = next(reader) | header = next(reader) | ||||
except StopIteration: | except StopIteration: | ||||
raise ValueError('empty file') | raise ValueError('empty file') | ||||
data_sources = set(header) - {'Parent Type', 'Property', | data_sources = set(header) - {'Parent Type', 'Property', | ||||
'Type', 'Description'} | 'Type', 'Description'} | ||||
assert 'codemeta-V1' in data_sources | assert 'codemeta-V1' in data_sources | ||||
moranegg: codemeta-V1 is the older version of CodeMeta.
The difficulty (not here but globally) if we… | |||||
Done Inline ActionsYeah I understood that later, there's a fix in D620. Adding support for v1 remains to be done though vlorentz: Yeah I understood that later, there's a fix in D620. Adding support for v1 remains to be done… | |||||
codemeta_translation = {data_source: {} for data_source in data_sources} | codemeta_translation = {data_source: {} for data_source in data_sources} | ||||
for line in reader: # For each canonical name | for line in reader: # For each canonical name | ||||
canonical_name = CODEMETA_URI + dict(zip(header, line))['Property'] | local_name = dict(zip(header, line))['Property'] | ||||
if not local_name: | |||||
continue | |||||
canonical_name = make_absolute_uri(local_name) | |||||
if canonical_name in PROPERTY_BLACKLIST: | if canonical_name in PROPERTY_BLACKLIST: | ||||
continue | continue | ||||
for (col, value) in zip(header, line): # For each cell in the row | for (col, value) in zip(header, line): # For each cell in the row | ||||
if col in data_sources: | if col in data_sources: | ||||
# If that's not the parentType/property/type/description | # If that's not the parentType/property/type/description | ||||
for local_name in value.split('/'): | for local_name in value.split('/'): | ||||
# For each of the data source's properties that maps | # For each of the data source's properties that maps | ||||
# to this canonical name | # to this canonical name | ||||
Show All 39 Lines |
codemeta-V1 is the older version of CodeMeta.
The difficulty (not here, but globally) is that when we find a codemeta.json file, we need to use its @context attribute to see which version is used,
but this can also cause problems because of examples like this:
"@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
So here, the codemeta-V1 column shouldn't be treated as the canonical name for CodeMeta.
The CodeMeta vocabulary is in the codemeta.csv table, under the Property column,
and to keep things simple for now, I think that when we encounter a codemeta.json file, the vocabulary to check against should be the one in the Property column.