diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -148,6 +148,10 @@ """List of fields that are simple strings, and don't need any normalization.""" + date_fields: List[str] = [] + """List of fields that are strings that should be typed as http://schema.org/Date + """ + uri_fields: List[str] = [] """List of fields that are simple URIs, and don't need any normalization.""" @@ -167,7 +171,7 @@ simple_terms = { str(term) for (key, term) in cls.mapping.items() - if key in cls.string_fields + cls.uri_fields + if key in cls.string_fields + cls.date_fields + cls.uri_fields or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) } @@ -240,6 +244,14 @@ elif k in self.string_fields and isinstance(v, list): for item in v: graph.add((root, codemeta_key, rdflib.Literal(item))) + elif k in self.date_fields and isinstance(v, str): + typed_v = rdflib.Literal(v, datatype=SCHEMA.Date) + graph.add((root, codemeta_key, typed_v)) + elif k in self.date_fields and isinstance(v, list): + for item in v: + if isinstance(item, str): + typed_item = rdflib.Literal(item, datatype=SCHEMA.Date) + graph.add((root, codemeta_key, typed_item)) elif k in self.uri_fields and isinstance(v, str): # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop # URLs that are blatantly invalid early, so PyLD does not crash. diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -25,6 +25,7 @@ filename = b"CITATION.cff" mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"] string_fields = ["keywords", "license", "abstract", "version", "doi"] + date_fields = ["date-released"] uri_fields = ["repository-code"] def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node: @@ -57,7 +58,3 @@ def normalize_license(self, s: str) -> URIRef: if isinstance(s, str): return SPDX + s - - def normalize_date_released(self, s: str) -> Literal: - if isinstance(s, str): - return Literal(s, datatype=SCHEMA.Date) diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ -22,14 +22,18 @@ **CROSSWALK_TABLE["GitHub"], "topics": SCHEMA.keywords, # TODO: submit this to the official crosswalk } - string_fields = [ + uri_fields = [ "archive_url", + "html_url", + "issues_url", + ] + date_fields = [ "created_at", "updated_at", + ] + string_fields = [ "description", "full_name", - "html_url", - "issues_url", "topics", ] diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py --- a/swh/indexer/tests/metadata_dictionary/test_github.py +++ b/swh/indexer/tests/metadata_dictionary/test_github.py @@ -32,6 +32,7 @@ assert { "http://schema.org/name", "http://schema.org/license", + "http://schema.org/dateCreated", "https://forgefed.org/ns#forks", "https://www.w3.org/ns/activitystreams#totalItems", } <= terms @@ -133,9 +134,9 @@ "license": "https://spdx.org/licenses/GPL-3.0", "name": "SoftwareHeritage/swh-indexer", "description": "GitHub mirror of Metadata indexer", - "schema:codeRepository": "https://github.com/SoftwareHeritage/swh-indexer", - "schema:dateCreated": "2017-01-31T13:05:39Z", - "schema:dateModified": "2022-06-22T08:02:20Z", + "codeRepository": "https://github.com/SoftwareHeritage/swh-indexer", + "dateCreated": "2017-01-31T13:05:39Z", + "dateModified": "2022-06-22T08:02:20Z", }