def value_type_check(self, content_dict):
    """Return a shallow copy of ``content_dict`` without ill-typed author values.

    Every top-level field other than ``"authors"`` is copied unchanged.
    For ``"authors"``, each author mapping is copied with these checks:

    * ``given-names`` and ``name-particle`` are kept only if they are
      strings;
    * ``email`` is kept only if it is a string that entirely matches a
      simple ``local@domain.tld`` pattern;
    * any other author key is copied as-is.

    The input comes from an arbitrary YAML document, so nothing about its
    shape is trusted: an ``authors`` value that is not a list is omitted
    from the result, and author entries that are not mappings are skipped,
    instead of raising ``TypeError``.

    Args:
        content_dict: the parsed CITATION.cff content, as a dict.

    Returns:
        A new dict; ``content_dict`` itself is not modified.
    """
    # Local import so the method does not depend on a module-level import.
    import re

    # Local part: alphanumeric runs separated by a single ".", "_" or "-".
    # The separator class is written "[._-]" (hyphen last) so that it is
    # three literal characters and not a range; it no longer overlaps with
    # the alphanumeric class, which keeps matching free of catastrophic
    # backtracking.
    # Domain: one or more dot-separated labels followed by an alphabetic
    # top-level label of at least two letters.
    email_pattern = (
        r"[A-Za-z0-9]+(?:[._-][A-Za-z0-9]+)*"
        r"@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}"
    )

    def is_valid(key, value):
        """Tell whether ``value`` is acceptable for author field ``key``."""
        if key in ("given-names", "name-particle"):
            return isinstance(value, str)
        if key == "email":
            # fullmatch: the whole value must be an address, not a prefix.
            return (
                isinstance(value, str)
                and re.fullmatch(email_pattern, value) is not None
            )
        # TODO: validate the remaining author fields; for now they are
        # passed through untouched.
        return True

    checked = {}
    for field, value in content_dict.items():
        if field != "authors":
            checked[field] = value
            continue

        if not isinstance(value, list):
            # Malformed "authors" (scalar, mapping, null): drop the field.
            continue

        checked["authors"] = [
            {key: val for key, val in author.items() if is_valid(key, val)}
            for author in value
            if isinstance(author, dict)
        ]

    return checked