diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -22,14 +22,25 @@
     mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
     string_fields = ["keywords", "license", "abstract", "version", "doi"]
 
-    def translate(self, raw_content: bytes) -> Dict[str, str]:
+    def translate(self, raw_content: bytes) -> Optional[Dict[str, str]]:
+        """Translate raw CFF (YAML) content into a metadata dict.
+
+        Returns None when the content is not valid YAML or when its
+        top-level value is not a mapping.
+        """
         raw_content_string: str = raw_content.decode()
-        content_dict = yaml.load(raw_content_string, Loader=SafeLoader)
-        metadata = self._translate_dict(content_dict)
+        try:
+            content_dict = yaml.load(raw_content_string, Loader=SafeLoader)
+        # YAMLError is the base class of all PyYAML parsing errors.
+        except yaml.YAMLError:
+            return None
 
-        metadata["@context"] = CODEMETA_CONTEXT_URL
+        if isinstance(content_dict, dict):
+            metadata = self._translate_dict(content_dict)
+            metadata["@context"] = CODEMETA_CONTEXT_URL
+            return metadata
 
-        return metadata
+        return None
 
     def normalize_authors(self, d: List[dict]) -> Dict[str, list]:
         result = []
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -136,7 +136,6 @@ def normalize_description(self, description):
         r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat
         common mistake that causes issues in the database because of null bytes
         in JSON.
-
         >>> NpmMapping().normalize_description("foo bar")
         'foo bar'
         >>> NpmMapping().normalize_description(
@@ -160,7 +159,9 @@
         >>> NpmMapping().normalize_description(None) is None
         True
+        >>> NpmMapping().normalize_description(["foo", "bar"]) is None
+        True
         """
-        if description is None:
+        if not isinstance(description, str):
             return None
         # XXX: if this function ever need to support more cases, consider
         # switching to https://pypi.org/project/ftfy/ instead of adding more hacks
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -159,6 +159,31 @@
         # then
         assert expected == result
 
+    def test_compute_metadata_cff_invalid_yaml(self):
+        """
+        test yaml translation for invalid yaml file
+        """
+        # given
+        content = """cff-version: 1.0.3
+message: To cite the SigMF specification, please include the following:
+authors:
+  - name: The GNU Radio Foundation, Inc.
+title: The Signal Metadata Format (SigMF)
+version: 0.0.1
+doi: 10.5281/zenodo.1418396
+date-released: 2018-07-18
+license: CC-BY-SA-4.0
+url: https://sigmf.org
+        """.encode(
+            "utf-8"
+        )
+
+        expected = None
+
+        result = self.cff_mapping.translate(content)
+        # then
+        assert expected == result
+
     def test_compute_metadata_npm(self):
         """
         testing only computation of metadata with hard_mapping_npm