diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py --- a/swh/indexer/metadata_dictionary/cff.py +++ b/swh/indexer/metadata_dictionary/cff.py @@ -27,7 +27,6 @@ try: content_dict = yaml.load(raw_content_string, Loader=yaml.SafeLoader) except yaml.scanner.ScannerError: - self.log.warning("Error yaml is invalid and will be skipped") return None if isinstance(content_dict, dict): diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -133,73 +133,6 @@ author[SCHEMA_URI + "url"] = {"@id": url} return {"@list": [author]} - def normalize_description(self, description): - r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common - mistake that causes issues in the database because of null bytes in JSON. - - >>> NpmMapping().normalize_description("foo bar") - 'foo bar' - >>> NpmMapping().normalize_description( - ... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00" - ... ) - 'foo bar' - >>> NpmMapping().normalize_description( - ... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 " - ... ) - 'foo bar' - >>> NpmMapping().normalize_description( - ... # invalid UTF-16 and meaningless UTF-8: - ... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00" - ... ) is None - True - >>> NpmMapping().normalize_description( - ... # ditto (ut looks like little-endian at first) - ... "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00" - ... ) is None - True - >>> NpmMapping().normalize_description(None) is None - True - """ - if description is None: - return None - # XXX: if this function ever need to support more cases, consider - # switching to https://pypi.org/project/ftfy/ instead of adding more hacks - if description.startswith("\ufffd\ufffd") and "\x00" in description: - # 2 unicode replacement characters followed by '# ' encoded as UTF-16 - # is a common mistake, which indicates a README.md was saved as UTF-16, - # and some NPM tool opened it as UTF-8 and used the first line as - # description. - - description_bytes = description.encode() - - # Strip the the two unicode replacement characters - assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd") - description_bytes = description_bytes[6:] - - # If the following attempts fail to recover the description, discard it - # entirely because the current indexer storage backend (postgresql) cannot - # store zero bytes in JSON columns. - description = None - - if not description_bytes.startswith(b"\x00"): - # try UTF-16 little-endian (the most common) first - try: - description = description_bytes.decode("utf-16le") - except UnicodeDecodeError: - pass - if description is None: - # if it fails, try UTF-16 big-endian - try: - description = description_bytes.decode("utf-16be") - except UnicodeDecodeError: - pass - - if description: - if description.startswith("# "): - description = description[2:] - return description.rstrip() - return description - def normalize_license(self, s): """https://docs.npmjs.com/files/package.json#license diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -101,7 +101,7 @@ given-names: Stephan orcid: https://orcid.org/0000-0003-4925-7248 cff-version: "1.0.3" -date-released: 2019-11-12 +date-released: "2019-11-12" doi: 10.5281/zenodo.1162057 keywords: - "citation" @@ -159,6 +159,31 @@ # then assert expected == result + def test_compute_metadata_cff_invalid_yaml(self): + """ + test.yaml translation for invalid yaml file + """ + # given + content = """cff-version: 1.0.3 +message: To cite the SigMF specification, please include the following: +authors: + - name: The GNU Radio Foundation, Inc. +title: The Signal Metadata Format (SigMF) +version: 0.0.1 +doi: 10.5281/zenodo.1418396 +date-released: 2018-07-18 +license: CC-BY-SA-4.0 +url: https://sigmf.org + """.encode( + "utf-8" + ) + + expected = None + + result = self.cff_mapping.translate(content) + # then + assert expected == result + def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm