Differential D7992 Diff 28825 swh/indexer/metadata_dictionary/npm.py

Changeset View

Standalone View

swh/indexer/metadata_dictionary/npm.py

Show First 20 Lines • Show All 127 Lines • ▼ Show 20 Lines	def normalize_author(self, d):
if name and isinstance(name, str):		if name and isinstance(name, str):
author[SCHEMA_URI + "name"] = name		author[SCHEMA_URI + "name"] = name
if email and isinstance(email, str):		if email and isinstance(email, str):
author[SCHEMA_URI + "email"] = email		author[SCHEMA_URI + "email"] = email
if url and isinstance(url, str):		if url and isinstance(url, str):
author[SCHEMA_URI + "url"] = {"@id": url}		author[SCHEMA_URI + "url"] = {"@id": url}
return {"@list": [author]}		return {"@list": [author]}

		def normalize_description(self, description):
		r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
		mistake that causes issues in the database because of null bytes in JSON.

		>>> NpmMapping().normalize_description("foo bar")
		'foo bar'
		>>> NpmMapping().normalize_description(
		... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
		... )
		'foo bar'
		>>> NpmMapping().normalize_description(
		... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
		... )
		'foo bar'
		>>> NpmMapping().normalize_description(
		... # invalid UTF-16 and meaningless UTF-8:
		... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
		... ) is None
		True
		>>> NpmMapping().normalize_description(
		... # ditto (ut looks like little-endian at first)
		... "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00"
		... ) is None
		True
		>>> NpmMapping().normalize_description(None) is None
		True
		"""
		if description is None:
		return None
		# XXX: if this function ever need to support more cases, consider
		# switching to https://pypi.org/project/ftfy/ instead of adding more hacks
		if description.startswith("\ufffd\ufffd") and "\x00" in description:
		# 2 unicode replacement characters followed by '# ' encoded as UTF-16
		# is a common mistake, which indicates a README.md was saved as UTF-16,
		# and some NPM tool opened it as UTF-8 and used the first line as
		# description.

		description_bytes = description.encode()

		# Strip the the two unicode replacement characters
		assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd")
		description_bytes = description_bytes[6:]

		# If the following attempts fail to recover the description, discard it
		# entirely because the current indexer storage backend (postgresql) cannot
		# store zero bytes in JSON columns.
		description = None

		ardumontUnsubmitted Done Inline Actions you're missing that one in your docstring sample tests. ardumont: you're missing that one in your docstring sample tests.
		if not description_bytes.startswith(b"\x00"):
		# try UTF-16 little-endian (the most common) first
		try:
		description = description_bytes.decode("utf-16le")
		except UnicodeDecodeError:
		pass
		if description is None:
		# if it fails, try UTF-16 big-endian
		try:
		description = description_bytes.decode("utf-16be")
		except UnicodeDecodeError:
		pass

		if description:
		if description.startswith("# "):
		description = description[2:]
		return description.rstrip()
		return description

def normalize_license(self, s):		def normalize_license(self, s):
"""https://docs.npmjs.com/files/package.json#license		"""https://docs.npmjs.com/files/package.json#license

>>> NpmMapping().normalize_license('MIT')		>>> NpmMapping().normalize_license('MIT')
{'@id': 'https://spdx.org/licenses/MIT'}		{'@id': 'https://spdx.org/licenses/MIT'}
"""		"""
if isinstance(s, str):		if isinstance(s, str):
return {"@id": "https://spdx.org/licenses/" + s}		return {"@id": "https://spdx.org/licenses/" + s}
Show All 18 Lines