Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/npm/utils.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | |||||
import re | import re | ||||
from codecs import BOM_UTF8 | |||||
import chardet | |||||
# Author value with an empty full name and no name / email set.
_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}

# https://github.com/jonschlinkert/author-regex
# Three capture groups: text before any '<' or '(', text enclosed in
# '<...>', and text enclosed in '(...)'.
_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
def parse_npm_package_author(author_str): | def parse_npm_package_author(author_str): | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 99 Lines • ▼ Show 20 Lines | def extract_npm_package_author(package_json): | ||||
if name: | if name: | ||||
name = name.encode('utf-8') | name = name.encode('utf-8') | ||||
if email: | if email: | ||||
email = email.encode('utf-8') | email = email.encode('utf-8') | ||||
return {'fullname': fullname, 'name': name, 'email': email} | return {'fullname': fullname, 'name': name, 'email': email} | ||||
def _lstrip_bom(s, bom=BOM_UTF8):
    """
    Remove a leading byte order mark from a bytes string, if present.

    Only a single occurrence at the very beginning is removed.

    Args:
        s (bytes): binary content to process
        bom (bytes): byte order mark to look for (UTF-8 BOM by default)

    Returns:
        bytes: ``s`` without its leading ``bom``, or ``s`` unchanged
        when it does not start with ``bom``
    """
    if s.startswith(bom):
        return s[len(bom):]
    return s
def load_json(json_bytes):
    """
    Try to load JSON from bytes and return a dictionary.

    First try to decode from utf-8 (after removing a leading UTF-8
    BOM). If the decoding failed, try to detect the encoding and
    decode again with replace error handling.

    If the encoding cannot be detected or is not supported, or if
    JSON is malformed, an empty dictionary will be returned.

    Args:
        json_bytes (bytes): binary content of a JSON file

    Returns:
        dict: JSON data loaded in a dictionary (the value parsed by
        ``json.loads`` is returned as is, so a JSON document whose
        top-level value is not an object yields the matching Python
        type)
    """
    try:
        json_str = _lstrip_bom(json_bytes).decode('utf-8')
    except UnicodeDecodeError:
        encoding = chardet.detect(json_bytes)['encoding']
        if not encoding:
            # No encoding could be guessed: there is nothing to parse.
            # Bail out here instead of reaching json.loads with an
            # unbound json_str.
            return {}
        try:
            json_str = json_bytes.decode(encoding, 'replace')
        except LookupError:
            # The detected encoding name has no matching Python codec.
            return {}

    try:
        return json.loads(json_str)
    except json.decoder.JSONDecodeError:
        # Malformed JSON is deliberately reported as an empty result.
        return {}