diff --git a/swh/loader/npm/client.py b/swh/loader/npm/client.py --- a/swh/loader/npm/client.py +++ b/swh/loader/npm/client.py @@ -3,17 +3,15 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json import logging import os -import chardet import requests from swh.core import tarball from swh.model import hashutil -from swh.loader.npm.utils import extract_npm_package_author +from swh.loader.npm.utils import extract_npm_package_author, load_json class NpmClient: @@ -185,14 +183,12 @@ # do not archive useless tarball root directory package_path = os.path.join(path, 'package') - # some old packages use their name as root directory + # some old packages use a root directory with a different name if not os.path.exists(package_path): - ver_pos = package_source_data['filename'].rfind(version) - package_name = package_source_data['filename'][:ver_pos-1] - package_path = os.path.join(path, package_name) - # fallback: archive root tarball directory - if not os.path.exists(package_path): - package_path = path + for _, dirnames, _ in os.walk(path): + if dirnames: + package_path = os.path.join(path, dirnames[0]) + break self.log.debug('Package local path: %s', package_path) @@ -203,8 +199,7 @@ package_json = {} with open(package_json_path, 'rb') as package_json_file: package_json_bytes = package_json_file.read() - file_encoding = chardet.detect(package_json_bytes)['encoding'] - package_json = json.loads(package_json_bytes.decode(file_encoding)) + package_json = load_json(package_json_bytes) # extract author from package.json author = extract_npm_package_author(package_json) diff --git a/swh/loader/npm/tests/common.py b/swh/loader/npm/tests/common.py --- a/swh/loader/npm/tests/common.py +++ b/swh/loader/npm/tests/common.py @@ -3,11 +3,10 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json import os import os.path 
def _lstrip_bom(s, bom=BOM_UTF8):
    """
    Remove a leading byte order mark from a bytes string.

    Args:
        s (bytes): content to strip
        bom (bytes): byte order mark to look for, UTF-8 BOM by default

    Returns:
        bytes: ``s`` without the leading ``bom`` if it started with it,
        ``s`` unchanged otherwise
    """
    if s.startswith(bom):
        return s[len(bom):]
    return s


def load_json(json_bytes):
    """
    Try to load JSON from bytes and return a dictionary.

    First try to decode from utf-8 (ignoring a leading UTF-8 BOM).
    If the decoding failed, try to detect the encoding and decode
    again with replace error handling.

    If the content cannot be decoded (no encoding detected, or the
    detected encoding has no Python codec) or if JSON is malformed,
    an empty dictionary will be returned.

    Args:
        json_bytes (bytes): binary content of a JSON file

    Returns:
        dict: JSON data loaded in a dictionary
    """
    try:
        json_str = _lstrip_bom(json_bytes).decode('utf-8')
    except UnicodeDecodeError:
        encoding = chardet.detect(json_bytes)['encoding']
        if not encoding:
            # detection failed: there is nothing sensible to decode with,
            # and falling through would hit an unbound ``json_str``
            return {}
        try:
            json_str = json_bytes.decode(encoding, 'replace')
        except LookupError:
            # the detector may name an encoding Python has no codec for
            return {}
    try:
        return json.loads(json_str)
    except json.decoder.JSONDecodeError:
        # malformed JSON is deliberately mapped to an empty dictionary
        return {}