Page Menu | Home | Software Heritage

D1498.id4935.diff
No One | Temporary

D1498.id4935.diff

diff --git a/swh/loader/npm/client.py b/swh/loader/npm/client.py
--- a/swh/loader/npm/client.py
+++ b/swh/loader/npm/client.py
@@ -3,17 +3,15 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
import logging
import os
-import chardet
import requests
from swh.core import tarball
from swh.model import hashutil
-from swh.loader.npm.utils import extract_npm_package_author
+from swh.loader.npm.utils import extract_npm_package_author, load_json
class NpmClient:
@@ -185,14 +183,12 @@
# do not archive useless tarball root directory
package_path = os.path.join(path, 'package')
- # some old packages use their name as root directory
+ # some old packages use a root directory with a different name
if not os.path.exists(package_path):
- ver_pos = package_source_data['filename'].rfind(version)
- package_name = package_source_data['filename'][:ver_pos-1]
- package_path = os.path.join(path, package_name)
- # fallback: archive root tarball directory
- if not os.path.exists(package_path):
- package_path = path
+ for _, dirnames, _ in os.walk(path):
+ if dirnames:
+ package_path = os.path.join(path, dirnames[0])
+ break
self.log.debug('Package local path: %s', package_path)
@@ -203,8 +199,7 @@
package_json = {}
with open(package_json_path, 'rb') as package_json_file:
package_json_bytes = package_json_file.read()
- file_encoding = chardet.detect(package_json_bytes)['encoding']
- package_json = json.loads(package_json_bytes.decode(file_encoding))
+ package_json = load_json(package_json_bytes)
# extract author from package.json
author = extract_npm_package_author(package_json)
diff --git a/swh/loader/npm/tests/common.py b/swh/loader/npm/tests/common.py
--- a/swh/loader/npm/tests/common.py
+++ b/swh/loader/npm/tests/common.py
@@ -3,11 +3,10 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
import os
import os.path
-import chardet
+from swh.loader.npm.utils import load_json
RESOURCES_PATH = os.path.join(os.path.dirname(__file__), 'resources')
@@ -49,8 +48,7 @@
with open(package_metadata_filepath, 'rb') as json_file:
json_file_bytes = json_file.read()
- file_encoding = chardet.detect(json_file_bytes)['encoding']
- package_metadata = json.loads(json_file_bytes.decode(file_encoding))
+ package_metadata = load_json(json_file_bytes)
m.register_uri('GET', package_metadata_url, json=package_metadata)
diff --git a/swh/loader/npm/utils.py b/swh/loader/npm/utils.py
--- a/swh/loader/npm/utils.py
+++ b/swh/loader/npm/utils.py
@@ -3,8 +3,13 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import json
import re
+from codecs import BOM_UTF8
+
+import chardet
+
_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
# https://github.com/jonschlinkert/author-regex
@@ -120,3 +125,40 @@
email = email.encode('utf-8')
return {'fullname': fullname, 'name': name, 'email': email}
+
+
+def _lstrip_bom(s, bom=BOM_UTF8):
+    """Return ``s`` without its leading ``bom`` prefix, if it has one.
+
+    Args:
+        s (bytes): content to inspect
+        bom (bytes): byte order mark to remove, UTF-8 BOM by default
+
+    Returns:
+        bytes: ``s`` with the prefix removed, or ``s`` unchanged when it
+        does not start with ``bom``
+    """
+    return s[len(bom):] if s.startswith(bom) else s
+
+
+def load_json(json_bytes):
+    """
+    Try to load JSON from bytes and return a dictionary.
+
+    First try to decode from utf-8, after stripping a leading UTF-8 BOM.
+    If that decoding fails, try to detect the encoding with chardet and
+    decode again with 'replace' error handling.
+
+    If the encoding cannot be determined, cannot be decoded by Python,
+    or the JSON is malformed, an empty dictionary is returned.
+
+    Args:
+        json_bytes (bytes): binary content of a JSON file
+
+    Returns:
+        dict: JSON data loaded in a dictionary, empty on failure
+    """
+    try:
+        json_str = _lstrip_bom(json_bytes).decode('utf-8')
+    except UnicodeDecodeError:
+        encoding = chardet.detect(json_bytes)['encoding']
+        if not encoding:
+            # chardet could not guess an encoding: without this guard
+            # json_str would stay unbound and json.loads below would
+            # raise UnboundLocalError instead of yielding an empty dict
+            return {}
+        try:
+            json_str = json_bytes.decode(encoding, 'replace')
+        except LookupError:
+            # the detected encoding name has no matching Python codec
+            return {}
+    try:
+        return json.loads(json_str)
+    except json.decoder.JSONDecodeError:
+        return {}

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 9:54 AM (5 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217472

Event Timeline