diff --git a/swh/loader/npm/client.py b/swh/loader/npm/client.py
index a4f51ca..e5a5522 100644
--- a/swh/loader/npm/client.py
+++ b/swh/loader/npm/client.py
@@ -1,219 +1,214 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import json
 import logging
 import os
 
-import chardet
 import requests
 
 from swh.core import tarball
 from swh.model import hashutil
 
-from swh.loader.npm.utils import extract_npm_package_author
+from swh.loader.npm.utils import extract_npm_package_author, load_json
 
 
 class NpmClient:
     """
     Helper class internally used by the npm loader to fetch
     metadata for a specific package hosted on the npm registry.
 
     Args:
         temp_dir (str): Path to the temporary disk location used
             to uncompress the package tarballs
     """
     def __init__(self, temp_dir, log=None):
         self.root_temp_dir = temp_dir
         self.session = requests.session()
         self.params = {
             'headers': {
                 'User-Agent': 'Software Heritage npm loader'
             }
         }
         self.log = log or logging
 
     def fetch_package_metadata(self, package_metadata_url):
         """
         Fetch metadata for a given package and make it the focused one.
         This must be called prior to any other operation performed
         by the other methods below.
 
         Args:
             package_metadata_url: the package metadata url provided
                 by the npm loader
         """
         self.package_metadata_url = package_metadata_url
         self.package_metadata = self._request(self.package_metadata_url).json()
         self.package = self.package_metadata['name']
         self.temp_dir = os.path.join(self.root_temp_dir, self.package)
 
     def latest_package_version(self):
         """
         Return the last released version of the focused package.
 
         Returns:
             str: the last released package version
         """
         latest = ''
         if 'latest' in self.package_metadata['dist-tags']:
             latest = self.package_metadata['dist-tags']['latest']
         return latest
 
     def package_versions(self, known_versions=None):
         """
         Return the available versions for the focused package.
 
         Args:
             known_versions (dict): may be provided by the loader, it enables
                 filtering out versions already ingested in the archive.
 
         Returns:
             dict: A dict whose keys are Tuple[version, tarball_sha1] and
             whose values are dicts with the following entries:
 
                 * **name**: the package name
                 * **version**: the package version
                 * **filename**: the package source tarball filename
                 * **sha1**: the package source tarball sha1 checksum
                 * **date**: the package release date
                 * **url**: the package source tarball download url
         """
         versions = {}
         if 'versions' in self.package_metadata:
             for version, data in self.package_metadata['versions'].items():
                 sha1 = data['dist']['shasum']
                 key = (version, sha1)
                 if known_versions and key in known_versions:
                     continue
                 tarball_url = data['dist']['tarball']
                 filename = os.path.basename(tarball_url)
                 date = self.package_metadata['time'][version]
                 versions[key] = {
                     'name': self.package,
                     'version': version,
                     'filename': filename,
                     'sha1': sha1,
                     'date': date,
                     'url': tarball_url
                 }
         return versions
 
     def prepare_package_versions(self, known_versions=None):
         """
         Instantiate a generator that will process a specific released version
         of the package at each iteration step. The following operations will
         be performed:
 
             1. Create a temporary directory to download and extract the
                release tarball
             2. Download the tarball
             3. Check downloaded tarball integrity
             4. Uncompress the tarball
             5. Parse ``package.json`` file associated to the package version
             6. Extract author from the parsed ``package.json`` file
 
         Args:
             known_versions (dict): may be provided by the loader, it enables
                 filtering out versions already ingested in the archive.
 
         Yields:
             Tuple[dict, dict, dict, str]: tuples containing the following
             members:
 
                 * a dict holding the parsed ``package.json`` file
                 * a dict holding package author information
                 * a dict holding package tarball information
                 * a string holding the path of the uncompressed package to
                   load into the archive
         """
         new_versions = self.package_versions(known_versions)
         for version, package_source_data in sorted(new_versions.items()):
             # filter out versions with a missing tarball (cases exist);
             # the package visit will be marked as partial at the end of
             # the loading process
             tarball_url = package_source_data['url']
             tarball_request = self._request(tarball_url, throw_error=False)
             if tarball_request.status_code == 404:
                 self.log.debug('Tarball url %s returns a 404 error.',
                                tarball_url)
                 self.log.debug(('Version %s of %s package will be missing and '
                                 'the visit will be marked as partial.'),
                                version[0], self.package)
                 continue
             version_data = self.package_metadata['versions'][version[0]]
             yield self._prepare_package_version(package_source_data,
                                                 version_data)
 
     def _prepare_package_version(self, package_source_data, version_data):
         version = version_data['version']
         self.log.debug('Processing version %s for npm package %s',
                        version, self.package)
 
         # create temp dir to download and extract package tarball
         path = os.path.join(self.temp_dir, version)
         os.makedirs(path, exist_ok=True)
         filepath = os.path.join(path, package_source_data['filename'])
 
         # download tarball
         url = package_source_data['url']
         response = self._request(url)
         hash_names = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'}
         h = hashutil.MultiHash(hash_names=hash_names)
         with open(filepath, 'wb') as f:
             for chunk in response.iter_content(chunk_size=None):
                 h.update(chunk)
                 f.write(chunk)
 
         # check tarball integrity
         hashes = h.hexdigest()
         expected_digest = package_source_data['sha1']
         actual_digest = hashes['sha1']
         if actual_digest != expected_digest:
             raise ValueError(
                 '%s %s: Checksum mismatched: %s != %s' % (
                     self.package, version, expected_digest, actual_digest))
 
         # uncompress tarball
         tarball.uncompress(filepath, path)
 
         # remove tarball
         os.remove(filepath)
 
         # do not archive useless tarball root directory
         package_path = os.path.join(path, 'package')
-        # some old packages use their name as root directory
+        # some old packages use a root directory with a different name
         if not os.path.exists(package_path):
-            ver_pos = package_source_data['filename'].rfind(version)
-            package_name = package_source_data['filename'][:ver_pos-1]
-            package_path = os.path.join(path, package_name)
-            # fallback: archive root tarball directory
-            if not os.path.exists(package_path):
-                package_path = path
+            for _, dirnames, _ in os.walk(path):
+                if dirnames:
+                    package_path = os.path.join(path, dirnames[0])
+                    break
         self.log.debug('Package local path: %s', package_path)
 
         package_source_data.update(hashes)
 
         # parse package.json file to add its content to revision metadata
         package_json_path = os.path.join(package_path, 'package.json')
         package_json = {}
         with open(package_json_path, 'rb') as package_json_file:
             package_json_bytes = package_json_file.read()
-            file_encoding = chardet.detect(package_json_bytes)['encoding']
-            package_json = json.loads(package_json_bytes.decode(file_encoding))
+            package_json = load_json(package_json_bytes)
 
         # extract author from package.json
         author = extract_npm_package_author(package_json)
 
         return (package_json, author,
                 package_source_data, package_path)
 
     def _request(self, url, throw_error=True):
         response = self.session.get(url, **self.params, stream=True)
         if response.status_code != 200 and throw_error:
             raise ValueError("Fail to query '%s'. Reason: %s" % (
                 url, response.status_code))
         return response
diff --git a/swh/loader/npm/tests/common.py b/swh/loader/npm/tests/common.py
index 30cfb06..1a71b53 100644
--- a/swh/loader/npm/tests/common.py
+++ b/swh/loader/npm/tests/common.py
@@ -1,82 +1,80 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import json
 import os
 import os.path
 
-import chardet
+from swh.loader.npm.utils import load_json
 
 RESOURCES_PATH = os.path.join(os.path.dirname(__file__), 'resources')
 
 empty_package = '22u-validators'
 package = 'org'
 package_non_utf8_encoding = '0b'
 
 
 def package_url(package):
     return 'https://www.npmjs.com/package/%s' % package
 
 
 def package_metadata_url(package):
     return 'https://replicate.npmjs.com/%s/' % package
 
 
 def package_metadata_file(package, visit=''):
     json_filename = '%s_metadata' % package
     if visit:
         json_filename += '_visit%s' % visit
     json_filename += '.json'
     return json_filename
 
 
 class _MockedFileStream():
     def __init__(self, file_data):
         self.file_data = file_data
         self.closed = False
 
     def read(self):
         self.closed = True
         return self.file_data
 
 
 def init_test_data(m, package_metadata_json_file, package_metadata_url):
     package_metadata_filepath = os.path.join(RESOURCES_PATH,
                                              package_metadata_json_file)
 
     with open(package_metadata_filepath, 'rb') as json_file:
         json_file_bytes = json_file.read()
-        file_encoding = chardet.detect(json_file_bytes)['encoding']
-        package_metadata = json.loads(json_file_bytes.decode(file_encoding))
+        package_metadata = load_json(json_file_bytes)
 
     m.register_uri('GET', package_metadata_url, json=package_metadata)
 
     for v, v_data in package_metadata['versions'].items():
         tarball_url = v_data['dist']['tarball']
         tarball_filename = tarball_url.split('/')[-1]
         tarball_filepath = os.path.join(RESOURCES_PATH, 'tarballs',
                                         tarball_filename)
         with open(tarball_filepath, mode='rb') as tarball_file:
             tarball_content = tarball_file.read()
         m.register_uri('GET', tarball_url,
                        body=_MockedFileStream(tarball_content))
 
     return package_metadata
 
 
 def get_package_versions_data(package_metadata):
     versions_data = {}
     for v, v_data in package_metadata['versions'].items():
         shasum = v_data['dist']['shasum']
         versions_data[(v, shasum)] = {
             'name': package,
             'version': v,
             'sha1': shasum,
             'url': v_data['dist']['tarball'],
             'filename': v_data['dist']['tarball'].split('/')[-1],
             'date': package_metadata['time'][v]
         }
     return versions_data
diff --git a/swh/loader/npm/utils.py b/swh/loader/npm/utils.py
index 04d474c..263bb2a 100644
--- a/swh/loader/npm/utils.py
+++ b/swh/loader/npm/utils.py
@@ -1,122 +1,164 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import json
 import re
 
+from codecs import BOM_UTF8
+
+import chardet
+
 _EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
 
 # https://github.com/jonschlinkert/author-regex
 _author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
 
 
 def parse_npm_package_author(author_str):
     """
     Parse npm package author string.
     It works with a flexible range of formats, as detailed below::
 
         name
         name <email> (url)
         name <email>(url)
         name<email> (url)
         name<email>(url)
         name (url) <email>
         name (url)<email>
         name(url) <email>
         name(url)<email>
         name (url)
         name(url)
         name <email>
         name<email>
         <email> (url)
         <email>(url)
         (url) <email>
         (url)<email>
         <email>
         (url)
 
     Args:
         author_str (str): input author string
 
     Returns:
         dict: A dict that may contain the following keys:
             * name
             * email
             * url
     """
     author = {}
     matches = re.findall(_author_regexp,
                          author_str.replace('<>', '').replace('()', ''),
                          re.M)
     for match in matches:
         if match[0].strip():
             author['name'] = match[0].strip()
         if match[1].strip():
             author['email'] = match[1].strip()
         if match[2].strip():
             author['url'] = match[2].strip()
     return author
 
 
 def extract_npm_package_author(package_json):
     """
     Extract package author from a ``package.json`` file content
     and return it in swh format.
 
     Args:
         package_json (dict): Dict holding the content of parsed
             ``package.json`` file
 
     Returns:
         dict: A dict with the following keys:
             * fullname
             * name
             * email
     """
 
     def _author_str(author_data):
         if type(author_data) is dict:
             author_str = ''
             if 'name' in author_data:
                 author_str += author_data['name']
             if 'email' in author_data:
                 author_str += ' <%s>' % author_data['email']
             return author_str
         elif type(author_data) is list:
             return _author_str(author_data[0]) if len(author_data) > 0 else ''
         else:
             return author_data
 
     author_data = {}
     for author_key in ('author', 'authors'):
         if author_key in package_json:
             author_str = _author_str(package_json[author_key])
             author_data = parse_npm_package_author(author_str)
 
     name = author_data.get('name')
     email = author_data.get('email')
 
     fullname = None
 
     if name and email:
         fullname = '%s <%s>' % (name, email)
     elif name:
         fullname = name
 
     if not fullname:
         return _EMPTY_AUTHOR
 
     if fullname:
         fullname = fullname.encode('utf-8')
 
     if name:
         name = name.encode('utf-8')
 
     if email:
         email = email.encode('utf-8')
 
     return {'fullname': fullname, 'name': name, 'email': email}
+
+
+def _lstrip_bom(s, bom=BOM_UTF8):
+    if s.startswith(bom):
+        return s[len(bom):]
+    else:
+        return s
+
+
+def load_json(json_bytes):
+    """
+    Try to load JSON from bytes and return a dictionary.
+
+    First try to decode from utf-8. If the decoding fails,
+    try to detect the encoding and decode again with replace
+    error handling.
+
+    If JSON is malformed, an empty dictionary will be returned.
+
+    Args:
+        json_bytes (bytes): binary content of a JSON file
+
+    Returns:
+        dict: JSON data loaded in a dictionary
+    """
+    json_data = {}
+    try:
+        json_str = _lstrip_bom(json_bytes).decode('utf-8')
+    except UnicodeDecodeError:
+        # default to utf-8 so json_str is always bound, even
+        # when chardet cannot detect the encoding
+        encoding = chardet.detect(json_bytes)['encoding'] or 'utf-8'
+        json_str = json_bytes.decode(encoding, 'replace')
+    try:
+        json_data = json.loads(json_str)
+    except json.decoder.JSONDecodeError:
+        pass
+    return json_data
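
Note (not part of the patch): a minimal sketch of how the new load_json
helper behaves on the three inputs it is designed to handle, assuming the
patched swh.loader.npm.utils is importable; the byte strings are made-up
examples, and chardet's guess on such short inputs may vary.

    from codecs import BOM_UTF8

    from swh.loader.npm.utils import load_json

    # UTF-8 content with a leading BOM: the BOM is stripped before decoding
    print(load_json(BOM_UTF8 + b'{"name": "org"}'))   # {'name': 'org'}

    # non UTF-8 content: decoding falls back to chardet detection with
    # 'replace' error handling instead of raising UnicodeDecodeError
    print(load_json('{"name": "caf\xe9"}'.encode('latin-1')))

    # malformed JSON: an empty dict is returned instead of raising
    print(load_json(b'{not json'))                    # {}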
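
Likewise, a quick illustration of the author parsing helpers that consume
the load_json output (these helpers are unchanged by this patch; the author
values below are hypothetical):

    from swh.loader.npm.utils import (extract_npm_package_author,
                                      parse_npm_package_author)

    # one of the supported formats: name <email> (url)
    print(parse_npm_package_author(
        'Jane Doe <jdoe@example.org> (https://example.org)'))
    # {'name': 'Jane Doe', 'email': 'jdoe@example.org',
    #  'url': 'https://example.org'}

    # structured author entry from a parsed package.json file
    print(extract_npm_package_author(
        {'author': {'name': 'Jane Doe', 'email': 'jdoe@example.org'}}))
    # {'fullname': b'Jane Doe <jdoe@example.org>', 'name': b'Jane Doe',
    #  'email': b'jdoe@example.org'}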
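
Finally, the client.py hunk replaces filename-based guessing of the tarball
root directory with a directory walk. A standalone sketch of that logic
(hypothetical helper name; it assumes the package files live in a single
top-level directory of the extraction path, as npm tarballs do):

    import os

    def find_package_root(path):
        # conventional npm layout: tarball content lives under 'package/'
        package_path = os.path.join(path, 'package')
        if not os.path.exists(package_path):
            # some old packages use a different root directory name:
            # take the first directory found when walking 'path'
            for _, dirnames, _ in os.walk(path):
                if dirnames:
                    package_path = os.path.join(path, dirnames[0])
                    break
        return package_path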