diff --git a/swh/loader/npm/client.py b/swh/loader/npm/client.py
index 5fccd68..627c483 100644
--- a/swh/loader/npm/client.py
+++ b/swh/loader/npm/client.py
@@ -1,215 +1,215 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import logging
 import os
 
 import chardet
 import requests
 
 from swh.core import tarball
 from swh.model import hashutil
 
 from swh.loader.npm.utils import extract_npm_package_author
 
 
 class NpmClient:
     """
     Helper class internally used by the npm loader to fetch
     metadata for a specific package hosted on the npm registry.
 
     Args:
         temp_dir (str): Path to the temporary disk location used
             to uncompress the package tarballs
     """
     def __init__(self, temp_dir, log=None):
         self.root_temp_dir = temp_dir
         self.session = requests.session()
         self.params = {
             'headers': {
                 'User-Agent': 'Software Heritage npm loader'
             }
         }
         self.log = log or logging
 
     def fetch_package_metadata(self, package_metadata_url):
         """
         Fetch metadata for a given package and make it the focused one.
         This must be called prior any other operations performed
         by the other methods below.
 
         Args:
             package_metadata_url: the package metadata url provided
                 by the npm loader
         """
         self.package_metadata_url = package_metadata_url
         self.package_metadata = self._request(self.package_metadata_url).json()
         self.package = self.package_metadata['name']
         self.temp_dir = os.path.join(self.root_temp_dir, self.package)
 
     def latest_package_version(self):
         """
         Return the last released version of the focused package.
 
         Returns:
             str: the last releases package version
         """
         latest = ''
         if 'latest' in self.package_metadata['dist-tags']:
             latest = self.package_metadata['dist-tags']['latest']
         return latest
 
     def package_versions(self, known_versions=None):
         """
         Return the available versions for the focused package.
 
         Args:
             known_versions (dict): may be provided by the loader, it enables
                 to filter out versions already ingested in the archive.
 
         Returns:
             dict: A dict whose keys are Tuple[version, tarball_sha1] and
             values dicts with the following entries:
 
                 * **name**: the package name
                 * **version**: the package version
                 * **filename**: the package source tarball filename
                 * **sha1**: the package source tarball sha1 checksum
                 * **date**: the package release date
                 * **url**: the package source tarball download url
         """
         versions = {}
         if 'versions' in self.package_metadata:
             for version, data in self.package_metadata['versions'].items():
                 sha1 = data['dist']['shasum']
                 key = (version, sha1)
                 if known_versions and key in known_versions:
                     continue
                 tarball_url = data['dist']['tarball']
                 filename = os.path.basename(tarball_url)
                 date = self.package_metadata['time'][version]
                 versions[key] = {
                     'name': self.package,
                     'version': version,
                     'filename': filename,
                     'sha1': sha1,
                     'date': date,
                     'url': tarball_url
                 }
         return versions
 
     def prepare_package_versions(self, known_versions=None):
         """
         Instantiate a generator that will process a specific package released
         version at each iteration step. The following operations will be
         performed:
 
             1. Create a temporary directory to download and extract the
                release tarball
             2. Download the tarball
             3. Check downloaded tarball integrity
             4. Uncompress the tarball
             5. Parse ``package.json`` file associated to the package version
             6. Extract author from the parsed ``package.json`` file
 
         Args:
             known_versions (dict): may be provided by the loader, it enables
                 to filter out versions already ingested in the archive.
 
         Yields:
             Tuple[dict, dict, dict, str]: tuples containing the following
             members:
 
                 * a dict holding the parsed ``package.json`` file
                 * a dict holding package author information
                 * a dict holding package tarball information
                 * a string holding the path of the uncompressed package to
                   load into the archive
         """
         new_versions = self.package_versions(known_versions)
         for version, package_source_data in sorted(new_versions.items()):
             # filter out version with missing tarball (cases exist),
             # package visit will be marked as partial at the end of
             # the loading process
             tarball_url = package_source_data['url']
             tarball_request = self._request(tarball_url, throw_error=False)
             if tarball_request.status_code == 404:
                 self.log.debug('Tarball url %s returns a 404 error.' %
                                tarball_url)
                 self.log.debug(('Version %s of %s package will be missing '
                                 'and the visit will be marked as partial.') %
                                (version[0], self.package))
                 continue
             version_data = self.package_metadata['versions'][version[0]]
             yield self._prepare_package_version(package_source_data,
                                                 version_data)
 
     def _prepare_package_version(self, package_source_data, version_data):
         version = version_data['version']
         self.log.debug('Processing version %s for npm package %s' %
                        (version, self.package))
 
         # create temp dir to download and extract package tarball
         path = os.path.join(self.temp_dir, version)
         os.makedirs(path, exist_ok=True)
         filepath = os.path.join(path, package_source_data['filename'])
         self.log.debug('Package local path: %s' % filepath)
 
         # download tarball
         url = package_source_data['url']
         response = self._request(url)
         hash_names = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'}
         h = hashutil.MultiHash(hash_names=hash_names)
         with open(filepath, 'wb') as f:
             for chunk in response.iter_content(chunk_size=None):
                 h.update(chunk)
                 f.write(chunk)
 
         # check tarball integrity
         hashes = h.hexdigest()
         expected_digest = package_source_data['sha1']
         actual_digest = hashes['sha1']
         if actual_digest != expected_digest:
             raise ValueError(
                 '%s %s: Checksum mismatched: %s != %s' % (
                     self.package, version, expected_digest, actual_digest))
 
         # uncompress tarball
         tarball.uncompress(filepath, path)
 
         # do not archive useless tarball root directory
         package_path = os.path.join(path, 'package')
 
         # some old packages use their name as root directory
         if not os.path.exists(package_path):
             ver_pos = package_source_data['filename'].rfind(version)
             package_name = package_source_data['filename'][:ver_pos-1]
             package_path = os.path.join(path, package_name)
 
         # fallback: archive root tarball directory
         if not os.path.exists(package_path):
             package_path = path
 
         package_source_data.update(hashes)
 
         # parse package.json file to add its content to revision metadata
         package_json_path = os.path.join(package_path, 'package.json')
         package_json = {}
-        with open(package_json_path, "rb") as package_json_file:
+        with open(package_json_path, 'rb') as package_json_file:
             package_json_bytes = package_json_file.read()
             file_encoding = chardet.detect(package_json_bytes)['encoding']
             package_json = json.loads(package_json_bytes.decode(file_encoding))
 
         # extract author from package.json
         author = extract_npm_package_author(package_json)
 
         return (package_json, author, package_source_data, package_path)
 
     def _request(self, url, throw_error=True):
         response = self.session.get(url, **self.params, stream=True)
         if response.status_code != 200 and throw_error:
             raise ValueError("Fail to query '%s'. Reason: %s" % (
                 url, response.status_code))
         return response
diff --git a/swh/loader/npm/tests/common.py b/swh/loader/npm/tests/common.py
index e07ba3d..30cfb06 100644
--- a/swh/loader/npm/tests/common.py
+++ b/swh/loader/npm/tests/common.py
@@ -1,78 +1,82 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import os
 import os.path
 
+import chardet
+
 RESOURCES_PATH = os.path.join(os.path.dirname(__file__), 'resources')
 
 empty_package = '22u-validators'
 package = 'org'
 package_non_utf8_encoding = '0b'
 
 
 def package_url(package):
     return 'https://www.npmjs.com/package/%s' % package
 
 
 def package_metadata_url(package):
     return 'https://replicate.npmjs.com/%s/' % package
 
 
 def package_metadata_file(package, visit=''):
     json_filename = '%s_metadata' % package
     if visit:
         json_filename += '_visit%s' % visit
     json_filename += '.json'
     return json_filename
 
 
 class _MockedFileStream():
     def __init__(self, file_data):
         self.file_data = file_data
         self.closed = False
 
     def read(self):
         self.closed = True
         return self.file_data
 
 
 def init_test_data(m, package_metadata_json_file, package_metadata_url):
     package_metadata_filepath = os.path.join(RESOURCES_PATH,
                                              package_metadata_json_file)
 
-    with open(package_metadata_filepath) as json_file:
-        package_metadata = json.load(json_file)
+    with open(package_metadata_filepath, 'rb') as json_file:
+        json_file_bytes = json_file.read()
+        file_encoding = chardet.detect(json_file_bytes)['encoding']
+        package_metadata = json.loads(json_file_bytes.decode(file_encoding))
 
     m.register_uri('GET', package_metadata_url, json=package_metadata)
 
     for v, v_data in package_metadata['versions'].items():
         tarball_url = v_data['dist']['tarball']
         tarball_filename = tarball_url.split('/')[-1]
         tarball_filepath = os.path.join(RESOURCES_PATH, 'tarballs',
                                         tarball_filename)
         with open(tarball_filepath, mode='rb') as tarball_file:
             tarball_content = tarball_file.read()
         m.register_uri('GET', tarball_url,
                        body=_MockedFileStream(tarball_content))
 
     return package_metadata
 
 
 def get_package_versions_data(package_metadata):
     versions_data = {}
     for v, v_data in package_metadata['versions'].items():
         shasum = v_data['dist']['shasum']
         versions_data[(v, shasum)] = {
             'name': package,
             'version': v,
             'sha1': shasum,
             'url': v_data['dist']['tarball'],
             'filename': v_data['dist']['tarball'].split('/')[-1],
             'date': package_metadata['time'][v]
         }
     return versions_data
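
Review notes (not part of the patch):

The client.py hunk only normalizes the quote style of the existing
open(package_json_path, 'rb') call; the chardet-based decoding around it is
unchanged context. The substantive change is in tests/common.py, where
init_test_data() now decodes the metadata fixture the same way
NpmClient._prepare_package_version() already decodes package.json: read the
raw bytes, let chardet guess the encoding, then decode before parsing the
JSON. Together with the package_non_utf8_encoding = '0b' fixture, this lets
the tests cover packages whose metadata is not UTF-8 encoded. A minimal
standalone sketch of the shared pattern follows; the helper name and the
UTF-8 fallback are illustrative additions, not code from this patch:

    import json

    import chardet

    def load_json_bytes(raw_bytes, fallback_encoding='utf-8'):
        # chardet.detect() returns a dict such as
        # {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}.
        # The fallback is an addition for this sketch: detect() yields
        # {'encoding': None} for inputs it cannot classify, a case the
        # patched code does not guard against.
        encoding = chardet.detect(raw_bytes)['encoding'] or fallback_encoding
        return json.loads(raw_bytes.decode(encoding))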
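
For reviewers unfamiliar with the loader, the intended call sequence for
NpmClient (per the docstrings in the hunk above, fetch_package_metadata()
must be called before the other methods) looks roughly like the sketch
below; the registry URL matches the one the tests mock for the 'org'
package, and the temporary-directory handling is a placeholder:

    import tempfile

    from swh.loader.npm.client import NpmClient

    with tempfile.TemporaryDirectory() as temp_dir:
        client = NpmClient(temp_dir)
        # must be called prior to any other operation (see its docstring)
        client.fetch_package_metadata('https://replicate.npmjs.com/org/')
        for package_json, author, tarball_info, dir_path in \
                client.prepare_package_versions():
            print(tarball_info['version'], dir_path)

Note that run as-is this performs real network requests against the npm
registry; the tests avoid that by registering the same URLs with a request
mocker in init_test_data().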