diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -1,4 +1,84 @@
 swh-loader-npm
 ==============
-Software Heritage loader to ingest npm packages into the archive.
\ No newline at end of file
+Software Heritage loader to ingest [`npm`](https://www.npmjs.com/) packages into the archive.
+
+# What does the loader do?
+
+The npm loader visits and loads an npm package [1].
+
+Each visit will result in:
+- 1 snapshot (which targets n revisions; 1 per package release version)
+- n revisions (each targeting 1 directory; the uncompressed content of a package release version)
+
+[1] https://docs.npmjs.com/about-packages-and-modules
+
+## First visit
+
+Given an npm package (origin), the loader, for the first visit:
+
+- retrieves information about the given package (notably its released versions)
+- then, for each associated released version:
+  - retrieves the associated tarball (with integrity checks)
+  - uncompresses the archive locally
+  - computes the hashes of the uncompressed directory
+  - then creates a revision (using the ``package.json`` metadata file) targeting that directory
+- finally, creates a snapshot targeting all seen revisions (uncompressed npm package released versions and metadata).
+
+## Next visit
+
+The loader starts by checking if anything changed since the last visit. If nothing changed, the visit's snapshot is left unchanged and the new visit targets the same snapshot.
+
+If something changed, the already seen package release versions are skipped and only the new ones are loaded. In the end, the loader creates a new snapshot based on the previous one, so the new snapshot targets both the old and the new package release versions.
+
+# Development
+
+## Configuration file
+
+### Location
+
+Either:
+- `/etc/softwareheritage/loader/npm.yml`
+- `~/.config/swh/loader/npm.yml`
+
+### Configuration sample
+
+```lang=yaml
+storage:
+  cls: remote
+  args:
+    url: http://localhost:5002/
+
+debug: false
+```
+
+## Local run
+
+The built-in command line will run the loader for a specified npm package.
+
+For instance, to load `jquery`:
+```lang=bash
+$ python3 -m swh.loader.npm.loader jquery
+```
+
+If you need more control, you can use the loader directly. It expects
+three arguments:
+- `package_name` (required): an npm package name
+- `package_url` (optional): URL of the npm package description (human-readable HTML page) that will be used as the associated origin URL in the archive
+- `package_metadata_url` (optional): URL of the npm package metadata information (machine-parsable JSON document)
+
+```lang=python
+import logging
+
+from urllib.parse import quote
+
+from swh.loader.npm.loader import NpmLoader
+
+logging.basicConfig(level=logging.DEBUG)
+
+package_name = 'webpack'
+
+NpmLoader().load(package_name,
+                 'https://www.npmjs.com/package/%s/' % package_name,
+                 'https://replicate.npmjs.com/%s/' % quote(package_name, safe=''))
+```
\ No newline at end of file
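To make the snapshot layout described in the README concrete, here is an illustrative sketch of what a visit produces (branch names follow the `releases/<version>` scheme used by `_generate_and_load_snapshot` later in this diff; package versions and targets are made up):

```lang=python
# Illustrative sketch only: a snapshot for a package with two releases.
# 'HEAD' aliases the branch of the latest released version.
snapshot = {
    'branches': {
        b'releases/1.0.0': {'target_type': 'revision', 'target': b'<revision id>'},
        b'releases/1.1.0': {'target_type': 'revision', 'target': b'<revision id>'},
        b'HEAD': {'target_type': 'alias', 'target': b'releases/1.1.0'},
    },
}
```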
diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,4 +1,4 @@
-.. _swh-py-template:
+.. _swh-loader-npm:
 
 Software Heritage - npm loader
 ==============================
@@ -9,10 +9,4 @@
    :maxdepth: 2
    :caption: Contents:
 
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
+   /apidoc/swh.loader.npm
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
-swh.core
+swh.core >= 0.0.57
 swh.model >= 0.0.28
-swh.storage >= 0.0.108
+swh.storage >= 0.0.131
 swh.scheduler
-swh.loader.core >= 0.0.35
+swh.loader.core >= 0.0.40
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
+python-dateutil
+requests
 setuptools
 vcversioner
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -44,6 +44,7 @@
     author_email='swh-devel@inria.fr',
     url='https://forge.softwareheritage.org/source/swh-loader-npm.git',
     packages=find_packages(),
+    scripts=[],
     install_requires=parse_requirements() + parse_requirements('swh'),
     tests_require=parse_requirements('test'),
     setup_requires=['vcversioner'],
diff --git a/swh/loader/__init__.py b/swh/loader/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/__init__.py
@@ -0,0 +1 @@
+__path__ = __import__('pkgutil').extend_path(__path__, __name__)
diff --git a/swh/loader/npm/client.py b/swh/loader/npm/client.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/npm/client.py
@@ -0,0 +1,209 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import json
+import logging
+import os
+
+import requests
+
+from swh.core import tarball
+from swh.model import hashutil
+
+from swh.loader.npm.utils import extract_npm_package_author
+
+
+class NpmClient:
+    """
+    Helper class internally used by the npm loader to fetch
+    metadata for a specific package hosted on the npm registry.
+
+    Args:
+        temp_dir (str): Path to the temporary disk location used
+            to uncompress the package tarballs
+    """
+    def __init__(self, temp_dir, log=None):
+        self.root_temp_dir = temp_dir
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage npm loader'
+            }
+        }
+        self.log = log or logging
+
+    def fetch_package_metadata(self, package_metadata_url):
+        """
+        Fetch metadata for a given package and make it the focused one.
+        This must be called prior to any other operation performed
+        by the methods below.
+
+        Args:
+            package_metadata_url: the package metadata url provided
+                by the npm loader
+        """
+        self.package_metadata_url = package_metadata_url
+        self.package_metadata = self._request(self.package_metadata_url).json()
+        self.package = self.package_metadata['name']
+        self.temp_dir = os.path.join(self.root_temp_dir, self.package)
+
+    def latest_package_version(self):
+        """
+        Return the latest released version of the focused package.
+
+        Returns:
+            str: the latest released package version
+        """
+        return self.package_metadata['dist-tags']['latest']
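As a quick illustration of the client API above, a minimal sketch (it assumes network access to the npm registry; the package name is an arbitrary example):

```lang=python
import tempfile

from swh.loader.npm.client import NpmClient

# Hypothetical usage: fetch metadata for 'left-pad' and query the
# latest released version, mirroring what the loader does internally.
with tempfile.TemporaryDirectory() as tmp:
    client = NpmClient(tmp)
    client.fetch_package_metadata('https://replicate.npmjs.com/left-pad/')
    print(client.latest_package_version())
```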
+
+    def package_versions(self, known_versions=None):
+        """
+        Return the available versions for the focused package.
+
+        Args:
+            known_versions (dict): may be provided by the loader to
+                filter out versions already ingested in the archive
+
+        Returns:
+            dict: A dict whose keys are Tuple[version, tarball_sha1] and
+            values dicts with the following entries:
+
+                * **name**: the package name
+                * **version**: the package version
+                * **filename**: the package source tarball filename
+                * **sha1**: the package source tarball sha1 checksum
+                * **date**: the package release date
+                * **url**: the package source tarball download url
+        """
+        versions = {}
+        if 'versions' in self.package_metadata:
+            for version, data in self.package_metadata['versions'].items():
+                sha1 = data['dist']['shasum']
+                key = (version, sha1)
+                if known_versions and key in known_versions:
+                    continue
+                tarball_url = data['dist']['tarball']
+                filename = os.path.basename(tarball_url)
+                date = self.package_metadata['time'][version]
+                versions[key] = {
+                    'name': self.package,
+                    'version': version,
+                    'filename': filename,
+                    'sha1': sha1,
+                    'date': date,
+                    'url': tarball_url
+                }
+        return versions
+
+    def prepare_package_versions(self, known_versions=None):
+        """
+        Instantiate a generator that will process a specific package released
+        version at each iteration step. The following operations will be
+        performed:
+
+        1. Create a temporary directory to download and extract the
+           release tarball
+        2. Download the tarball
+        3. Check the downloaded tarball integrity
+        4. Uncompress the tarball
+        5. Parse the ``package.json`` file associated to the package version
+        6. Extract the author from the parsed ``package.json`` file
+
+        Args:
+            known_versions (dict): may be provided by the loader to
+                filter out versions already ingested in the archive
+
+        Yields:
+            Tuple[dict, dict, dict, str]: tuples containing the following
+            members:
+
+                * a dict holding the parsed ``package.json`` file
+                * a dict holding package author information
+                * a dict holding package tarball information
+                * a string holding the path of the uncompressed package to
+                  load into the archive
+        """
+        new_versions = self.package_versions(known_versions)
+        for version, package_source_data in sorted(new_versions.items()):
+            # filter out versions with a missing tarball (such cases exist);
+            # the package visit will be marked as partial at the end of
+            # the loading process
+            tarball_url = package_source_data['url']
+            tarball_request = self._request(tarball_url,
+                                            throw_error=False)
+            if tarball_request.status_code == 404:
+                self.log.debug('Tarball url %s returns a 404 error.' %
+                               tarball_url)
+                self.log.debug(('Version %s of %s package will be missing and '
+                                'the visit will be marked as partial.') %
+                               (version[0], self.package))
+                continue
+            version_data = self.package_metadata['versions'][version[0]]
+            yield self._prepare_package_version(package_source_data,
+                                                version_data)
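Driven by hand, the generator above would be consumed like this (a sketch; it assumes an `NpmClient` prepared as in the previous example and network access to the registry):

```lang=python
# Minimal sketch: 'client' is an NpmClient on which
# fetch_package_metadata() has already been called.
for package_json, author, source_data, dir_path in \
        client.prepare_package_versions(known_versions={}):
    print(source_data['version'], source_data['sha1'], dir_path)
```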
+
+    def _prepare_package_version(self, package_source_data, version_data):
+        version = version_data['version']
+        self.log.debug('Processing version %s for npm package %s' %
+                       (version, self.package))
+
+        # create a temp dir to download and extract the package tarball
+        path = os.path.join(self.temp_dir, version)
+        os.makedirs(path, exist_ok=True)
+        filepath = os.path.join(path, package_source_data['filename'])
+        self.log.debug('Package local path: %s' % filepath)
+
+        # download the tarball while computing its checksums
+        url = package_source_data['url']
+        response = self._request(url)
+        hash_names = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'}
+        h = hashutil.MultiHash(hash_names=hash_names)
+        with open(filepath, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=None):
+                h.update(chunk)
+                f.write(chunk)
+
+        # check the tarball integrity
+        hashes = h.hexdigest()
+        expected_digest = package_source_data['sha1']
+        actual_digest = hashes['sha1']
+        if actual_digest != expected_digest:
+            raise ValueError(
+                '%s %s: Checksum mismatch: %s != %s' % (
+                    self.package, version, expected_digest, actual_digest))
+
+        # uncompress the tarball
+        tarball.uncompress(filepath, path)
+
+        # do not archive the useless tarball root directory
+        package_path = os.path.join(path, 'package')
+        # some old packages use their name as the root directory
+        if not os.path.exists(package_path):
+            ver_pos = package_source_data['filename'].rfind(version)
+            package_name = package_source_data['filename'][:ver_pos-1]
+            package_path = os.path.join(path, package_name)
+            # fallback: archive the tarball root directory
+            if not os.path.exists(package_path):
+                package_path = path
+
+        package_source_data.update(hashes)
+
+        # parse the package.json file to add its content to revision metadata
+        package_json_path = os.path.join(package_path, 'package.json')
+        package_json = {}
+        with open(package_json_path, "r") as package_json_file:
+            package_json = json.load(package_json_file)
+
+        # extract the author from package.json
+        author = extract_npm_package_author(package_json)
+
+        return (package_json, author, package_source_data, package_path)
+
+    def _request(self, url, throw_error=True):
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200 and throw_error:
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        return response
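The streaming download-and-hash pattern used in `_prepare_package_version` can be isolated as follows (a sketch under the assumption that `swh.model.hashutil.MultiHash` behaves as used above; the function name and URL handling are illustrative, not part of the patch):

```lang=python
import requests

from swh.model import hashutil

# Hypothetical standalone version of the integrity check: hash a
# tarball while streaming it, then compare against an expected sha1.
def tarball_matches_sha1(url, expected_sha1):
    h = hashutil.MultiHash(hash_names={'sha1'})
    response = requests.get(url, stream=True)
    for chunk in response.iter_content(chunk_size=None):
        h.update(chunk)
    return h.hexdigest()['sha1'] == expected_sha1
```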
Reason: %s" % ( + url, response.status_code)) + return response diff --git a/swh/loader/npm/loader.py b/swh/loader/npm/loader.py new file mode 100644 --- /dev/null +++ b/swh/loader/npm/loader.py @@ -0,0 +1,315 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import shutil +from tempfile import mkdtemp +from urllib.parse import quote + +from dateutil import parser as date_parser + +from swh.loader.core.utils import clean_dangling_folders +from swh.loader.core.loader import BufferedLoader +from swh.model.from_disk import Directory +from swh.model.identifiers import ( + revision_identifier, snapshot_identifier, + identifier_to_bytes, normalize_timestamp +) +from swh.storage.algos.snapshot import snapshot_get_all_branches + +from swh.loader.npm.client import NpmClient + + +TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.npm.' + + +class NpmLoader(BufferedLoader): + """ + Loader for ingesting source packages from the npm registry + into the Software Heritage archive. + """ + + CONFIG_BASE_FILENAME = 'loader/npm' + ADDITIONAL_CONFIG = { + 'temp_directory': ('str', '/tmp/swh.loader.npm/'), + 'debug': ('bool', False) + } + + def __init__(self): + super().__init__(logging_class='swh.loader.npm.NpmLoader') + self.origin_id = None + temp_directory = self.config['temp_directory'] + os.makedirs(temp_directory, exist_ok=True) + self.temp_directory = mkdtemp(suffix='-%s' % os.getpid(), + prefix=TEMPORARY_DIR_PREFIX_PATTERN, + dir=temp_directory) + self.debug = self.config['debug'] + self.done = False + self.npm_client = NpmClient(self.temp_directory, self.log) + + def pre_cleanup(self): + """ + To prevent disk explosion if some other workers exploded + in mid-air (OOM killed), we try and clean up dangling files. + """ + if self.debug: + self.log.warning('DEBUG: will not pre-clean up temp dir %s' % + self.temp_directory) + return + clean_dangling_folders(self.config['temp_directory'], + pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, + log=self.log) + + def cleanup(self): + """ + Clean up temporary disk use after downloading and extracting + npm source package tarballs. + """ + if self.debug: + self.log.warning('DEBUG: will not clean up temp dir %s' % + self.temp_directory) + return + if os.path.exists(self.temp_directory): + self.log.debug('Clean up %s' % self.temp_directory) + shutil.rmtree(self.temp_directory) + + def load(self, package_name, package_url=None, + package_metadata_url=None): + """ + Loader entrypoint to ingest source tarballs for a npm package. + + Args: + package_name (str): the name of the npm package + package_url (str): the url of the package description, + if not provided the following one will be used: + https://www.npmjs.com/package/ + package_metadata_url (str): the url for the package JSON metadata, + if not provided the following one will be used: + https://replicate.npmjs.com// + """ + if package_url is None: + package_url = 'https://www.npmjs.com/package/%s' % package_name + if package_metadata_url is None: + package_metadata_url = 'https://replicate.npmjs.com/%s/' %\ + quote(package_name, safe='') + return super().load(package_name, package_url, package_metadata_url) + + def prepare_origin_visit(self, package_name, package_url, + package_metadata_url): + """ + Prepare npm package visit. 
+
+    def prepare_origin_visit(self, package_name, package_url,
+                             package_metadata_url):
+        """
+        Prepare the npm package visit.
+
+        Args:
+            package_name (str): the name of the npm package
+            package_url (str): the url of the package description
+            package_metadata_url (str): the url for the package JSON metadata
+
+        """
+        # reset statuses
+        self._load_status = 'uneventful'
+        self._visit_status = 'full'
+        self.done = False
+        # fetch the npm package metadata from the registry
+        self.npm_client.fetch_package_metadata(package_metadata_url)
+        self.origin = {
+            'url': package_url,
+            'type': 'npm',
+        }
+        self.visit_date = None  # loader core will populate it
+
+    def _known_versions(self, last_snapshot):
+        """
+        Retrieve the known release versions for the npm package
+        (i.e. those already ingested into the archive).
+
+        Args:
+            last_snapshot (dict): last snapshot for the visit
+
+        Returns:
+            dict: Dict whose keys are Tuple[version, sha1] and values
+            are revision ids.
+
+        """
+        if not last_snapshot or 'branches' not in last_snapshot:
+            return {}
+
+        revs = [rev['target']
+                for rev in last_snapshot['branches'].values()
+                if rev and rev['target_type'] == 'revision']
+
+        known_revisions = self.storage.revision_get(revs)
+        ret = {}
+        for revision in known_revisions:
+            if not revision:
+                continue
+            if 'package_source' in revision['metadata']:
+                package = revision['metadata']['package_source']
+                ret[(package['version'], package['sha1'])] = revision['id']
+        return ret
+
+    def _last_snapshot(self):
+        """
+        Retrieve the last snapshot of the npm package if any.
+        """
+        snapshot = self.storage.snapshot_get_latest(self.origin_id)
+        if snapshot and snapshot.pop('next_branch', None):
+            snapshot = snapshot_get_all_branches(self.storage, snapshot['id'])
+        return snapshot
+
+    def prepare(self, package_name, package_url, package_metadata_url):
+        """
+        Prepare the effective loading of source tarballs for an npm
+        package.
+
+        Args:
+            package_name (str): the name of the npm package
+            package_url (str): the url of the package description
+            package_metadata_url (str): the url for the package JSON metadata
+        """
+        self.package_name = package_name
+        self.origin_url = package_url
+        self.package_contents = []
+        self.package_directories = []
+        self.package_revisions = []
+        self.package_load_status = 'uneventful'
+        self.package_visit_status = 'full'
+
+        last_snapshot = self._last_snapshot()
+        self.known_versions = self._known_versions(last_snapshot)
+
+        self.new_versions = \
+            self.npm_client.prepare_package_versions(self.known_versions)
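To make the incremental logic concrete, `_known_versions` produces a mapping of the following shape (all values invented for illustration):

```lang=python
# Hypothetical return value of _known_versions(): (version, tarball sha1)
# pairs already in the archive, mapped to the ingesting revision id.
known_versions = {
    ('1.0.0', '15af0dee8e1d8a4b2cedb98158f8bdac9e5b3760'): b'<revision id>',
    ('1.1.0', '56fa41b1c2ba2a2c1a58e84b2d9c4d9a7d9fdafc'): b'<revision id>',
}
```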
+
+    def fetch_data(self):
+        """
+        Called once per package release version to process.
+
+        Each call will:
+        - download the tarball associated to a package release version
+        - uncompress it and compute the necessary information
+        - compute the swh objects
+
+        Returns:
+            True as long as there is data to fetch
+
+        """
+        data = None
+        if self.done:
+            return False
+
+        try:
+            data = next(self.new_versions)
+            self.package_load_status = 'eventful'
+        except StopIteration:
+            self.done = True
+            return False
+
+        package_metadata, author, package_source_data, dir_path = data
+
+        dir_path = dir_path.encode('utf-8')
+        directory = Directory.from_disk(path=dir_path, data=True)
+        objects = directory.collect()
+
+        self.package_contents = objects['content'].values()
+        self.package_directories = objects['directory'].values()
+
+        date = date_parser.parse(package_source_data['date'])
+        date = normalize_timestamp(int(date.timestamp()))
+
+        message = package_source_data['version'].encode('ascii')
+
+        revision = {
+            'synthetic': True,
+            'metadata': {
+                'package_source': package_source_data,
+                'package': package_metadata,
+            },
+            'author': author,
+            'date': date,
+            'committer': author,
+            'committer_date': date,
+            'message': message,
+            'directory': directory.hash,
+            'parents': [],
+            'type': 'tar',
+        }
+        revision['id'] = identifier_to_bytes(revision_identifier(revision))
+
+        self.package_revisions.append(revision)
+
+        package_key = (package_source_data['version'],
+                       package_source_data['sha1'])
+        self.known_versions[package_key] = revision['id']
+
+        return not self.done
+
+    def _target_from_version(self, version, sha1):
+        """
+        Return revision information, if any, for a given package version.
+        """
+        target = self.known_versions.get((version, sha1))
+        return {
+            'target': target,
+            'target_type': 'revision',
+        } if target else None
+
+    def _generate_and_load_snapshot(self):
+        """
+        Generate the snapshot for the npm package visit.
+        """
+        branches = {}
+        latest_version = self.npm_client.latest_package_version()
+        for version_data in self.npm_client.package_versions().values():
+            version = version_data['version']
+            sha1 = version_data['sha1']
+            branch_name = ('releases/%s' % version).encode('ascii')
+            target = self._target_from_version(version, sha1)
+            branches[branch_name] = target
+            if version == latest_version:
+                branches[b'HEAD'] = {
+                    'target_type': 'alias',
+                    'target': branch_name,
+                }
+            if not target:
+                self.package_visit_status = 'partial'
+        snapshot = {
+            'branches': branches,
+        }
+        snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot))
+
+        self.maybe_load_snapshot(snapshot)
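The date handling in `fetch_data` converts the registry's ISO 8601 timestamp into the normalized form the revision identifier expects. In isolation (a sketch using a made-up timestamp; the printed shape reflects `normalize_timestamp` as used above):

```lang=python
from dateutil import parser as date_parser

from swh.model.identifiers import normalize_timestamp

# Registry 'time' entries are ISO 8601 strings; the revision wants a
# normalized timestamp dict.
date = date_parser.parse('2019-01-21T10:32:09Z')
print(normalize_timestamp(int(date.timestamp())))
# e.g. {'timestamp': {'seconds': 1548066729, 'microseconds': 0},
#       'offset': 0, 'negative_utc': False}
```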
+
+    def store_data(self):
+        """
+        Send collected objects to storage.
+        """
+        self.maybe_load_contents(self.package_contents)
+        self.maybe_load_directories(self.package_directories)
+        self.maybe_load_revisions(self.package_revisions)
+
+        if self.done:
+            self._generate_and_load_snapshot()
+            self.flush()
+
+    def load_status(self):
+        return {
+            'status': self.package_load_status,
+        }
+
+    def visit_status(self):
+        return self.package_visit_status
+
+
+if __name__ == '__main__':
+    import logging
+    import sys
+    logging.basicConfig(level=logging.DEBUG)
+    if len(sys.argv) != 2:
+        logging.error('Usage: %s <package_name>' % sys.argv[0])
+        sys.exit(1)
+    package_name = sys.argv[1]
+    loader = NpmLoader()
+    loader.load(package_name)
diff --git a/swh/loader/npm/utils.py b/swh/loader/npm/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/npm/utils.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import re
+
+_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
+
+# https://github.com/jonschlinkert/author-regex
+_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
+
+
+def parse_npm_package_author(author_str):
+    """
+    Parse an npm package author string.
+
+    It works with a flexible range of formats, as detailed below::
+
+        name
+        name <email>
+        name <email> (url)
+        name <email>(url)
+        name<email> (url)
+        name<email>(url)
+        name (url)
+        name(url)
+        <email> (url)
+        <email>(url)
+        <email>
+        (url)
+
+    Args:
+        author_str (str): input author string
+
+    Returns:
+        dict: A dict that may contain the following keys:
+            * name
+            * email
+            * url
+
+    """
+    author = {}
+    matches = re.findall(_author_regexp,
+                         author_str.replace('<>', '').replace('()', ''),
+                         re.M)
+    for match in matches:
+        if match[0].strip():
+            author['name'] = match[0].strip()
+        if match[1].strip():
+            author['email'] = match[1].strip()
+        if match[2].strip():
+            author['url'] = match[2].strip()
+    return author
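A couple of worked examples for the parser above (outputs follow from the regex; the author strings are invented):

```lang=python
from swh.loader.npm.utils import parse_npm_package_author

print(parse_npm_package_author(
    'John Doe <john.doe@example.com> (https://example.com)'))
# {'name': 'John Doe', 'email': 'john.doe@example.com',
#  'url': 'https://example.com'}

print(parse_npm_package_author('John Doe'))
# {'name': 'John Doe'}
```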
+
+
+def extract_npm_package_author(package_json):
+    """
+    Extract the package author from a ``package.json`` file content and
+    return it in swh format.
+
+    Args:
+        package_json (dict): dict holding the content of a parsed
+            ``package.json`` file
+
+    Returns:
+        dict: A dict with the following keys:
+            * fullname
+            * name
+            * email
+
+    """
+    author_data = {}
+    if 'author' in package_json:
+        if isinstance(package_json['author'], str):
+            author_data = parse_npm_package_author(package_json['author'])
+        elif isinstance(package_json['author'], dict):
+            author_str = ''
+            if 'name' in package_json['author']:
+                author_str += package_json['author']['name']
+            if 'email' in package_json['author']:
+                author_str += ' <%s>' % package_json['author']['email']
+            author_data = parse_npm_package_author(author_str)
+    elif 'authors' in package_json and len(package_json['authors']) > 0:
+        author_data = parse_npm_package_author(package_json['authors'][0])
+
+    name = author_data.get('name')
+    email = author_data.get('email')
+
+    fullname = None
+
+    if name and email:
+        fullname = '%s <%s>' % (name, email)
+    elif name:
+        fullname = name
+
+    if not fullname:
+        return _EMPTY_AUTHOR
+
+    fullname = fullname.encode('utf-8')
+
+    if name:
+        name = name.encode('utf-8')
+
+    if email:
+        email = email.encode('utf-8')
+
+    return {'fullname': fullname, 'name': name, 'email': email}
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist=check-manifest,flake8,py3
+envlist=flake8,py3
 
 [testenv:py3]
 deps =
@@ -14,10 +14,3 @@
 flake8
 commands =
   {envpython} -m flake8
-
-[testenv:check-manifest]
-skip_install = true
-deps =
-  check-manifest
-commands =
-  {envpython} -m check_manifest {toxinidir}
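Finally, to tie the pieces together: loading a scoped package end to end only needs `load()`'s defaults, since the origin and metadata URLs are derived from the package name (a minimal sketch; the package name is an arbitrary example):

```lang=python
import logging

from swh.loader.npm.loader import NpmLoader

logging.basicConfig(level=logging.INFO)

# Scoped package: load() builds the origin and metadata URLs itself,
# percent-encoding the name for the replicate.npmjs.com endpoint.
NpmLoader().load('@babel/core')
```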