class NpmClient:
    """
    Helper class internally used by the npm loader to fetch metadata for
    a specific package hosted on the npm registry.

    Args:
        temp_dir (str): Path to the temporary disk location used
            to uncompress the package tarballs
        log: logger-like object used to report progress, the ``logging``
            module is used when none is provided
    """
    def __init__(self, temp_dir, log=None):
        self.root_temp_dir = temp_dir
        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage npm loader'
            }
        }
        self.log = log or logging

    def fetch_package_metadata(self, package_metadata_url):
        """
        Fetch metadata for a given package and make it the focused one.
        This must be called prior to any other operations performed
        by the other methods below.

        Args:
            package_metadata_url: the package metadata url provided by the
                npm loader

        Raises:
            ValueError: if the registry does not answer with a 200 status
        """
        self.package_metadata_url = package_metadata_url
        self.package_metadata = self._request(
            self.package_metadata_url).json()
        self.package = self.package_metadata['name']
        self.temp_dir = os.path.join(self.root_temp_dir, self.package)

    def latest_package_version(self):
        """
        Return the last released version of the focused package.

        Returns:
            str: the last released package version, or an empty string
            when the metadata do not declare a ``latest`` dist-tag
        """
        # some registry entries may lack the 'dist-tags' section altogether
        dist_tags = self.package_metadata.get('dist-tags', {})
        return dist_tags.get('latest', '')

    def package_versions(self, known_versions=None):
        """
        Return the available versions for the focused package.

        Args:
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.

        Returns:
            dict: A dict whose keys are Tuple[version, tarball_sha1] and
            values dicts with the following entries:

                * **name**: the package name
                * **version**: the package version
                * **filename**: the package source tarball filename
                * **sha1**: the package source tarball sha1 checksum
                * **date**: the package release date
                * **url**: the package source tarball download url
        """
        versions = {}
        if 'versions' in self.package_metadata:
            for version, data in self.package_metadata['versions'].items():
                sha1 = data['dist']['shasum']
                key = (version, sha1)
                if known_versions and key in known_versions:
                    continue
                tarball_url = data['dist']['tarball']
                filename = os.path.basename(tarball_url)
                date = self.package_metadata['time'][version]
                versions[key] = {
                    'name': self.package,
                    'version': version,
                    'filename': filename,
                    'sha1': sha1,
                    'date': date,
                    'url': tarball_url
                }
        return versions

    def prepare_package_versions(self, known_versions=None):
        """
        Instantiate a generator that will process a specific package released
        version at each iteration step. The following operations will be
        performed:

            1. Create a temporary directory to download and extract the
               release tarball
            2. Download the tarball
            3. Check downloaded tarball integrity
            4. Uncompress the tarball
            5. Parse ``package.json`` file associated to the package version
            6. Extract author from the parsed ``package.json`` file

        Args:
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.

        Yields:
            Tuple[dict, dict, dict, str]: tuples containing the following
            members:

                * a dict holding the parsed ``package.json`` file
                * a dict holding package author information
                * a dict holding package tarball information
                * a string holding the path of the uncompressed package to
                  load into the archive

            All four members are ``None`` when the release tarball could
            not be uncompressed.
        """
        new_versions = self.package_versions(known_versions)
        for version, package_source_data in sorted(new_versions.items()):
            # filter out version with missing tarball (cases exist),
            # package visit will be marked as partial at the end of
            # the loading process
            tarball_url = package_source_data['url']
            tarball_request = self._request(tarball_url, throw_error=False)
            status_code = tarball_request.status_code
            # the probe response is streamed and its body is never read:
            # close it explicitly so the connection goes back to the pool
            tarball_request.close()
            if status_code == 404:
                self.log.debug('Tarball url %s returns a 404 error.',
                               tarball_url)
                self.log.debug(('Version %s of %s package will be missing and '
                                'the visit will be marked as partial.'),
                               version[0], self.package)
                continue
            version_data = self.package_metadata['versions'][version[0]]
            yield self._prepare_package_version(package_source_data,
                                                version_data)

    def _prepare_package_version(self, package_source_data, version_data):
        """
        Download, check and uncompress the tarball of one package version.

        Args:
            package_source_data (dict): tarball information as returned by
                :meth:`package_versions`, updated in place with the computed
                tarball checksums once the tarball has been uncompressed
            version_data (dict): registry metadata of the processed version

        Returns:
            Tuple[dict, dict, dict, str]: the parsed ``package.json``, the
            author information, the tarball information and the path of
            the uncompressed package, or ``(None, None, None, None)`` when
            the tarball could not be uncompressed

        Raises:
            ValueError: if the tarball download fails or if its sha1
                checksum does not match the one from the registry
        """
        version = version_data['version']
        self.log.debug('Processing version %s for npm package %s',
                       version, self.package)

        # create temp dir to download and extract package tarball
        path = os.path.join(self.temp_dir, version)
        os.makedirs(path, exist_ok=True)
        filepath = os.path.join(path, package_source_data['filename'])

        # download tarball
        url = package_source_data['url']
        response = self._request(url)
        hash_names = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'}
        h = hashutil.MultiHash(hash_names=hash_names)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=None):
                h.update(chunk)
                f.write(chunk)

        # check tarball integrity
        hashes = h.hexdigest()
        expected_digest = package_source_data['sha1']
        actual_digest = hashes['sha1']
        if actual_digest != expected_digest:
            raise ValueError(
                '%s %s: Checksum mismatched: %s != %s' % (
                    self.package, version, expected_digest, actual_digest))

        # uncompress tarball
        tarball_invalid = False
        try:
            tarball.uncompress(filepath, path)
        except Exception as exc:
            # best effort: a corrupted tarball must not abort the whole
            # visit, but the failure is reported instead of being hidden
            tarball_invalid = True
            self.log.warning(
                'Unable to uncompress tarball of version %s of %s '
                'package: %s', version, self.package, exc)

        # remove tarball
        os.remove(filepath)

        if tarball_invalid:
            return (None, None, None, None)

        # do not archive useless tarball root directory
        package_path = os.path.join(path, 'package')
        # some old packages use a root directory with a different name
        if not os.path.exists(package_path):
            for _, dirnames, _ in os.walk(path):
                if dirnames:
                    package_path = os.path.join(path, dirnames[0])
                    break

        self.log.debug('Package local path: %s', package_path)

        package_source_data.update(hashes)

        # parse package.json file to add its content to revision metadata
        package_json_path = os.path.join(package_path, 'package.json')
        package_json = {}
        with open(package_json_path, 'rb') as package_json_file:
            package_json_bytes = package_json_file.read()
            package_json = load_json(package_json_bytes)

        # extract author from package.json
        author = extract_npm_package_author(package_json)

        return (package_json, author, package_source_data, package_path)

    def _request(self, url, throw_error=True):
        """
        Perform a streamed GET request with the client session.

        Args:
            url (str): the url to query
            throw_error (bool): raise when the status code is not 200

        Returns:
            the response object returned by the session

        Raises:
            ValueError: if ``throw_error`` is set and the status code
                is not 200
        """
        response = self.session.get(url, **self.params, stream=True)
        if response.status_code != 200 and throw_error:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response
TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.npm.'


class NpmLoader(BufferedLoader):
    """
    Loader for ingesting source packages from the npm registry
    into the Software Heritage archive.
    """
    CONFIG_BASE_FILENAME = 'loader/npm'
    ADDITIONAL_CONFIG = {
        'temp_directory': ('str', '/tmp/swh.loader.npm/'),
        'debug': ('bool', False)
    }

    def __init__(self):
        super().__init__(logging_class='swh.loader.npm.NpmLoader')
        self.origin_id = None
        temp_directory = self.config['temp_directory']
        os.makedirs(temp_directory, exist_ok=True)
        # each loader instance works in its own sub-directory
        self.temp_directory = mkdtemp(suffix='-%s' % os.getpid(),
                                      prefix=TEMPORARY_DIR_PREFIX_PATTERN,
                                      dir=temp_directory)
        self.debug = self.config['debug']
        self.done = False
        self.npm_client = NpmClient(self.temp_directory, self.log)

    def pre_cleanup(self):
        """
        To prevent disk explosion if some other workers exploded
        in mid-air (OOM killed), we try and clean up dangling files.
        """
        if self.debug:
            self.log.warning('DEBUG: will not pre-clean up temp dir %s',
                             self.temp_directory)
            return
        clean_dangling_folders(self.config['temp_directory'],
                               pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
                               log=self.log)

    def cleanup(self):
        """
        Clean up temporary disk use after downloading and extracting
        npm source package tarballs.
        """
        if self.debug:
            self.log.warning('DEBUG: will not clean up temp dir %s',
                             self.temp_directory)
            return
        if os.path.exists(self.temp_directory):
            self.log.debug('Clean up %s', self.temp_directory)
            shutil.rmtree(self.temp_directory)

    def load(self, package_name, package_url=None,
             package_metadata_url=None):
        """
        Loader entrypoint to ingest source tarballs for a npm package.

        Args:
            package_name (str): the name of the npm package
            package_url (str): the url of the package description,
                if not provided the following one will be used:
                https://www.npmjs.com/package/<package_name>
            package_metadata_url (str): the url for the package JSON metadata,
                if not provided the following one will be used:
                https://replicate.npmjs.com/<package_name>/
        """
        if package_url is None:
            package_url = 'https://www.npmjs.com/package/%s' % package_name
        if package_metadata_url is None:
            # the name is fully quoted (no safe character) before being
            # inserted in the metadata url
            package_metadata_url = ('https://replicate.npmjs.com/%s/' %
                                    quote(package_name, safe=''))
        return super().load(package_name, package_url, package_metadata_url)

    def prepare_origin_visit(self, package_name, package_url,
                             package_metadata_url):
        """
        Prepare npm package visit.

        Args:
            package_name (str): the name of the npm package
            package_url (str): the url of the package description
            package_metadata_url (str): the url for the package JSON metadata
        """
        # reset statuses
        self._load_status = 'uneventful'
        self._visit_status = 'full'
        self.done = False
        # fetch the npm package metadata from the registry
        self.npm_client.fetch_package_metadata(package_metadata_url)
        self.origin = {
            'url': package_url,
            'type': 'npm',
        }
        self.visit_date = None  # loader core will populate it

    def _known_versions(self, last_snapshot):
        """
        Retrieve the known release versions for the npm package
        (i.e. those already ingested into the archive).

        Args:
            last_snapshot (dict): Last snapshot for the visit

        Returns:
            dict: Dict whose keys are Tuple[version, sha1] and values
            are revision ids.
        """
        if not last_snapshot or 'branches' not in last_snapshot:
            return {}

        # only branches targeting a revision carry package information
        revs = [rev['target']
                for rev in last_snapshot['branches'].values()
                if rev and rev['target_type'] == 'revision']
        known_revisions = self.storage.revision_get(revs)
        ret = {}
        for revision in known_revisions:
            if not revision:
                continue
            if 'package_source' in revision['metadata']:
                package = revision['metadata']['package_source']
                ret[(package['version'], package['sha1'])] = revision['id']
        return ret

    def last_snapshot(self):
        """
        Retrieve the last snapshot of the npm package if any.
        """
        snapshot = self.storage.snapshot_get_latest(self.origin_id)
        # a 'next_branch' entry means the branches list is partial
        if snapshot and snapshot.pop('next_branch', None):
            snapshot = snapshot_get_all_branches(self.storage, snapshot['id'])
        return snapshot

    def prepare(self, package_name, package_url, package_metadata_url):
        """
        Prepare effective loading of source tarballs for a npm
        package.

        Args:
            package_name (str): the name of the npm package
            package_url (str): the url of the package description
            package_metadata_url (str): the url for the package JSON metadata
        """
        self.package_name = package_name
        self.origin_url = package_url
        self.package_contents = []
        self.package_directories = []
        self.package_revisions = []
        self.package_load_status = 'uneventful'
        self.package_visit_status = 'full'

        last_snapshot = self.last_snapshot()
        self.known_versions = self._known_versions(last_snapshot)

        self.new_versions = \
            self.npm_client.prepare_package_versions(self.known_versions)

    def fetch_data(self):
        """
        Called once per package release version to process.

        This will for each call:
            - download a tarball associated to a package release version
            - uncompress it and compute the necessary information
            - compute the swh objects

        Returns:
            True as long as data to fetch exist
        """
        data = None
        if self.done:
            return False

        try:
            data = next(self.new_versions)
            self.package_load_status = 'eventful'
        except StopIteration:
            self.done = True
            return False

        package_metadata, author, package_source_data, dir_path = data

        # package release tarball was corrupted
        if package_metadata is None:
            # nothing was extracted for that version: do not let store_data
            # submit the objects of the previously processed version again
            self.package_contents = []
            self.package_directories = []
            return not self.done

        dir_path = dir_path.encode('utf-8')
        directory = Directory.from_disk(path=dir_path, data=True)
        objects = directory.collect()

        self.package_contents = objects['content'].values()
        self.package_directories = objects['directory'].values()

        date = date_parser.parse(package_source_data['date'])

        date = normalize_timestamp(int(date.timestamp()))

        message = package_source_data['version'].encode('ascii')

        revision = {
            'synthetic': True,
            'metadata': {
                'package_source': package_source_data,
                'package': package_metadata,
            },
            'author': author,
            'date': date,
            'committer': author,
            'committer_date': date,
            'message': message,
            'directory': directory.hash,
            'parents': [],
            'type': 'tar',
        }
        revision['id'] = identifier_to_bytes(revision_identifier(revision))

        self.package_revisions.append(revision)

        # remember the freshly computed revision so that the snapshot
        # generation can target it
        package_key = (package_source_data['version'],
                       package_source_data['sha1'])
        self.known_versions[package_key] = revision['id']

        self.log.debug('Removing unpacked package files at %s', dir_path)
        shutil.rmtree(dir_path)

        return not self.done

    def _target_from_version(self, version, sha1):
        """
        Return revision information if any for a given package version.
        """
        target = self.known_versions.get((version, sha1))
        return {
            'target': target,
            'target_type': 'revision',
        } if target else None

    def _generate_and_load_snapshot(self):
        """
        Generate snapshot for the npm package visit.

        Each package version gets a ``releases/<version>`` branch, a version
        without known revision gets a ``None`` target and marks the visit
        as partial.
        """
        branches = {}
        latest_version = self.npm_client.latest_package_version()
        for version_data in self.npm_client.package_versions().values():
            version = version_data['version']
            sha1 = version_data['sha1']
            branch_name = ('releases/%s' % version).encode('ascii')
            target = self._target_from_version(version, sha1)
            branches[branch_name] = target
            if version == latest_version:
                branches[b'HEAD'] = {
                    'target_type': 'alias',
                    'target': branch_name,
                }
            if not target:
                self.package_visit_status = 'partial'
        snapshot = {
            'branches': branches,
        }
        snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot))

        self.maybe_load_snapshot(snapshot)

    def store_data(self):
        """
        Send collected objects to storage.
        """
        self.maybe_load_contents(self.package_contents)
        self.maybe_load_directories(self.package_directories)
        self.maybe_load_revisions(self.package_revisions)

        # the snapshot is only generated once all versions were processed
        if self.done:
            self._generate_and_load_snapshot()
            self.flush()

    def load_status(self):
        """
        Return the load status computed during the visit.
        """
        return {
            'status': self.package_load_status,
        }

    def visit_status(self):
        """
        Return the visit status computed during the visit.
        """
        return self.package_visit_status


if __name__ == '__main__':
    import logging
    import sys
    logging.basicConfig(level=logging.DEBUG)
    if len(sys.argv) != 2:
        logging.error('Usage: %s <package_name>', sys.argv[0])
        sys.exit(1)
    package_name = sys.argv[1]
    loader = NpmLoader()
    loader.load(package_name)
# configuration injected in the tested loader instead of a parsed config file
_LOADER_TESTS_CONFIG = {
    'content_packet_size': 10000,
    'content_packet_size_bytes': 104857600,
    'content_size_limit': 104857600,
    'debug': False,
    'directory_packet_size': 25000,
    'occurrence_packet_size': 100000,
    'release_packet_size': 100000,
    'revision_packet_size': 100000,
    'send_contents': True,
    'send_directories': True,
    'send_releases': True,
    'send_revisions': True,
    'send_snapshot': True,
    'storage': {'args': {}, 'cls': 'memory'},
    'temp_directory': '/tmp/swh.loader.pypi/'
}

_expected_new_contents_first_visit = [
    '4ce3058e16ab3d7e077f65aabf855c34895bf17c',
    '858c3ceee84c8311adc808f8cdb30d233ddc9d18',
    '0fa33b4f5a4e0496da6843a38ff1af8b61541996',
    '85a410f8ef8eb8920f2c384a9555566ad4a2e21b',
    '9163ac8025923d5a45aaac482262893955c9b37b',
    '692cf623b8dd2c5df2c2998fd95ae4ec99882fb4',
    '18c03aac6d3e910efb20039c15d70ab5e0297101',
    '41265c42446aac17ca769e67d1704f99e5a1394d',
    '783ff33f5882813dca9239452c4a7cadd4dba778',
    'b029cfb85107aee4590c2434a3329bfcf36f8fa1',
    '112d1900b4c2e3e9351050d1b542c9744f9793f3',
    '5439bbc4bd9a996f1a38244e6892b71850bc98fd',
    'd83097a2f994b503185adf4e719d154123150159',
    'd0939b4898e83090ee55fd9d8a60e312cfadfbaf',
    'b3523a26f7147e4af40d9d462adaae6d49eda13e',
    'cd065fb435d6fb204a8871bcd623d0d0e673088c',
    '2854a40855ad839a54f4b08f5cff0cf52fca4399',
    'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe',
    '0f73d56e1cf480bded8a1ecf20ec6fc53c574713',
    '0d9882b2dfafdce31f4e77fe307d41a44a74cefe',
    '585fc5caab9ead178a327d3660d35851db713df1',
    'e8cd41a48d79101977e3036a87aeb1aac730686f',
    '5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7',
    '9c3cc2763bf9e9e37067d3607302c4776502df98',
    '3649a68410e354c83cd4a38b66bd314de4c8f5c9',
    'e96ed0c091de1ebdf587104eaf63400d1974a1fe',
    '078ca03d2f99e4e6eab16f7b75fbb7afb699c86c',
    '38de737da99514de6559ff163c988198bc91367a',
]

_expected_new_contents_second_visit = [
    '135cb2000df4dfcfd8012d18ba23a54d6f89b105',
    '1e8e0943ee08958ab0a710dbba110f88068cab74',
    '25c8e3104daec559482ee1b480262be5da993e0e',
    '51245e983ebf91468fc59a072fcdddb837676abb',
    '55833e56224af0cf6fbbdca586c79d1e0e257b37',
    '785e0e16f2753b7683dd5f9e1bd1b98287334e6a',
    '876d655e927a95c7511853850c9c078be5f1a44b',
    'a2b331450408a22d3026c0444990b3235017c7e1',
    'a3f4f4d2055b21445defff5dada6cddb7c815f15',
    'b3aeed7cf5be703bd8a029928b431eecf5d205af',
    'b93d5e2006138f03e8ae84d0b72350fe6c37753a',
    'd196b2fa26032df86c8470e9f47a45cdeb5e23a2',
    'e3bae46f8f4f0347dab7ad567bf2f64bff3c1c53',
    'f2746efa0b38dcd3bbe7591cc075ee4a618c5943'
]

_expected_new_directories_first_visit = [
    '80579be563e2ef3e385226fe7a3f079b377f142c',
    '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c',
    'bcad03ce58ac136f26f000990fc9064e559fe1c0',
    '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca',
    'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd',
    '584b5b4b6cf7f038095e820b99386a9c232de931',
    '184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a',
    'bb5f4ee143c970367eb409f2e4c1104898048b9d',
    '1b95491047add1103db0dfdfa84a9735dcb11e88',
    'a00c6de13471a2d66e64aca140ddb21ef5521e62',
    '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2',
    'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2',
    '202fafcd7c0f8230e89d5496ad7f44ab12b807bf',
    '775cc516543be86c15c1dc172f49c0d4e6e78235',
    'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e',
]

_expected_new_directories_second_visit = [
    '025bca14fcc9f84b7ebb09df4ec1b3fadd89a74c',
    '14f88da1a1efe2efe1bde2da9245ea1346ed49a0',
    '513965efeb9dc5832a8c69f354e57c0e1df4cb31',
    '5281878409fa2ab0d35feeef2fe6463346f4418d',
    '60b7c18bc5922a81060425edd7a623a4759ba657',
    '8c81ff424af1c26ff913e16d340f06ea7da0171c',
    '8c96171056490917a3b953c2a70cecace44f3606',
    '8faa8fbcbba90c36ab7dd076fd8fda5a9c405f8a',
    'b1224309f00536ea6f421af9f690bffab7bdb735',
    'c2f820f60db474714853c59765b0f914feb0fcfe',
    'e267845618e77ae0db8ca05428c0ee421df06a11',
    'e5a783a68869f7bc2fb9126b9100d98f18ea747c'
]

# revision id -> targeted directory id
_expected_new_revisions_first_visit = {
    '969e0340155266e2a5b851e428e602152c08385f':
        '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c',
    'c9b9ae8360ce8a1e22867226987f61163c12d4c4':
        '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca',
    '47831123f42cea24d6023e5570825cb62c3c1898':
        '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2',
}

_expected_new_revisions_second_visit = {
    'a4ffa8770a901c895a67bec7a501036e83aae256':
        '8faa8fbcbba90c36ab7dd076fd8fda5a9c405f8a',
    'e6e41e3deb8df2b183f2d45e8f2e49a991c069a9':
        'b1224309f00536ea6f421af9f690bffab7bdb735',
    'ca12202b8a0eee7364204687649146e73e19ed32':
        '025bca14fcc9f84b7ebb09df4ec1b3fadd89a74c'
}

_expected_new_snapshot_first_visit = 'f2f59503de5a8aeabe7b68ce761d9e112713d996'

_expected_branches_first_visit = {
    'HEAD': {
        'target': 'releases/0.0.4',
        'target_type': 'alias'
    },
    'releases/0.0.2': {
        'target': '969e0340155266e2a5b851e428e602152c08385f',
        'target_type': 'revision'
    },
    'releases/0.0.3': {
        'target': 'c9b9ae8360ce8a1e22867226987f61163c12d4c4',
        'target_type': 'revision'
    },
    'releases/0.0.4': {
        'target': '47831123f42cea24d6023e5570825cb62c3c1898',
        'target_type': 'revision'
    }
}

_expected_new_snapshot_second_visit = '57957179a0ea016fcf9d02874b68547f2bd5698d'  # noqa

_expected_branches_second_visit = {
    'HEAD': {
        'target': 'releases/0.2.0',
        'target_type': 'alias'
    },
    'releases/0.0.2': {
        'target': '969e0340155266e2a5b851e428e602152c08385f',
        'target_type': 'revision'
    },
    'releases/0.0.3': {
        'target': 'c9b9ae8360ce8a1e22867226987f61163c12d4c4',
        'target_type': 'revision'
    },
    'releases/0.0.4': {
        'target': '47831123f42cea24d6023e5570825cb62c3c1898',
        'target_type': 'revision'
    },
    'releases/0.0.5': {
        'target': 'a4ffa8770a901c895a67bec7a501036e83aae256',
        'target_type': 'revision'
    },
    'releases/0.1.0': {
        'target': 'e6e41e3deb8df2b183f2d45e8f2e49a991c069a9',
        'target_type': 'revision'
    },
    'releases/0.2.0': {
        'target': 'ca12202b8a0eee7364204687649146e73e19ed32',
        'target_type': 'revision'
    }
}


class NpmLoaderTest(NpmLoader):
    """npm loader whose configuration is the in-memory test one."""

    def parse_config_file(self, *args, **kwargs):
        return _LOADER_TESTS_CONFIG


@requests_mock.Mocker()
class TestNpmLoader(unittest.TestCase, BaseLoaderStorageTest):
    """
    Visits scenarios for the npm loader. Test methods are numbered as
    they share the loader (and its storage) set at class level unless
    they explicitly reset it.
    """

    @classmethod
    def setUpClass(cls):
        cls.reset_loader()

    @classmethod
    def reset_loader(cls):
        """Instantiate a fresh loader and expose its storage."""
        cls.loader = NpmLoaderTest()
        cls.storage = cls.loader.storage

    def reset_loader_counters(self):
        """Set all the loader counters back to zero."""
        counters_reset = dict.fromkeys(self.loader.counters.keys(), 0)
        self.loader.counters.update(counters_reset)

    def test_npm_loader_1_empty_package(self, m):

        init_test_data(m, package_metadata_file(empty_package),
                       package_metadata_url(empty_package))
        self.loader.load(empty_package, package_url(empty_package),
                         package_metadata_url(empty_package))

        self.assertCountContents(0)
        self.assertCountDirectories(0)
        self.assertCountRevisions(0)
        self.assertCountReleases(0)
        self.assertCountSnapshots(1)
        expected_branches = {}
        self.assertSnapshotEqual(
            snapshot_identifier({'branches': expected_branches}),
            expected_branches
        )

        self.assertEqual(self.loader.load_status(), {'status': 'uneventful'})
        self.assertEqual(self.loader.visit_status(), 'full')
        self.assertFalse(os.path.exists(self.loader.temp_directory))

    def test_npm_loader_2_first_visit(self, m):

        self.reset_loader()
        init_test_data(m, package_metadata_file(package, visit=1),
                       package_metadata_url(package))
        self.loader.load(package, package_url(package),
                         package_metadata_url(package))

        self.assertCountContents(len(_expected_new_contents_first_visit))
        self.assertCountDirectories(len(_expected_new_directories_first_visit))
        self.assertCountRevisions(3, '3 releases so 3 revisions should be created')  # noqa
        self.assertCountReleases(0, 'No release is created by the npm loader')
        self.assertCountSnapshots(1, 'Only 1 snapshot targeting all revisions')

        self.assertContentsContain(_expected_new_contents_first_visit)
        self.assertDirectoriesContain(_expected_new_directories_first_visit)
        self.assertRevisionsContain(_expected_new_revisions_first_visit)
        self.assertSnapshotEqual(_expected_new_snapshot_first_visit,
                                 _expected_branches_first_visit)

        self.assertEqual(self.loader.counters['contents'],
                         len(_expected_new_contents_first_visit))
        self.assertEqual(self.loader.counters['directories'],
                         len(_expected_new_directories_first_visit))
        self.assertEqual(self.loader.counters['revisions'],
                         len(_expected_new_revisions_first_visit))
        self.assertEqual(self.loader.counters['releases'], 0)

        self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
        self.assertEqual(self.loader.visit_status(), 'full')
        self.assertFalse(os.path.exists(self.loader.temp_directory))

    def test_npm_loader_3_first_visit_again(self, m):

        self.reset_loader_counters()
        init_test_data(m, package_metadata_file(package, visit=1),
                       package_metadata_url(package))
        self.loader.load(package, package_url(package),
                         package_metadata_url(package))

        # previously loaded objects should still be here
        self.assertCountContents(len(_expected_new_contents_first_visit))
        self.assertCountDirectories(len(_expected_new_directories_first_visit))
        self.assertCountRevisions(len(_expected_new_revisions_first_visit))
        self.assertCountReleases(0)
        self.assertCountSnapshots(1)
        self.assertSnapshotEqual(_expected_new_snapshot_first_visit,
                                 _expected_branches_first_visit)

        # no objects should have been loaded in that visit
        counters_reset = dict.fromkeys(self.loader.counters.keys(), 0)
        self.assertEqual(self.loader.counters, counters_reset)

        self.assertEqual(self.loader.load_status(), {'status': 'uneventful'})
        self.assertEqual(self.loader.visit_status(), 'full')
        self.assertFalse(os.path.exists(self.loader.temp_directory))

    def test_npm_loader_4_second_visit(self, m):

        self.reset_loader_counters()
        init_test_data(m, package_metadata_file(package, visit=2),
                       package_metadata_url(package))
        self.loader.load(package, package_url(package),
                         package_metadata_url(package))

        expected_nb_contents = sum([len(_expected_new_contents_first_visit),
                                    len(_expected_new_contents_second_visit)])

        expected_nb_directories = sum([len(_expected_new_directories_first_visit),  # noqa
                                       len(_expected_new_directories_second_visit)])  # noqa

        expected_nb_revisions = sum([len(_expected_new_revisions_first_visit),
                                     len(_expected_new_revisions_second_visit)])  # noqa

        self.assertCountContents(expected_nb_contents)
        self.assertCountDirectories(expected_nb_directories)
        self.assertCountRevisions(expected_nb_revisions)
        self.assertCountReleases(0)
        self.assertCountSnapshots(2)

        self.assertContentsContain(_expected_new_contents_first_visit)
        self.assertContentsContain(_expected_new_contents_second_visit)
        self.assertDirectoriesContain(_expected_new_directories_first_visit)
        self.assertDirectoriesContain(_expected_new_directories_second_visit)
        self.assertRevisionsContain(_expected_new_revisions_first_visit)
        self.assertRevisionsContain(_expected_new_revisions_second_visit)

        self.assertSnapshotEqual(_expected_new_snapshot_first_visit,
                                 _expected_branches_first_visit)
        self.assertSnapshotEqual(_expected_new_snapshot_second_visit,
                                 _expected_branches_second_visit)

        self.assertEqual(self.loader.counters['contents'],
                         len(_expected_new_contents_second_visit))
        self.assertEqual(self.loader.counters['directories'],
                         len(_expected_new_directories_second_visit))
        self.assertEqual(self.loader.counters['revisions'],
                         len(_expected_new_revisions_second_visit))
        self.assertEqual(self.loader.counters['releases'], 0)

        self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
        self.assertEqual(self.loader.visit_status(), 'full')
        self.assertFalse(os.path.exists(self.loader.temp_directory))

    def test_npm_loader_5_package_json_non_unicode_encoding(self, m):

        init_test_data(m, package_metadata_file(package_non_utf8_encoding),
                       package_metadata_url(package_non_utf8_encoding))
        self.loader.load(package_non_utf8_encoding,
                         package_url(package_non_utf8_encoding),
                         package_metadata_url(package_non_utf8_encoding))

        self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
        self.assertEqual(self.loader.visit_status(), 'full')
        self.assertFalse(os.path.exists(self.loader.temp_directory))

    @patch('swh.loader.npm.client.tarball')
    def test_npm_loader_6_invalid_tarball(self, m, mock_tarball):

        def _tarball_uncompress(filepath, path):
            # only the 0.0.3 release tarball is reported as corrupted,
            # the other ones are really uncompressed
            if filepath.endswith('0.0.3.tgz'):
                raise Exception('Invalid tarball !')
            else:
                tarball.uncompress(filepath, path)

        mock_tarball.uncompress.side_effect = _tarball_uncompress

        self.reset_loader()
        init_test_data(m, package_metadata_file(package, visit=1),
                       package_metadata_url(package))
        self.loader.load(package, package_url(package),
                         package_metadata_url(package))

        snapshot = self.loader.last_snapshot()
        # guard against a vacuous pass of the loop below
        self.assertIn(b'releases/0.0.3', snapshot['branches'])
        for branch, target in snapshot['branches'].items():
            if branch == b'releases/0.0.3':
                self.assertIsNone(target)
            else:
                self.assertIsNotNone(target)

        self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
        self.assertEqual(self.loader.visit_status(), 'partial')