# Copyright (C) 2019-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import logging
import tempfile
import os

from typing import (
    Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
)

import attr
import sentry_sdk

from swh.core.tarball import uncompress
from swh.core.config import SWHConfig
from swh.model import from_disk
from swh.model.hashutil import hash_to_hex
from swh.model.model import (
    BaseModel, Sha1Git,
    Content, SkippedContent, Directory,
    Revision,
    TargetType, Snapshot,
    Origin
)
from swh.storage import get_storage
from swh.storage.algos.snapshot import snapshot_get_all_branches

from swh.loader.package.utils import download


logger = logging.getLogger(__name__)


class PackageLoader:
    """Base class of the package loaders.

    It drives a visit of one origin (see :meth:`load`) and delegates the
    package-manager specific parts to hooks that subclasses override:
    :meth:`get_versions`, :meth:`get_package_info`, :meth:`build_revision`,
    :meth:`get_default_version`, :meth:`resolve_revision_from`,
    :meth:`download_package`, :meth:`uncompress` and :meth:`extra_branches`.

    """
    # Origin visit type (str) set by the loader
    visit_type = ''

    def __init__(self, url):
        """Loader's constructor. This raises exception if the minimal required
           configuration is missing (cf. :meth:`_check_configuration`).

        Args:
            url (str): Origin url to load data from

        Raises:
            ValueError: when the 'storage' configuration key is missing

        """
        # This expects to use the environment variable SWH_CONFIG_FILENAME
        self.config = SWHConfig.parse_config_file()
        self._check_configuration()
        self.storage = get_storage(**self.config['storage'])
        self.url = url
        # A single timezone-aware date is used for the whole visit
        self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
        # Not covered by _check_configuration: the key must be present
        self.max_content_size = self.config['max_content_size']

    def _check_configuration(self):
        """Checks the minimal configuration required is set for the loader.

        If some required configuration is missing, exception detailing the
        issue is raised.

        Raises:
            ValueError: when the 'storage' key is not in the configuration

        """
        if 'storage' not in self.config:
            raise ValueError(
                'Misconfiguration, at least the storage key should be set')

    def get_versions(self) -> Sequence[str]:
        """Return the list of all published package versions.

        The default implementation knows no version.

        Returns:
            Sequence of published versions

        """
        return []

    def get_package_info(self, version: str) -> Generator[
            Tuple[str, Mapping[str, Any]], None, None]:
        """Given a release version of a package, retrieve the associated
           package information for such version.

        The default implementation yields nothing. :meth:`load` reads the
        'raw' key of each yielded package information, and the default
        :meth:`download_package` reads its 'url' (and optional 'filename').

        Args:
            version: Package version

        Returns:
            (branch name, package metadata)

        """
        yield from {}

    def build_revision(
            self, a_metadata: Dict, uncompressed_path: str,
            directory: Sha1Git) -> Optional[Revision]:
        """Build the revision from the archive metadata (extrinsic
        artifact metadata) and the intrinsic metadata.

        Args:
            a_metadata: Artifact metadata
            uncompressed_path: Artifact uncompressed path on disk
            directory: Identifier of the root directory loaded from
                uncompressed_path

        Returns:
            The revision to add to the storage, or None when the artifact
            must be skipped

        Raises:
            NotImplementedError: always, subclasses must override this

        """
        raise NotImplementedError('build_revision')

    def get_default_version(self) -> str:
        """Retrieve the latest release version if any.

        Returns:
            Latest version

        """
        return ''

    def last_snapshot(self) -> Optional[Snapshot]:
        """Retrieve the last snapshot

        Returns:
            The snapshot of the latest visit of the origin which has one,
            None otherwise

        """
        snapshot = None
        visit = self.storage.origin_visit_get_latest(
            self.url, require_snapshot=True)
        if visit and visit.get('snapshot'):
            snapshot = Snapshot.from_dict(snapshot_get_all_branches(
                self.storage, visit['snapshot']))
        return snapshot

    def known_artifacts(
            self, snapshot: Optional[Snapshot]) -> Dict[Sha1Git, BaseModel]:
        """Retrieve the known releases/artifact for the origin.

        Args
            snapshot: snapshot for the visit

        Returns:
            Dict of keys revision id (bytes), values a metadata Dict.

        """
        # NOTE(review): values are the revisions' 'metadata' entries, not
        # BaseModel instances as the return annotation says -- confirm which
        # one subclasses rely on before changing the annotation.
        if not snapshot:
            return {}

        # retrieve only revisions (e.g the alias we do not want here)
        revs = [rev.target
                for rev in snapshot.branches.values()
                if rev and rev.target_type == TargetType.REVISION]
        known_revisions = self.storage.revision_get(revs)

        ret = {}
        for revision in known_revisions:
            if not revision:  # revision_get can return None
                continue
            ret[revision['id']] = revision['metadata']

        return ret

    def resolve_revision_from(
            self, known_artifacts: Dict, artifact_metadata: Dict) \
            -> Optional[bytes]:
        """Resolve the revision from the known artifacts and an artifact
           metadata dict.

        If the artifact has already been downloaded, this will return the
        existing revision targeting that uncompressed artifact directory.
        Otherwise, this returns None.

        The default implementation never resolves anything.

        Args:
            known_artifacts: Known artifacts, as returned by
                :meth:`known_artifacts`
            artifact_metadata: Information dict

        Returns:
            None or revision identifier

        """
        return None

    def download_package(self, p_info: Mapping[str, Any],
                         tmpdir: str) -> List[Tuple[str, Mapping]]:
        """Download artifacts for a specific package. All downloads happen in
        the tmpdir folder.

        Default implementation expects the artifacts package info to be
        about one artifact per package.

        Note that most implementation have 1 artifact per package. But some
        implementation have multiple artifacts per package (debian), some have
        none, the package is the artifact (gnu).

        Args:
            p_info: Information on the package artifacts to download
                ('url' is required, 'filename' is optional)
            tmpdir: Location to retrieve such artifacts

        Returns:
            List of (path, computed hashes)

        """
        a_uri = p_info['url']
        filename = p_info.get('filename')
        return [download(a_uri, dest=tmpdir, filename=filename)]

    def uncompress(self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]],
                   dest: str) -> str:
        """Uncompress the artifact(s) in the destination folder dest.

        Optionally, this could need to use the p_info dict for some more
        information (debian).

        Args:
            dl_artifacts: (path, computed hashes) of the downloaded artifacts
            dest: Folder in which the 'src' extraction folder is created

        Returns:
            Path of the folder all artifacts got uncompressed into

        """
        uncompressed_path = os.path.join(dest, 'src')
        for a_path, _ in dl_artifacts:
            # this is the module-level swh.core.tarball.uncompress
            uncompress(a_path, dest=uncompressed_path)
        return uncompressed_path

    def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]:
        """Return an extra dict of branches that are used to update the set of
        branches.

        """
        return {}

    def load(self) -> Dict:
        """Load for a specific origin the associated contents.

        for each package version of the origin

        1. Fetch the files for one package version By default, this can be
           implemented as a simple HTTP request. Loaders with more specific
           requirements can override this, e.g.: the PyPI loader checks the
           integrity of the downloaded files; the Debian loader has to download
           and check several files for one package version.

        2. Extract the downloaded files By default, this would be a universal
           archive/tarball extraction.

           Loaders for specific formats can override this method (for instance,
           the Debian loader uses dpkg-source -x).

        3. Convert the extracted directory to a set of Software Heritage
           objects Using swh.model.from_disk.

        4. Extract the metadata from the unpacked directories This would only
           be applicable for "smart" loaders like npm (parsing the
           package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing
           debian/changelog and debian/control).

           On "minimal-metadata" sources such as the GNU archive, the lister
           should provide the minimal set of metadata needed to populate the
           revision/release objects (authors, dates) as an argument to the
           task.

        5. Generate the revision/release objects for the given version. From
           the data generated at steps 3 and 4.

        end for each

        6. Generate and load the snapshot for the visit

        Using the revisions/releases collected at step 5., and the branch
        information from step 0., generate a snapshot and load it into the
        Software Heritage archive

        Returns:
            Dict with a 'status' key ('eventful', 'uneventful' or 'failed')
            and, when a snapshot got loaded, a 'snapshot_id' key holding its
            hexadecimal identifier

        """
        status_load = 'uneventful'  # either: eventful, uneventful, failed
        status_visit = 'full'  # either: partial, full
        tmp_revisions = {}  # type: Dict[str, List]
        snapshot = None

        def finalize_visit() -> Dict[str, Any]:
            """Finalize the visit:

            - flush eventual unflushed data to storage
            - update origin visit's status
            - return the task's status

            """
            # status_load, status_visit, snapshot and visit are read from
            # the enclosing scope at call time
            if hasattr(self.storage, 'flush'):
                self.storage.flush()

            self.storage.origin_visit_update(
                origin=self.url, visit_id=visit.visit, status=status_visit,
                snapshot=snapshot and snapshot.id)

            result: Dict[str, Any] = {
                'status': status_load,
            }
            if snapshot:
                result['snapshot_id'] = hash_to_hex(snapshot.id)
            return result

        # Prepare origin and origin_visit
        origin = Origin(url=self.url)
        try:
            self.storage.origin_add_one(origin)
            visit = self.storage.origin_visit_add(
                self.url, date=self.visit_date, type=self.visit_type)
        except Exception as e:
            logger.exception('Failed to initialize origin_visit for %s',
                             self.url)
            sentry_sdk.capture_exception(e)
            # No visit exists, so there is nothing to finalize
            return {'status': 'failed'}

        try:
            last_snapshot = self.last_snapshot()
            logger.debug('last snapshot: %s', last_snapshot)
            known_artifacts = self.known_artifacts(last_snapshot)
            logger.debug('known artifacts: %s', known_artifacts)
        except Exception as e:
            logger.exception('Failed to get previous state for %s', self.url)
            sentry_sdk.capture_exception(e)
            status_visit = 'partial'
            status_load = 'failed'
            return finalize_visit()

        load_exceptions = []

        for version in self.get_versions():  # for each
            logger.debug('version: %s', version)
            tmp_revisions[version] = []
            # `p_` stands for `package_`
            for branch_name, p_info in self.get_package_info(version):
                logger.debug('package_info: %s', p_info)
                revision_id = self.resolve_revision_from(
                    known_artifacts, p_info['raw'])
                if revision_id is None:
                    try:
                        revision_id = self._load_revision(p_info, origin)
                        status_load = 'eventful'
                    except Exception as e:
                        # One failing artifact must not abort the others
                        load_exceptions.append(e)
                        sentry_sdk.capture_exception(e)
                        logger.exception('Failed loading branch %s for %s',
                                         branch_name, self.url)
                        continue

                if revision_id is None:
                    # The artifact got skipped by _load_revision
                    continue

                tmp_revisions[version].append((branch_name, revision_id))

        if load_exceptions:
            status_visit = 'partial'

        if not tmp_revisions:
            # No version was listed at all for this origin (every listed
            # version gets an entry above, even when none of its artifacts
            # could be loaded): there is nothing to snapshot
            status_visit = 'partial'
            status_load = 'failed'
            return finalize_visit()

        try:
            # Retrieve the default release version (the "latest" one)
            default_version = self.get_default_version()
            logger.debug('default version: %s', default_version)
            # Retrieve extra branches
            extra_branches = self.extra_branches()
            logger.debug('extra branches: %s', extra_branches)

            snapshot = self._load_snapshot(default_version, tmp_revisions,
                                           extra_branches)

        except Exception as e:
            logger.exception('Failed to build snapshot for origin %s',
                             self.url)
            sentry_sdk.capture_exception(e)
            status_visit = 'partial'
            status_load = 'failed'

        return finalize_visit()

    def _load_revision(self, p_info, origin) -> Optional[Sha1Git]:
        """Does all the loading of a revision itself:

        * downloads a package and uncompresses it
        * loads it from disk
        * adds contents, directories, and revision to self.storage

        Args:
            p_info: Package information; its 'raw' key is passed on to
                :meth:`build_revision`
            origin: Origin being loaded (not used here)

        Returns:
            The identifier of the added revision, or None when
            :meth:`build_revision` returned no revision

        Raises
            exception when unable to download or uncompress artifacts
            TypeError: when the loaded directory holds an unexpected object

        """
        with tempfile.TemporaryDirectory() as tmpdir:
            dl_artifacts = self.download_package(p_info, tmpdir)

            uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir)
            logger.debug('uncompressed_path: %s', uncompressed_path)

            directory = from_disk.Directory.from_disk(
                path=uncompressed_path.encode('utf-8'),
                max_content_length=self.max_content_size)

            contents: List[Content] = []
            skipped_contents: List[SkippedContent] = []
            directories: List[Directory] = []

            for obj in directory.iter_tree():
                obj = obj.to_model()
                if isinstance(obj, Content):
                    # FIXME: read the data from disk later (when the
                    # storage buffer is flushed).
                    obj = obj.with_data()
                    contents.append(obj)
                elif isinstance(obj, SkippedContent):
                    skipped_contents.append(obj)
                elif isinstance(obj, Directory):
                    directories.append(obj)
                else:
                    raise TypeError(
                        f'Unexpected content type from disk: {obj}')

            logger.debug('Number of skipped contents: %s',
                         len(skipped_contents))
            self.storage.skipped_content_add(skipped_contents)
            logger.debug('Number of contents: %s', len(contents))
            self.storage.content_add(contents)

            logger.debug('Number of directories: %s', len(directories))
            self.storage.directory_add(directories)

            # FIXME: This should be release. cf. D409
            revision = self.build_revision(
                p_info['raw'], uncompressed_path, directory=directory.hash)
            if not revision:
                # Some artifacts are missing intrinsic metadata
                # skipping those
                return None

        # Merge into a copy, so the dict owned by the revision returned by
        # build_revision is left untouched
        metadata = dict(revision.metadata or {})
        metadata['original_artifact'] = [
            hashes for _, hashes in dl_artifacts
        ]
        revision = attr.evolve(revision, metadata=metadata)

        logger.debug('Revision: %s', revision)

        self.storage.revision_add([revision])
        return revision.id

    def _load_snapshot(
            self, default_version: str,
            revisions: Dict[str, List[Tuple[str, bytes]]],
            extra_branches: Dict[bytes, Mapping[str, Any]]
    ) -> Optional[Snapshot]:
        """Build snapshot out of the current revisions stored and extra
           branches. Then load it in the storage.

        Args:
            default_version: Version whose single branch gets aliased by the
                'HEAD' branch
            revisions: For each version, its (branch name, revision id) list
            extra_branches: Branches added as is, unless their name is
                already taken

        Returns:
            The snapshot added to the storage

        """
        logger.debug('revisions: %s', revisions)
        # Build and load the snapshot
        branches = {}  # type: Dict[bytes, Mapping[str, Any]]
        for version, branch_name_revisions in revisions.items():
            if version == default_version and \
               len(branch_name_revisions) == 1:
                # only 1 branch (no ambiguity), we can create an alias
                # branch 'HEAD'
                branch_name, _ = branch_name_revisions[0]
                # except for some corner case (deposit)
                if branch_name != 'HEAD':
                    branches[b'HEAD'] = {
                        'target_type': 'alias',
                        'target': branch_name.encode('utf-8'),
                    }

            for branch_name, target in branch_name_revisions:
                branches[branch_name.encode('utf-8')] = {
                    'target_type': 'revision',
                    'target': target,
                }

        # Deal with extra-branches
        for name, branch_target in extra_branches.items():
            if name in branches:
                logger.error("Extra branch '%s' has been ignored",
                             name)
            else:
                branches[name] = branch_target

        snapshot_data = {
            'branches': branches
        }
        logger.debug('snapshot: %s', snapshot_data)
        snapshot = Snapshot.from_dict(snapshot_data)
        logger.debug('snapshot: %s', snapshot)
        self.storage.snapshot_add([snapshot])

        return snapshot
# Copyright (C) 2019-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json
import os

import pytest

from swh.model.hashutil import hash_to_bytes
from swh.model.model import Person

from swh.loader.package.npm.loader import (
    NpmLoader, extract_npm_package_author,
    artifact_to_revision_id
)
from swh.loader.package.tests.common import (
    check_snapshot, check_metadata_paths, get_stats
)


def test_extract_npm_package_author(datadir):
    """Authors are extracted from the various shapes of package metadata

    """
    # NOTE(review): the `<email>` part of the author strings below is
    # inferred from the expected `email=` values (same `name <email>` shape
    # as the 'umi-plugin-whale' case) -- confirm against the fixtures.
    package_metadata_filepath = os.path.join(
        datadir, 'https_replicate.npmjs.com', 'org_visit1')

    with open(package_metadata_filepath) as json_file:
        package_metadata = json.load(json_file)

    # this comparison used to lack its `assert`, so it checked nothing
    assert (
        extract_npm_package_author(package_metadata['versions']['0.0.2']) ==
        Person(
            fullname=b'mooz <stillpedant@gmail.com>',
            name=b'mooz',
            email=b'stillpedant@gmail.com'
        )
    )

    assert (
        extract_npm_package_author(package_metadata['versions']['0.0.3']) ==
        Person(
            fullname=b'Masafumi Oyamada <stillpedant@gmail.com>',
            name=b'Masafumi Oyamada',
            email=b'stillpedant@gmail.com'
        )
    )

    package_json = json.loads('''
    {
        "name": "highlightjs-line-numbers.js",
        "version": "2.7.0",
        "description": "Highlight.js line numbers plugin.",
        "main": "src/highlightjs-line-numbers.js",
        "dependencies": {},
        "devDependencies": {
            "gulp": "^4.0.0",
            "gulp-rename": "^1.4.0",
            "gulp-replace": "^0.6.1",
            "gulp-uglify": "^1.2.0"
        },
        "repository": {
            "type": "git",
            "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git"
        },
        "author": "Yauheni Pakala <evgeniy.pakalo@gmail.com>",
        "license": "MIT",
        "bugs": {
            "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues"
        },
        "homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/"
    }''')  # noqa

    assert extract_npm_package_author(package_json) == \
        Person(
            fullname=b'Yauheni Pakala <evgeniy.pakalo@gmail.com>',
            name=b'Yauheni Pakala',
            email=b'evgeniy.pakalo@gmail.com'
        )

    package_json = json.loads('''
    {
        "name": "3-way-diff",
        "version": "0.0.1",
        "description": "3-way diffing of JavaScript objects",
        "main": "index.js",
        "authors": [
            {
                "name": "Shawn Walsh",
                "url": "https://github.com/shawnpwalsh"
            },
            {
                "name": "Markham F Rollins IV",
                "url": "https://github.com/mrollinsiv"
            }
        ],
        "keywords": [
            "3-way diff",
            "3 way diff",
            "three-way diff",
            "three way diff"
        ],
        "devDependencies": {
            "babel-core": "^6.20.0",
            "babel-preset-es2015": "^6.18.0",
            "mocha": "^3.0.2"
        },
        "dependencies": {
            "lodash": "^4.15.0"
        }
    }''')

    assert extract_npm_package_author(package_json) == \
        Person(
            fullname=b'Shawn Walsh',
            name=b'Shawn Walsh',
            email=None
        )

    package_json = json.loads('''
    {
        "name": "yfe-ynpm",
        "version": "1.0.0",
        "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm",
        "repository": {
            "type": "git",
            "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git"
        },
        "author": [
            "fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)",
            "xufuzi (https://7993.org)"
        ],
        "license": "MIT"
    }''')

    assert extract_npm_package_author(package_json) == \
        Person(
            fullname=b'fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)',
            name=b'fengmk2',
            email=b'fengmk2@gmail.com',
        )

    package_json = json.loads('''
    {
        "name": "umi-plugin-whale",
        "version": "0.0.8",
        "description": "Internal contract component",
        "authors": {
            "name": "xiaohuoni",
            "email": "448627663@qq.com"
        },
        "repository": "alitajs/whale",
        "devDependencies": {
            "np": "^3.0.4",
            "umi-tools": "*"
        },
        "license": "MIT"
    }''')

    assert extract_npm_package_author(package_json) == \
        Person(
            fullname=b'xiaohuoni <448627663@qq.com>',
            name=b'xiaohuoni',
            email=b'448627663@qq.com'
        )


def normalize_hashes(hashes):
    """Convert hexadecimal hashes to bytes.

    Args:
        hashes: either one hexadecimal hash (str), a list of them, or a
            mapping whose keys and values are hexadecimal hashes

    Returns:
        The same structure (bytes, list or dict) holding bytes hashes

    """
    if isinstance(hashes, str):
        return hash_to_bytes(hashes)
    if isinstance(hashes, list):
        return [hash_to_bytes(x) for x in hashes]
    return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()}


_expected_new_contents_first_visit = normalize_hashes([
    '4ce3058e16ab3d7e077f65aabf855c34895bf17c',
    '858c3ceee84c8311adc808f8cdb30d233ddc9d18',
    '0fa33b4f5a4e0496da6843a38ff1af8b61541996',
    '85a410f8ef8eb8920f2c384a9555566ad4a2e21b',
    '9163ac8025923d5a45aaac482262893955c9b37b',
    '692cf623b8dd2c5df2c2998fd95ae4ec99882fb4',
    '18c03aac6d3e910efb20039c15d70ab5e0297101',
    '41265c42446aac17ca769e67d1704f99e5a1394d',
    '783ff33f5882813dca9239452c4a7cadd4dba778',
    'b029cfb85107aee4590c2434a3329bfcf36f8fa1',
    '112d1900b4c2e3e9351050d1b542c9744f9793f3',
    '5439bbc4bd9a996f1a38244e6892b71850bc98fd',
    'd83097a2f994b503185adf4e719d154123150159',
    'd0939b4898e83090ee55fd9d8a60e312cfadfbaf',
    'b3523a26f7147e4af40d9d462adaae6d49eda13e',
    'cd065fb435d6fb204a8871bcd623d0d0e673088c',
    '2854a40855ad839a54f4b08f5cff0cf52fca4399',
    'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe',
    '0f73d56e1cf480bded8a1ecf20ec6fc53c574713',
    '0d9882b2dfafdce31f4e77fe307d41a44a74cefe',
    '585fc5caab9ead178a327d3660d35851db713df1',
    'e8cd41a48d79101977e3036a87aeb1aac730686f',
    '5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7',
    '9c3cc2763bf9e9e37067d3607302c4776502df98',
    '3649a68410e354c83cd4a38b66bd314de4c8f5c9',
    'e96ed0c091de1ebdf587104eaf63400d1974a1fe',
    '078ca03d2f99e4e6eab16f7b75fbb7afb699c86c',
    '38de737da99514de6559ff163c988198bc91367a',
])

_expected_new_directories_first_visit = normalize_hashes([
    '3370d20d6f96dc1c9e50f083e2134881db110f4f',
    '42753c0c2ab00c4501b552ac4671c68f3cf5aece',
    'd7895533ef5edbcffdea3f057d9fef3a1ef845ce',
    '80579be563e2ef3e385226fe7a3f079b377f142c',
    '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c',
    'bcad03ce58ac136f26f000990fc9064e559fe1c0',
    '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca',
    'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd',
    '584b5b4b6cf7f038095e820b99386a9c232de931',
    '184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a',
    'bb5f4ee143c970367eb409f2e4c1104898048b9d',
    '1b95491047add1103db0dfdfa84a9735dcb11e88',
    'a00c6de13471a2d66e64aca140ddb21ef5521e62',
    '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2',
    'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2',
    '202fafcd7c0f8230e89d5496ad7f44ab12b807bf',
    '775cc516543be86c15c1dc172f49c0d4e6e78235',
    'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e',
])

# revision id -> id of the directory the revision targets
_expected_new_revisions_first_visit = normalize_hashes({
    'd8a1c7474d2956ac598a19f0f27d52f7015f117e':
    '42753c0c2ab00c4501b552ac4671c68f3cf5aece',
    '5f9eb78af37ffd12949f235e86fac04898f9f72a':
    '3370d20d6f96dc1c9e50f083e2134881db110f4f',
    'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a':
    'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'}
)


def package_url(package):
    """Build the origin url of an npm package"""
    return 'https://www.npmjs.com/package/%s' % package


def package_metadata_url(package):
    """Build the url of the registry metadata of an npm package"""
    return 'https://replicate.npmjs.com/%s/' % package


def test_revision_metadata_structure(swh_config, requests_mock_datadir):
    """Loaded revisions hold intrinsic, extrinsic and artifact metadata

    """
    package = 'org'
    loader = NpmLoader(package_url(package))

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    assert actual_load_status['snapshot_id'] is not None

    expected_revision_id = hash_to_bytes(
        'd8a1c7474d2956ac598a19f0f27d52f7015f117e')
    revision = list(loader.storage.revision_get([expected_revision_id]))[0]
    assert revision is not None

    check_metadata_paths(revision['metadata'], paths=[
        ('intrinsic.tool', str),
        ('intrinsic.raw', dict),
        ('extrinsic.provider', str),
        ('extrinsic.when', str),
        ('extrinsic.raw', dict),
        ('original_artifact', list),
    ])

    for original_artifact in revision['metadata']['original_artifact']:
        check_metadata_paths(original_artifact, paths=[
            ('filename', str),
            ('length', int),
            ('checksums', dict),
        ])


def test_npm_loader_first_visit(swh_config, requests_mock_datadir):
    """A first visit loads all the artifacts and builds the snapshot

    """
    package = 'org'
    loader = NpmLoader(package_url(package))

    actual_load_status = loader.load()
    expected_snapshot_id = 'd0587e1195aed5a8800411a008f2f2d627f18e2d'
    assert actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id
    }

    stats = get_stats(loader.storage)

    assert {
        'content': len(_expected_new_contents_first_visit),
        'directory': len(_expected_new_directories_first_visit),
        'origin': 1,
        'origin_visit': 1,
        'person': 2,
        'release': 0,
        'revision': len(_expected_new_revisions_first_visit),
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    assert len(list(loader.storage.content_get(
        _expected_new_contents_first_visit))) == len(
            _expected_new_contents_first_visit)

    assert list(loader.storage.directory_missing(
        _expected_new_directories_first_visit)) == []

    assert list(loader.storage.revision_missing(
        _expected_new_revisions_first_visit)) == []

    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': {
            'HEAD': {
                'target': 'releases/0.0.4',
                'target_type': 'alias'
            },
            'releases/0.0.2': {
                'target': 'd8a1c7474d2956ac598a19f0f27d52f7015f117e',
                'target_type': 'revision'
            },
            'releases/0.0.3': {
                'target': '5f9eb78af37ffd12949f235e86fac04898f9f72a',
                'target_type': 'revision'
            },
            'releases/0.0.4': {
                'target': 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a',
                'target_type': 'revision'
            }
        }
    }
    check_snapshot(expected_snapshot, loader.storage)


def test_npm_loader_incremental_visit(
        swh_config, requests_mock_datadir_visits):
    """A second visit only loads the artifacts it did not see before

    """
    package = 'org'
    url = package_url(package)
    loader = NpmLoader(url)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    # the snapshot id is compared with the second visit's one below
    assert actual_load_status['snapshot_id'] is not None
    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'npm'

    stats = get_stats(loader.storage)

    assert {
        'content': len(_expected_new_contents_first_visit),
        'directory': len(_expected_new_directories_first_visit),
        'origin': 1,
        'origin_visit': 1,
        'person': 2,
        'release': 0,
        'revision': len(_expected_new_revisions_first_visit),
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    loader._info = None  # reset loader internal state
    actual_load_status2 = loader.load()
    assert actual_load_status2['status'] == 'eventful'
    snap_id2 = actual_load_status2['snapshot_id']
    assert snap_id2 is not None
    assert snap_id2 != actual_load_status['snapshot_id']

    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit2['status'] == 'full'
    assert origin_visit2['type'] == 'npm'

    stats = get_stats(loader.storage)

    assert {  # 3 new releases artifacts
        'content': len(_expected_new_contents_first_visit) + 14,
        'directory': len(_expected_new_directories_first_visit) + 15,
        'origin': 1,
        'origin_visit': 2,
        'person': 2,
        'release': 0,
        'revision': len(_expected_new_revisions_first_visit) + 3,
        'skipped_content': 0,
        'snapshot': 2,
    } == stats

    urls = [
        m.url for m in requests_mock_datadir_visits.request_history
        if m.url.startswith('https://registry.npmjs.org')
    ]
    assert len(urls) == len(set(urls))  # we visited each artifact once across


@pytest.mark.usefixtures('requests_mock_datadir')
def test_npm_loader_version_divergence(swh_config):
    """Both the release and the pre-release of a package get loaded

    """
    package = '@aller_shared'
    url = package_url(package)
    loader = NpmLoader(url)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    assert actual_load_status['snapshot_id'] is not None
    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'npm'

    stats = get_stats(loader.storage)

    assert {  # 1 new releases artifacts
        'content': 534,
        'directory': 153,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 2,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    expected_snapshot = {
        'id': 'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92',
        'branches': {
            'HEAD': {
                'target_type': 'alias',
                'target': 'releases/0.1.0'
            },
            'releases/0.1.0': {
                'target_type': 'revision',
                'target': '845673bfe8cbd31b1eaf757745a964137e6f9116',
            },
            'releases/0.1.1-alpha.14': {
                'target_type': 'revision',
                'target': '05181c12cd8c22035dd31155656826b85745da37',
            },
        },
    }
    check_snapshot(expected_snapshot, loader.storage)


def test_npm_artifact_to_revision_id_none():
    """Current loader version should stop soon if nothing can be found

    """
    artifact_metadata = {
        'dist': {
            'shasum': '05181c12cd8c22035dd31155656826b85745da37',
        },
    }

    known_artifacts = {
        'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92': {},
    }

    assert artifact_to_revision_id(known_artifacts, artifact_metadata) is None


def test_npm_artifact_to_revision_id_old_loader_version():
    """Current loader version should solve old metadata scheme

    """
    artifact_metadata = {
        'dist': {
            'shasum': '05181c12cd8c22035dd31155656826b85745da37',
        }
    }

    known_artifacts = {
        hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): {
            'package_source': {
                'sha1': "something-wrong"
            }
        },
        hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): {
            'package_source': {
                'sha1': '05181c12cd8c22035dd31155656826b85745da37',
            }
        }
    }

    assert artifact_to_revision_id(known_artifacts, artifact_metadata) \
        == hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116')


def test_npm_artifact_to_revision_id_current_loader_version():
    """Current loader version should be able to solve current metadata scheme

    """
    artifact_metadata = {
        'dist': {
            'shasum': '05181c12cd8c22035dd31155656826b85745da37',
        }
    }

    known_artifacts = {
        hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92'): {
            'original_artifact': [{
                'checksums': {
                    'sha1': "05181c12cd8c22035dd31155656826b85745da37"
                },
            }],
        },
        hash_to_bytes('845673bfe8cbd31b1eaf757745a964137e6f9116'): {
            'original_artifact': [{
                'checksums': {
                    'sha1': 'something-wrong'
                },
            }],
        },
    }

    assert artifact_to_revision_id(known_artifacts, artifact_metadata) \
        == hash_to_bytes('b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92')


def test_npm_artifact_with_no_intrinsic_metadata(
        swh_config, requests_mock_datadir):
    """Skip artifact with no intrinsic metadata during ingestion

    """
    package = 'nativescript-telerik-analytics'
    url = package_url(package)
    loader = NpmLoader(url)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'

    # no branch as one artifact without any intrinsic metadata
    expected_snapshot = {
        'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
        'branches': {},
    }
    check_snapshot(expected_snapshot, loader.storage)

    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'npm'


def test_npm_artifact_with_no_upload_time(swh_config, requests_mock_datadir):
    """With no time upload, artifact is skipped

    """
    package = 'jammit-no-time'
    url = package_url(package)
    loader = NpmLoader(url)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'uneventful'

    # no branch as one artifact without any intrinsic metadata
    expected_snapshot = {
        'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
        'branches': {},
    }
    check_snapshot(expected_snapshot, loader.storage)

    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'partial'
    assert origin_visit['type'] == 'npm'


def test_npm_artifact_use_mtime_if_no_time(swh_config, requests_mock_datadir):
    """With no time upload but an mtime, artifact is still loaded

    """
    package = 'jammit-express'
    url = package_url(package)
    loader = NpmLoader(url)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'

    # artifact is used
    expected_snapshot = {
        'id': 'd6e08e19159f77983242877c373c75222d5ae9dd',
        'branches': {
            'HEAD': {
                'target_type': 'alias',
                'target': 'releases/0.0.1'
            },
            'releases/0.0.1': {
                'target_type': 'revision',
                'target': '9e4dd2b40d1b46b70917c0949aa2195c823a648e',
            }
        }
    }
    check_snapshot(expected_snapshot, loader.storage)

    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'npm'


def test_npm_no_artifact(swh_config, requests_mock_datadir):
    """If no artifacts at all is found for origin, the visit fails completely

    """
    package = 'catify'
    url = package_url(package)
    loader = NpmLoader(url)
    actual_load_status = loader.load()
    # no snapshot got built, hence no 'snapshot_id' key
    assert actual_load_status == {
        'status': 'failed',
    }

    origin_visit = loader.storage.origin_visit_get_latest(url)
    assert origin_visit['status'] == 'partial'
    assert origin_visit['type'] == 'npm'