diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -13,6 +13,7 @@ ) import attr +import sentry_sdk from swh.core.tarball import uncompress from swh.core.config import SWHConfig @@ -264,14 +265,37 @@ tmp_revisions = {} # type: Dict[str, List] snapshot = None + def finalize_visit() -> Dict[str, Any]: + """Finalize the visit: + + - flush eventual unflushed data to storage + - update origin visit's status + - return the task's status + + """ + if hasattr(self.storage, 'flush'): + self.storage.flush() + self.storage.origin_visit_update( + origin=self.url, visit_id=visit.visit, status=status_visit, + snapshot=snapshot and snapshot.id) + + result: Dict[str, Any] = { + 'status': status_load, + } + if snapshot: + result['snapshot_id'] = hash_to_hex(snapshot.id) + return result + # Prepare origin and origin_visit origin = Origin(url=self.url) try: self.storage.origin_add_one(origin) visit = self.storage.origin_visit_add( self.url, date=self.visit_date, type=self.visit_type) - except Exception: - logger.exception('Failed to create origin/origin_visit:') + except Exception as e: + logger.exception('Failed to initialize origin_visit for %s', + self.url) + sentry_sdk.capture_exception(e) return {'status': 'failed'} try: @@ -279,74 +303,85 @@ logger.debug('last snapshot: %s', last_snapshot) known_artifacts = self.known_artifacts(last_snapshot) logger.debug('known artifacts: %s', known_artifacts) + except Exception as e: + logger.exception('Failed to get previous state for %s', self.url) + sentry_sdk.capture_exception(e) + status_visit = 'partial' + status_load = 'failed' + return finalize_visit() + + load_exceptions = [] + + for version in self.get_versions(): # for each + logger.debug('version: %s', version) + tmp_revisions[version] = [] + # `p_` stands for `package_` + for branch_name, p_info in self.get_package_info(version): + logger.debug('package_info: %s', p_info) + revision_id = self.resolve_revision_from( + known_artifacts, p_info['raw']) + if revision_id is None: + try: + revision_id = self._load_revision(p_info, origin) + status_load = 'eventful' + except Exception as e: + load_exceptions.append(e) + sentry_sdk.capture_exception(e) + logger.exception('Failed loading branch %s for %s', + branch_name, self.url) + continue - for version in self.get_versions(): # for each - logger.debug('version: %s', version) - tmp_revisions[version] = [] - # `p_` stands for `package_` - for branch_name, p_info in self.get_package_info(version): - logger.debug('package_info: %s', p_info) - revision_id = self.resolve_revision_from( - known_artifacts, p_info['raw']) if revision_id is None: - (revision_id, loaded) = \ - self._load_revision(p_info, origin) - if loaded: - status_load = 'eventful' - else: - status_visit = 'partial' - if revision_id is None: - continue - - tmp_revisions[version].append((branch_name, revision_id)) - - except Exception: - logger.exception('Fail to load %s' % self.url) + continue + + tmp_revisions[version].append((branch_name, revision_id)) + + if load_exceptions: status_visit = 'partial' + + if not tmp_revisions: + # We could not load any revisions; fail completely + status_visit = 'failed' status_load = 'failed' - finally: + return finalize_visit() + + try: # Retrieve the default release version (the "latest" one) default_version = self.get_default_version() logger.debug('default version: %s', default_version) + # Retrieve extra branches extra_branches = self.extra_branches() logger.debug('extra branches: %s', extra_branches) - snapshot = self._load_snapshot( - default_version, tmp_revisions, extra_branches) - if hasattr(self.storage, 'flush'): - self.storage.flush() - self.storage.origin_visit_update( - origin=self.url, visit_id=visit.visit, status=status_visit, - snapshot=snapshot and snapshot.id) - result: Dict[str, Any] = { - 'status': status_load, - } - if snapshot: - result['snapshot_id'] = hash_to_hex(snapshot.id) - return result - def _load_revision(self, p_info, origin) -> Tuple[Optional[Sha1Git], bool]: + snapshot = self._load_snapshot(default_version, tmp_revisions, + extra_branches) + + except Exception as e: + logger.exception('Failed to build snapshot for origin %s', + self.url) + sentry_sdk.capture_exception(e) + status_visit = 'partial' + status_load = 'failed' + + return finalize_visit() + + def _load_revision(self, p_info, origin) -> Optional[Sha1Git]: """Does all the loading of a revision itself: * downloads a package and uncompresses it * loads it from disk * adds contents, directories, and revision to self.storage * returns (revision_id, loaded) + + Raises + exception when unable to download or uncompress artifacts + """ with tempfile.TemporaryDirectory() as tmpdir: - try: - dl_artifacts = self.download_package(p_info, tmpdir) - except Exception: - logger.exception('Unable to retrieve %s', - p_info) - return (None, False) - - try: - uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) - logger.debug('uncompressed_path: %s', uncompressed_path) - except ValueError: - logger.exception('Fail to uncompress %s', - p_info['url']) - return (None, False) + dl_artifacts = self.download_package(p_info, tmpdir) + + uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) + logger.debug('uncompressed_path: %s', uncompressed_path) directory = from_disk.Directory.from_disk( path=uncompressed_path.encode('utf-8'), @@ -386,7 +421,7 @@ if not revision: # Some artifacts are missing intrinsic metadata # skipping those - return (None, True) + return None metadata = revision.metadata or {} metadata.update({ @@ -400,7 +435,7 @@ self.storage.revision_add([revision]) - return (revision.id, True) + return revision.id def _load_snapshot( self, default_version: str, diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -545,7 +545,7 @@ loader = NpmLoader(url) actual_load_status = loader.load() - assert actual_load_status['status'] == 'failed' + assert actual_load_status['status'] == 'uneventful' # no branch as one artifact without any intrinsic metadata expected_snapshot = { diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -240,9 +240,7 @@ loader = PyPILoader(url) actual_load_status = loader.load() - assert actual_load_status['status'] == 'failed' - assert actual_load_status[ - 'snapshot_id'] == '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e' + assert actual_load_status == {'status': 'failed'} stats = get_stats(loader.storage) @@ -255,7 +253,7 @@ 'release': 0, 'revision': 0, 'skipped_content': 0, - 'snapshot': 1, + 'snapshot': 0, } == stats origin_visit = next(loader.storage.origin_visit_get(url))