diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -319,30 +319,31 @@ """ summary: Dict[str, Any] = {"status": "uneventful"} - with sentry_sdk.push_scope() as scope: - try: - results = [] - contents = objects.get("content", []) - # FIXME: with swh.objstorage > v2.0: self.objstorage.get_batch(contents) - content_data = self.objstorage.get_batch(c["sha1"] for c in contents) - for item, raw_content in zip(contents, content_data): - id_ = item["sha1"] - scope.set_tag("swh-indexer-content-sha1", hashutil.hash_to_hex(id_)) - if not raw_content: - self.log.warning( - "Content %s not found in objstorage", - hashutil.hash_to_hex(id_), - ) - continue - - results.extend(self.index(id_, data=raw_content)) - except Exception: - if not self.catch_exceptions: - raise - self.log.exception("Problem when reading contents metadata.") - sentry_sdk.capture_exception() - summary["status"] = "failed" - return summary + try: + results = [] + contents = objects.get("content", []) + # FIXME: with swh.objstorage > v2.0: self.objstorage.get_batch(contents) + content_data = self.objstorage.get_batch(c["sha1"] for c in contents) + for item, raw_content in zip(contents, content_data): + id_ = item["sha1"] + sentry_sdk.set_tag( + "swh-indexer-content-sha1", hashutil.hash_to_hex(id_) + ) + if not raw_content: + self.log.warning( + "Content %s not found in objstorage", + hashutil.hash_to_hex(id_), + ) + continue + + results.extend(self.index(id_, data=raw_content)) + except Exception: + if not self.catch_exceptions: + raise + self.log.exception("Problem when reading contents metadata.") + sentry_sdk.capture_exception() + summary["status"] = "failed" + return summary summary_persist = self.persist_index_computations(results) self.results = results @@ -380,32 +381,31 @@ ] results = [] summary: Dict = {"status": "uneventful"} - with sentry_sdk.push_scope() as scope: - try: - for sha1 in sha1s: - scope.set_tag( - "swh-indexer-content-sha1", hashutil.hash_to_hex(sha1) + try: + for sha1 in sha1s: + sentry_sdk.set_tag( + "swh-indexer-content-sha1", hashutil.hash_to_hex(sha1) + ) + try: + raw_content = self.objstorage.get(sha1) + except ObjNotFoundError: + self.log.warning( + "Content %s not found in objstorage" + % hashutil.hash_to_hex(sha1) ) - try: - raw_content = self.objstorage.get(sha1) - except ObjNotFoundError: - self.log.warning( - "Content %s not found in objstorage" - % hashutil.hash_to_hex(sha1) - ) - continue - res = self.index(sha1, raw_content, **kwargs) - if res: # If no results, skip it - results.extend(res) - summary["status"] = "eventful" - summary = self.persist_index_computations(results) - self.results = results - except Exception: - if not self.catch_exceptions: - raise - self.log.exception("Problem when reading contents metadata.") - sentry_sdk.capture_exception() - summary["status"] = "failed" + continue + res = self.index(sha1, raw_content, **kwargs) + if res: # If no results, skip it + results.extend(res) + summary["status"] = "eventful" + summary = self.persist_index_computations(results) + self.results = results + except Exception: + if not self.catch_exceptions: + raise + self.log.exception("Problem when reading contents metadata.") + sentry_sdk.capture_exception() + summary["status"] = "failed" return summary @@ -485,17 +485,14 @@ indexing result as dict to persist in the indexer backend """ - with sentry_sdk.push_scope() as scope: - for sha1 in self._list_contents_to_index( - partition_id, nb_partitions, indexed - ): - try: - raw_content = self.objstorage.get(sha1) - except ObjNotFoundError: - self.log.warning(f"Content {sha1.hex()} not found in objstorage") - continue - scope.set_tag("swh-indexer-content-sha1", sha1) - yield from self.index(sha1, raw_content, **kwargs) + for sha1 in self._list_contents_to_index(partition_id, nb_partitions, indexed): + try: + raw_content = self.objstorage.get(sha1) + except ObjNotFoundError: + self.log.warning(f"Content {sha1.hex()} not found in objstorage") + continue + sentry_sdk.set_tag("swh-indexer-content-sha1", sha1) + yield from self.index(sha1, raw_content, **kwargs) def _index_with_skipping_already_done( self, partition_id: int, nb_partitions: int @@ -615,23 +612,21 @@ ] + [Origin(url=origin["url"]) for origin in objects.get("origin", [])] summary: Dict[str, Any] = {"status": "uneventful"} - with sentry_sdk.push_scope() as scope: - try: - results = self.index_list( - origins, - # no need to check they exist, as we just received either an origin - # or visit status; which cannot be created by swh-storage unless - # the origin already exists - check_origin_known=False, - sentry_scope=scope, - ) - except Exception: - if not self.catch_exceptions: - raise - self.log.exception("Problem when processing origins") - sentry_sdk.capture_exception() - summary["status"] = "failed" - return summary + try: + results = self.index_list( + origins, + # no need to check they exist, as we just received either an origin + # or visit status; which cannot be created by swh-storage unless + # the origin already exists + check_origin_known=False, + ) + except Exception: + if not self.catch_exceptions: + raise + self.log.exception("Problem when processing origins") + sentry_sdk.capture_exception() + summary["status"] = "failed" + return summary summary_persist = self.persist_index_computations(results) self.results = results @@ -642,13 +637,10 @@ summary.update(summary_persist) return summary - def index_list( - self, origins: List[Origin], *, sentry_scope=None, **kwargs - ) -> List[TResult]: + def index_list(self, origins: List[Origin], **kwargs) -> List[TResult]: results = [] for origin in origins: - if sentry_scope is not None: - sentry_scope.set_tag("swh-indexer-origin-url", origin.url) + sentry_sdk.set_tag("swh-indexer-origin-url", origin.url) results.extend(self.index(origin.url, **kwargs)) return results @@ -707,18 +699,17 @@ # TODO: fetch raw_manifest when useful? - with sentry_sdk.push_scope() as scope: - for (dir_id, dir_) in directories: - swhid = f"swh:1:dir:{hashutil.hash_to_hex(dir_id)}" - scope.set_tag("swh-indexer-directory-swhid", swhid) - try: - results.extend(self.index(dir_id, dir_)) - except Exception: - if not self.catch_exceptions: - raise - self.log.exception("Problem when processing directory") - sentry_sdk.capture_exception() - summary["status"] = "failed" + for (dir_id, dir_) in directories: + swhid = f"swh:1:dir:{hashutil.hash_to_hex(dir_id)}" + sentry_sdk.set_tag("swh-indexer-directory-swhid", swhid) + try: + results.extend(self.index(dir_id, dir_)) + except Exception: + if not self.catch_exceptions: + raise + self.log.exception("Problem when processing directory") + sentry_sdk.capture_exception() + summary["status"] = "failed" summary_persist = self.persist_index_computations(results) if summary_persist: diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -72,18 +72,17 @@ ): def process_journal_objects(self, objects: ObjectsDict) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} - with sentry_sdk.push_scope() as scope: - try: - results = [] - for item in objects.get("raw_extrinsic_metadata", []): - remd = RawExtrinsicMetadata.from_dict(item) - scope.set_tag("swh-indexer-remd-swhid", remd.swhid()) - results.extend(self.index(remd.id, data=remd)) - except Exception: - if not self.catch_exceptions: - raise - summary["status"] = "failed" - return summary + try: + results = [] + for item in objects.get("raw_extrinsic_metadata", []): + remd = RawExtrinsicMetadata.from_dict(item) + sentry_sdk.set_tag("swh-indexer-remd-swhid", remd.swhid()) + results.extend(self.index(remd.id, data=remd)) + except Exception: + if not self.catch_exceptions: + raise + summary["status"] = "failed" + return summary summary_persist = self.persist_index_computations(results) self.results = results @@ -423,7 +422,6 @@ origins: List[Origin], *, check_origin_known: bool = True, - sentry_scope=None, **kwargs, ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] @@ -474,8 +472,8 @@ results = [] for (origin, head_swhid) in origin_heads.items(): - sentry_scope.set_tag("swh-indexer-origin-url", origin.url) - sentry_scope.set_tag("swh-indexer-origin-head-swhid", str(head_swhid)) + sentry_sdk.set_tag("swh-indexer-origin-url", origin.url) + sentry_sdk.set_tag("swh-indexer-origin-head-swhid", str(head_swhid)) if head_swhid.object_type == ObjectType.REVISION: rev = head_revs[head_swhid.object_id] if not rev: diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py --- a/swh/indexer/tests/test_indexer.py +++ b/swh/indexer/tests/test_indexer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,6 +7,7 @@ from unittest.mock import Mock import pytest +import sentry_sdk from swh.indexer.indexer import ( ContentIndexer, @@ -107,11 +108,18 @@ # As journal client, not catching exceptions with pytest.raises(_TestException): - assert indexer.process_journal_objects({"content": [{"sha1": sha1}]}) == { - "status": "failed" - } + indexer.process_journal_objects({"content": [{"sha1": sha1}]}) assert sentry_events == [] + # As journal client, check the frontend will be able to get the tag when reporting + try: + indexer.process_journal_objects({"content": [{"sha1": sha1}]}) + except Exception: + sentry_sdk.capture_exception() + else: + assert False + check_sentry(sentry_events, {"swh-indexer-content-sha1": sha1.hex()}) + def test_directory_indexer_catch_exceptions(sentry_events): indexer = CrashingDirectoryIndexer(config=BASE_TEST_CONFIG) @@ -143,6 +151,15 @@ indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) assert sentry_events == [] + # As journal client, check the frontend will be able to get the tag when reporting + try: + indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) + except Exception: + sentry_sdk.capture_exception() + else: + assert False + check_sentry(sentry_events, {"swh-indexer-directory-swhid": swhid}) + def test_origin_indexer_catch_exceptions(sentry_events): indexer = CrashingOriginIndexer(config=BASE_TEST_CONFIG) @@ -171,6 +188,15 @@ indexer.process_journal_objects({"origin": [{"url": origin_url}]}) assert sentry_events == [] + # As journal client, check the frontend will be able to get the tag when reporting + try: + indexer.process_journal_objects({"origin": [{"url": origin_url}]}) + except Exception: + sentry_sdk.capture_exception() + else: + assert False + check_sentry(sentry_events, {"swh-indexer-origin-url": origin_url}) + def test_content_partition_indexer_catch_exceptions(): indexer = CrashingContentPartitionIndexer( diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -259,17 +259,21 @@ assert orig_results == [] +@pytest.mark.parametrize("catch_exceptions", [True, False]) def test_origin_metadata_indexer_directory_error( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, sentry_events, + catch_exceptions, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" + indexer.catch_exceptions = catch_exceptions + with patch( "swh.indexer.metadata.DirectoryMetadataIndexer" ".translate_directory_intrinsic_metadata", @@ -296,17 +300,21 @@ assert orig_results == [] +@pytest.mark.parametrize("catch_exceptions", [True, False]) def test_origin_metadata_indexer_content_exception( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, sentry_events, + catch_exceptions, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" + indexer.catch_exceptions = catch_exceptions + class TestException(Exception): pass