diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -319,27 +319,30 @@ """ summary: Dict[str, Any] = {"status": "uneventful"} - try: - results = [] - contents = objects.get("content", []) - # FIXME: with swh.objstorage > v2.0: self.objstorage.get_batch(contents) - content_data = self.objstorage.get_batch(c["sha1"] for c in contents) - for item, raw_content in zip(contents, content_data): - id_ = item["sha1"] - if not raw_content: - self.log.warning( - "Content %s not found in objstorage", hashutil.hash_to_hex(id_) - ) - continue - - results.extend(self.index(id_, data=raw_content)) - except Exception: - if not self.catch_exceptions: - raise - self.log.exception("Problem when reading contents metadata.") - sentry_sdk.capture_exception() - summary["status"] = "failed" - return summary + with sentry_sdk.push_scope() as scope: + try: + results = [] + contents = objects.get("content", []) + # FIXME: with swh.objstorage > v2.0: self.objstorage.get_batch(contents) + content_data = self.objstorage.get_batch(c["sha1"] for c in contents) + for item, raw_content in zip(contents, content_data): + id_ = item["sha1"] + scope.set_tag("swh-indexer-content-sha1", hashutil.hash_to_hex(id_)) + if not raw_content: + self.log.warning( + "Content %s not found in objstorage", + hashutil.hash_to_hex(id_), + ) + continue + + results.extend(self.index(id_, data=raw_content)) + except Exception: + if not self.catch_exceptions: + raise + self.log.exception("Problem when reading contents metadata.") + sentry_sdk.capture_exception() + summary["status"] = "failed" + return summary summary_persist = self.persist_index_computations(results) self.results = results @@ -377,28 +380,32 @@ ] results = [] summary: Dict = {"status": "uneventful"} - try: - for sha1 in sha1s: - try: - raw_content = self.objstorage.get(sha1) - except ObjNotFoundError: - self.log.warning( - "Content %s not found in objstorage" - % 
hashutil.hash_to_hex(sha1) + with sentry_sdk.push_scope() as scope: + try: + for sha1 in sha1s: + scope.set_tag( + "swh-indexer-content-sha1", hashutil.hash_to_hex(sha1) ) - continue - res = self.index(sha1, raw_content, **kwargs) - if res: # If no results, skip it - results.extend(res) - summary["status"] = "eventful" - summary = self.persist_index_computations(results) - self.results = results - except Exception: - if not self.catch_exceptions: - raise - self.log.exception("Problem when reading contents metadata.") - sentry_sdk.capture_exception() - summary["status"] = "failed" + try: + raw_content = self.objstorage.get(sha1) + except ObjNotFoundError: + self.log.warning( + "Content %s not found in objstorage" + % hashutil.hash_to_hex(sha1) + ) + continue + res = self.index(sha1, raw_content, **kwargs) + if res: # If no results, skip it + results.extend(res) + summary["status"] = "eventful" + summary = self.persist_index_computations(results) + self.results = results + except Exception: + if not self.catch_exceptions: + raise + self.log.exception("Problem when reading contents metadata.") + sentry_sdk.capture_exception() + summary["status"] = "failed" return summary @@ -478,13 +485,17 @@ indexing result as dict to persist in the indexer backend """ - for sha1 in self._list_contents_to_index(partition_id, nb_partitions, indexed): - try: - raw_content = self.objstorage.get(sha1) - except ObjNotFoundError: - self.log.warning(f"Content {sha1.hex()} not found in objstorage") - continue - yield from self.index(sha1, raw_content, **kwargs) + with sentry_sdk.push_scope() as scope: + for sha1 in self._list_contents_to_index( + partition_id, nb_partitions, indexed + ): + scope.set_tag("swh-indexer-content-sha1", sha1.hex()) + try: + raw_content = self.objstorage.get(sha1) + except ObjNotFoundError: + self.log.warning(f"Content {sha1.hex()} not found in objstorage") + continue + yield from self.index(sha1, raw_content, **kwargs) def _index_with_skipping_already_done( self, 
partition_id: int, nb_partitions: int @@ -604,20 +615,23 @@ ] + [Origin(url=origin["url"]) for origin in objects.get("origin", [])] summary: Dict[str, Any] = {"status": "uneventful"} - try: - results = self.index_list( - origins, - check_origin_known=False, - # no need to check they exist, as we just received either an origin or - # visit status; which cannot be created by swh-storage unless the origin - # already exists - ) - except Exception: - if not self.catch_exceptions: - raise - - summary["status"] = "failed" - return summary + with sentry_sdk.push_scope() as scope: + try: + results = self.index_list( + origins, + # no need to check they exist, as we just received either an origin + # or visit status; which cannot be created by swh-storage unless + # the origin already exists + check_origin_known=False, + sentry_scope=scope, + ) + except Exception: + if not self.catch_exceptions: + raise + self.log.exception("Problem when processing origins") + sentry_sdk.capture_exception() + summary["status"] = "failed" + return summary summary_persist = self.persist_index_computations(results) self.results = results @@ -628,17 +642,14 @@ summary.update(summary_persist) return summary - def index_list(self, origins: List[Origin], **kwargs) -> List[TResult]: + def index_list( + self, origins: List[Origin], *, sentry_scope=None, **kwargs + ) -> List[TResult]: results = [] for origin in origins: - try: - results.extend(self.index(origin.url, **kwargs)) - except Exception: - if not self.catch_exceptions: - raise - self.log.exception("Problem when processing origin %s", origin.url) - sentry_sdk.capture_exception() - raise + if sentry_scope is not None: + sentry_scope.set_tag("swh-indexer-origin-url", origin.url) + results.extend(self.index(origin.url, **kwargs)) return results @@ -696,15 +707,18 @@ # TODO: fetch raw_manifest when useful? 
- for (dir_id, dir_) in directories: - try: - results.extend(self.index(dir_id, dir_)) - except Exception: - if not self.catch_exceptions: - raise - self.log.exception("Problem when processing directory") - sentry_sdk.capture_exception() - summary["status"] = "failed" + with sentry_sdk.push_scope() as scope: + for (dir_id, dir_) in directories: + swhid = f"swh:1:dir:{hashutil.hash_to_hex(dir_id)}" + scope.set_tag("swh-indexer-directory-swhid", swhid) + try: + results.extend(self.index(dir_id, dir_)) + except Exception: + if not self.catch_exceptions: + raise + self.log.exception("Problem when processing directory") + sentry_sdk.capture_exception() + summary["status"] = "failed" summary_persist = self.persist_index_computations(results) if summary_persist: diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -72,16 +72,18 @@ ): def process_journal_objects(self, objects: ObjectsDict) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} - try: - results = [] - for item in objects.get("raw_extrinsic_metadata", []): - remd = RawExtrinsicMetadata.from_dict(item) - results.extend(self.index(remd.id, data=remd)) - except Exception: - if not self.catch_exceptions: - raise - summary["status"] = "failed" - return summary + with sentry_sdk.push_scope() as scope: + try: + results = [] + for item in objects.get("raw_extrinsic_metadata", []): + remd = RawExtrinsicMetadata.from_dict(item) + scope.set_tag("swh-indexer-remd-swhid", str(remd.swhid())) + results.extend(self.index(remd.id, data=remd)) + except Exception: + if not self.catch_exceptions: + raise + summary["status"] = "failed" + return summary summary_persist = self.persist_index_computations(results) self.results = results @@ -417,7 +419,12 @@ self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config) def index_list( - self, origins: List[Origin], check_origin_known: bool = True, **kwargs + self, + origins: List[Origin], + *, + 
check_origin_known: bool = True, + sentry_scope=None, + **kwargs, ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] head_rel_ids = [] @@ -467,6 +474,9 @@ results = [] for (origin, head_swhid) in origin_heads.items(): + if sentry_scope is not None: + sentry_scope.set_tag("swh-indexer-origin-url", origin.url) + sentry_scope.set_tag("swh-indexer-origin-head-swhid", str(head_swhid)) if head_swhid.object_type == ObjectType.REVISION: rev = head_revs[head_swhid.object_id] if not rev: diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py --- a/swh/indexer/tests/test_indexer.py +++ b/swh/indexer/tests/test_indexer.py @@ -73,9 +73,10 @@ return {"nb_added": len(results)} -def check_sentry(sentry_events): +def check_sentry(sentry_events, tags): assert len(sentry_events) == 1 sentry_event = sentry_events.pop() + assert sentry_event.get("tags") == tags assert "'_TestException'" in str(sentry_event) @@ -92,13 +93,13 @@ # As task, catching exceptions assert indexer.run([sha1]) == {"status": "failed"} - check_sentry(sentry_events) + check_sentry(sentry_events, {"swh-indexer-content-sha1": sha1.hex()}) # As journal client, catching exceptions assert indexer.process_journal_objects({"content": [{"sha1": sha1}]}) == { "status": "failed" } - check_sentry(sentry_events) + check_sentry(sentry_events, {"swh-indexer-content-sha1": sha1.hex()}) indexer.catch_exceptions = False @@ -124,16 +125,17 @@ sentry_events = sentry_capture_events() sha1 = DIRECTORY2.id + swhid = str(DIRECTORY2.swhid()) # As task, catching exceptions assert indexer.run([sha1]) == {"status": "failed"} - check_sentry(sentry_events) + check_sentry(sentry_events, {"swh-indexer-directory-swhid": swhid}) # As journal client, catching exceptions assert indexer.process_journal_objects({"directory": [DIRECTORY2.to_dict()]}) == { "status": "failed" } - check_sentry(sentry_events) + check_sentry(sentry_events, {"swh-indexer-directory-swhid": swhid}) indexer.catch_exceptions = False @@ 
-158,13 +160,13 @@ # As task, catching exceptions assert indexer.run([origin_url]) == {"status": "failed"} - check_sentry(sentry_events) + check_sentry(sentry_events, {"swh-indexer-origin-url": origin_url}) # As journal client, catching exceptions assert indexer.process_journal_objects({"origin": [{"url": origin_url}]}) == { "status": "failed" } - check_sentry(sentry_events) + check_sentry(sentry_events, {"swh-indexer-origin-url": origin_url}) indexer.catch_exceptions = False diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -283,6 +283,12 @@ assert len(sentry_events) == 1 sentry_event = sentry_events.pop() + assert sentry_event.get("tags") == { + "swh-indexer-origin-head-swhid": ( + "swh:1:rev:179fd041d75edab00feba8e4439897422f3bdfa1" + ), + "swh-indexer-origin-url": origin, + } assert "'TypeError'" in str(sentry_event) dir_id = DIRECTORY2.id @@ -320,6 +326,13 @@ assert len(sentry_events) == 1 sentry_event = sentry_events.pop() + assert sentry_event.get("tags") == { + "swh-indexer-content-sha1": "d8f40c3ca9cc30ddaca25c55b5dff18271ff030e", + "swh-indexer-origin-head-swhid": ( + "swh:1:rev:179fd041d75edab00feba8e4439897422f3bdfa1" + ), + "swh-indexer-origin-url": origin, + } assert ".TestException'" in str(sentry_event), sentry_event dir_id = DIRECTORY2.id