Differential D7727 Diff 28001 swh/loader/core/loader.py

Changeset View

Standalone View

swh/loader/core/loader.py

Show First 20 Lines • Show All 461 Lines • ▼ Show 20 Lines	def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]:
self.log.debug("lister_not provided, skipping extrinsic origin metadata")		self.log.debug("lister_not provided, skipping extrinsic origin metadata")
return []		return []

assert (		assert (
self.lister_instance_name is not None		self.lister_instance_name is not None
), "lister_instance_name is None, but lister_name is not"		), "lister_instance_name is None, but lister_name is not"

metadata = []		metadata = []
for cls in get_fetchers_for_lister(self.lister_name):
		fetcher_classes = get_fetchers_for_lister(self.lister_name)
		statsd.histogram(
		f"{STATSD_PREFIX}.metadata.fetchers",
		len(fetcher_classes),
		tags={"visit_type": self.visit_type},
		olasdUnsubmitted Done Inline Actions For all these histograms, we will want to change the configuration of the statsd exporter for this metric to generate a histogram with relevant buckets. Our default buckets are only relevant for timings and will be really wasteful for this (which will pretty much always return 0 or 1) https://forge.softwareheritage.org/source/puppet-swh-site/browse/production/data/common/common.yaml$3124-3151 https://github.com/prometheus/statsd_exporter#statsd-timers-and-distributions (something like: mappings: - match: whatever this metric name will end up being observer_type: histogram histogram_options: buckets: [0, 1, 2, 3, 4, 5] (you will want to test that in docker as well) olasd: For all these histograms, we will want to change the configuration of the statsd exporter for…
		)
		for cls in fetcher_classes:
metadata_fetcher = cls(		metadata_fetcher = cls(
origin=self.origin,		origin=self.origin,
lister_name=self.lister_name,		lister_name=self.lister_name,
lister_instance_name=self.lister_instance_name,		lister_instance_name=self.lister_instance_name,
credentials=self.metadata_fetcher_credentials,		credentials=self.metadata_fetcher_credentials,
)		)
with self.statsd_timed("fetch_one_metadata"):		with self.statsd_timed(
		"fetch_one_metadata", tags={"fetcher": cls.FETCHER_NAME}
		):
metadata.extend(metadata_fetcher.get_origin_metadata())		metadata.extend(metadata_fetcher.get_origin_metadata())
if self.parent_origins is None:		if self.parent_origins is None:
self.parent_origins = metadata_fetcher.get_parent_origins()		self.parent_origins = metadata_fetcher.get_parent_origins()
		statsd.histogram(
		f"{STATSD_PREFIX}.metadata.parent_origins",
		len(self.parent_origins),
		tags={"fetcher": cls.FETCHER_NAME, "visit_type": self.visit_type},
		)
		statsd.histogram(
		f"{STATSD_PREFIX}.metadata.objects",
		len(metadata),
		tags={"visit_type": self.visit_type},
		olasdUnsubmitted Done Inline Actions Do we really want a histogram, or just a running total? (note that the histogram will also generate a running total) olasd: Do we really want a histogram, or just a running total? (note that the histogram will also…
		vlorentzAuthorUnsubmitted Done Inline Actions I don't know; I'm only interested in the average, and an histogram seemed like the only way to do that (ditto above) vlorentz: I don't know; I'm only interested in the average, and an histogram seemed like the only way to…
		olasdUnsubmitted Done Inline Actions If you just want an average, then you just need to have a pair of metrics: one that is incremented for each operation (and will be shared across all metrics), and one that is incremented by the number of objects you want to count the average of. You can then divide these two to get a running average. The prometheus statsd exporter just generates these series automatically for histograms, so that they can be used with the prometheus histogram functions: there's a `<foo>_count` series for the number of times a value was measured, and a `<foo>_sum` series for the sum of all `<foo>` values mesured, in addition to all the "bucket counts" series (which generate one series for the number of counts lower than a given bucket limit). Either way, in terms of storage, as long as we pinpoint a small set of relevant buckets, it shouldn't matter too much. If you think that we're never going to need the fine-grained metrics (I probably agree), please use counters directly. olasd: If you just want an average, then you just need to have a pair of metrics: one that is…
		vlorentzAuthorUnsubmitted Done Inline Actions Done. I kept `{name}` inside the metric instead of tags; because it doesn't make sense to aggregate them (like in the other diff) IMO vlorentz: Done. I kept `{name}` inside the metric instead of tags; because it doesn't make sense to…
		olasdUnsubmitted Not Done Inline Actions Yeah, that's perfect! olasd: Yeah, that's perfect!
		)

return metadata		return metadata

@contextlib.contextmanager		@contextlib.contextmanager
def statsd_timed(self, name, tags={}):		def statsd_timed(self, name, tags={}):
with statsd.timed(		with statsd.timed(
f"{STATSD_PREFIX}.operation_duration_seconds",		f"{STATSD_PREFIX}.operation_duration_seconds",
tags={"visit_type": self.visit_type, "operation": name, **tags},		tags={"visit_type": self.visit_type, "operation": name, **tags},
▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines