Page MenuHomeSoftware Heritage

No OneTemporary

This document is not UTF8. It was detected as ISO-8859-1 (Latin 1) and converted to UTF8 for display.
diff --git a/swh/graph/luigi.py b/swh/graph/luigi.py
index b162244..a190a80 100644
--- a/swh/graph/luigi.py
+++ b/swh/graph/luigi.py
@@ -1,648 +1,654 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Luigi tasks
===========
This module contains `Luigi <https://luigi.readthedocs.io/>`_ tasks,
as an alternative to the CLI that can be composed with other tasks,
such as swh-dataset's.
Unlike the CLI, this requires the graph to be named `graph`.
File layout
-----------
In addition to files documented in :ref:`graph-compression` (eg. :file:`graph.graph`,
:file:`graph.mph`, ...), tasks in this module produce this directory structure::
swh_<date>[_<flavor>]/
graph.graph
graph.mph
...
meta/
export.json
compression.json
``graph.meta/export.json`` is copied from the ORC dataset exported by
:mod:`swh.dataset.luigi`.
``graph.meta/compression.json`` contains information about the compression itself,
for provenance tracking.
For example:
.. code-block:: json
[
{
"steps": null,
"export_start": "2022-11-08T11:00:54.998799+00:00",
"export_end": "2022-11-08T11:05:53.105519+00:00",
"object_type": [
"origin",
"origin_visit"
],
"hostname": "desktop5",
"conf": {},
"tool": {
"name": "swh.graph",
"version": "2.2.0"
}
}
]
When the compression pipeline is run in separate steps, each of the steps is recorded
as an object in the root list.
S3 layout
---------
As ``.bin`` files are meant to be accessed randomly, they are uncompressed on disk.
However, this is undesirable on at-rest/long-term storage like on S3, because
some are very sparse (eg. :file:`graph.property.committer_timestamp.bin` can be
quickly compressed from 300GB to 1GB).
Therefore, these files are compressed to ``.bin.zst``, and need to be decompressed
when downloading.
The layout is otherwise the same as the file layout.
"""
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
from pathlib import Path
from typing import Dict, List, Tuple
import luigi
from swh.dataset.luigi import Format, LocalExport, ObjectType, S3PathParameter
class CompressGraph(luigi.Task):
    """Compresses a local ORC dataset export into a compressed graph, and writes
    provenance metadata to :file:`meta/export.json` (copied from the export) and
    :file:`meta/compression.json` (describing this compression run)."""

    local_export_path = luigi.PathParameter(significant=False)
    local_graph_path = luigi.PathParameter()
    batch_size = luigi.IntParameter(
        default=0,
        significant=False,
        description="""
        Size of work batches to use while compressing.
        Larger is faster, but consumes more resources.
        """,
    )

    object_types = list(ObjectType)
    # To make this configurable, we could use this:
    #   object_types = luigi.EnumListParameter(
    #       enum=ObjectType, default=list(ObjectType), batch_method=merge_lists
    #   )
    # then use swh.dataset.luigi._export_metadata_has_object_types to check in
    # .meta/export.json that all objects are present before skipping the task

    def requires(self) -> List[luigi.Task]:
        """Returns a :class:`LocalExport` task."""
        return [
            LocalExport(
                local_export_path=self.local_export_path,
                formats=[Format.orc],  # type: ignore[attr-defined]
                object_types=self.object_types,
            )
        ]

    def output(self) -> List[luigi.LocalTarget]:
        """Returns the ``meta/*.json`` targets"""
        return [self._export_meta(), self._compression_meta()]

    def _export_meta(self) -> luigi.Target:
        """Returns the metadata on the dataset export"""
        return luigi.LocalTarget(self.local_graph_path / "meta/export.json")

    def _compression_meta(self) -> luigi.Target:
        """Returns the metadata on the compression pipeline"""
        return luigi.LocalTarget(self.local_graph_path / "meta/compression.json")

    def run(self):
        """Runs the full compression pipeline, then writes :file:`meta/compression.json`

        This does not support running individual steps yet."""
        import datetime
        import json
        import shutil
        import socket

        import pkg_resources

        from swh.graph import webgraph

        conf = {}  # TODO: make this configurable
        steps = None  # TODO: make this configurable
        if self.batch_size:
            conf["batch_size"] = self.batch_size

        # Delete stamps. Otherwise interrupting this compression pipeline may leave
        # stamps from a previous successful compression
        if self._export_meta().exists():
            self._export_meta().remove()
        if self._compression_meta().exists():
            self._compression_meta().remove()
        # Make sure we don't accidentally append to existing files
        if self.local_graph_path.exists():
            shutil.rmtree(self.local_graph_path)

        output_directory = self.local_graph_path
        graph_name = "graph"

        # NOTE(review): progress_cb is defined but not passed to
        # webgraph.compress() below -- confirm whether it should be.
        def progress_cb(percentage: int, step: webgraph.CompressionStep):
            self.set_progress_percentage(percentage)
            self.set_status_message(f"Running {step.name} (step #{step.value})")

        start_date = datetime.datetime.now(tz=datetime.timezone.utc)
        webgraph.compress(
            graph_name,
            self.local_export_path / "orc",
            output_directory,
            steps,
            conf,
        )
        end_date = datetime.datetime.now(tz=datetime.timezone.utc)

        # Copy dataset export metadata
        with self._export_meta().open("w") as write_fd:
            with (self.local_export_path / "meta" / "export.json").open() as read_fd:
                write_fd.write(read_fd.read())

        # Append metadata about this compression pipeline to any pre-existing
        # metadata (eg. from earlier steps of a multi-step compression).
        if self._compression_meta().exists():
            # BUGFIX: this used to open the file in "w" mode, which truncated
            # it before json.load() could read it; open it for reading instead.
            with self._compression_meta().open() as fd:
                meta = json.load(fd)
        else:
            meta = []

        meta.append(
            {
                "steps": steps,
                "compression_start": start_date.isoformat(),
                "compression_end": end_date.isoformat(),
                "object_type": [object_type.name for object_type in self.object_types],
                "hostname": socket.getfqdn(),
                "conf": conf,
                "tool": {
                    "name": "swh.graph",
                    "version": pkg_resources.get_distribution("swh.graph").version,
                },
            }
        )
        with self._compression_meta().open("w") as fd:
            json.dump(meta, fd, indent=4)
class UploadGraphToS3(luigi.Task):
    """Uploads a local compressed graph to S3; creating automatically if it does
    not exist.

    Example invocation::

        luigi --local-scheduler --module swh.graph.luigi UploadGraphToS3 \
            --local-graph-path=graph/ \
            --s3-graph-path=s3://softwareheritage/graph/swh_2022-11-08/compressed/
    """

    local_graph_path = luigi.PathParameter(significant=False)
    s3_graph_path = S3PathParameter()

    def requires(self) -> List[luigi.Task]:
        """Returns a :class:`CompressGraph` task that writes local files at the
        expected location."""
        # CompressGraph's other parameters are insignificant, so luigi fills
        # them from its configuration.
        return [
            CompressGraph(
                local_graph_path=self.local_graph_path,
            )
        ]

    def output(self) -> List[luigi.Target]:
        """Returns stamp and meta paths on S3."""
        return [self._meta()]

    def _meta(self):
        """S3 target for ``meta/compression.json``; uploaded last by :meth:`run`,
        so it doubles as the completion stamp."""
        import luigi.contrib.s3

        return luigi.contrib.s3.S3Target(f"{self.s3_graph_path}/meta/compression.json")

    def run(self) -> None:
        """Copies all files: first the graph itself, then :file:`meta/compression.json`."""
        import subprocess
        import tempfile

        import luigi.contrib.s3
        import tqdm

        compression_metadata_path = self.local_graph_path / "meta" / "compression.json"
        seen_compression_metadata = False

        client = luigi.contrib.s3.S3Client()

        # recursively copy local files to S3, and end with compression metadata
        paths = list(self.local_graph_path.glob("**/*"))
        for (i, path) in tqdm.tqdm(
            list(enumerate(paths)),
            desc="Uploading compressed graph",
        ):
            if path == compression_metadata_path:
                # Write it last
                seen_compression_metadata = True
                continue
            if path.is_dir():
                continue
            relative_path = path.relative_to(self.local_graph_path)
            self.set_progress_percentage(int(i * 100 / len(paths)))
            if path.suffix == ".bin":
                # Large sparse file; store it compressed on S3.
                with tempfile.NamedTemporaryFile(
                    prefix=path.stem, suffix=".bin.zst"
                ) as fd:
                    self.set_status_message(f"Compressing {relative_path}")
                    # --keep preserves the local uncompressed copy
                    subprocess.run(
                        ["zstdmt", "--force", "--keep", path, "-o", fd.name], check=True
                    )
                    self.set_status_message(f"Uploading {relative_path} (compressed)")
                    client.put_multipart(
                        fd.name,
                        f"{self.s3_graph_path}/{relative_path}.zst",
                        ACL="public-read",
                    )
            else:
                self.set_status_message(f"Uploading {relative_path}")
                client.put_multipart(
                    path, f"{self.s3_graph_path}/{relative_path}", ACL="public-read"
                )

        assert (
            seen_compression_metadata
        ), "did not see meta/compression.json in directory listing"

        # Write it last, to act as a stamp
        client.put(
            compression_metadata_path,
            self._meta().path,
            ACL="public-read",
        )
class DownloadGraphFromS3(luigi.Task):
    """Downloads a local dataset graph from S3.

    This performs the inverse operation of :class:`UploadGraphToS3`

    Example invocation::

        luigi --local-scheduler --module swh.graph.luigi DownloadGraphFromS3 \
            --local-graph-path=graph/ \
            --s3-graph-path=s3://softwareheritage/graph/swh_2022-11-08/compressed/
    """

    local_graph_path = luigi.PathParameter()
    s3_graph_path = S3PathParameter(significant=False)

    def requires(self) -> List[luigi.Task]:
        """Returns a :class:`UploadGraphToS3` task that writes the files to S3
        at the expected location."""
        return [
            UploadGraphToS3(
                local_graph_path=self.local_graph_path,
                s3_graph_path=self.s3_graph_path,
            )
        ]

    def output(self) -> List[luigi.Target]:
        """Returns stamp and meta paths on the local filesystem."""
        return [self._meta()]

    def _meta(self):
        """Local target for ``meta/compression.json``; written last by
        :meth:`run`, so it doubles as the completion stamp."""
        # BUGFIX: this used to return meta/export.json, but run() downloads the
        # *compression* metadata into self._meta().path as its final stamp,
        # which clobbered export.json. meta/compression.json is also what
        # LocalGraph._meta() checks.
        return luigi.LocalTarget(self.local_graph_path / "meta" / "compression.json")

    def run(self) -> None:
        """Copies all files: first the graph itself, then :file:`meta/compression.json`."""
        import subprocess
        import tempfile

        import luigi.contrib.s3
        import tqdm

        client = luigi.contrib.s3.S3Client()

        # Keys yielded by S3Client.list() are relative to s3_graph_path (they
        # are re-joined with it before each get() below), so the metadata file
        # must be matched against its *relative* path.
        # BUGFIX: this used to be the absolute S3 URL, which cannot match the
        # relative keys iterated here.
        compression_metadata_path = "meta/compression.json"
        seen_compression_metadata = False

        # recursively copy files from S3, and end with compression metadata
        files = list(client.list(self.s3_graph_path))
        for (i, file_) in tqdm.tqdm(
            list(enumerate(files)),
            desc="Downloading",
        ):
            if file_ == compression_metadata_path:
                # Will copy it last
                seen_compression_metadata = True
                continue
            self.set_progress_percentage(int(i * 100 / len(files)))
            local_path = self.local_graph_path / file_
            local_path.parent.mkdir(parents=True, exist_ok=True)
            if file_.endswith(".bin.zst"):
                # The file was compressed before uploading to S3, we need it
                # to be decompressed locally
                with tempfile.NamedTemporaryFile(
                    prefix=local_path.stem, suffix=".bin.zst"
                ) as fd:
                    self.set_status_message(f"Downloading {file_} (compressed)")
                    client.get(
                        f"{self.s3_graph_path}/{file_}",
                        fd.name,
                    )
                    self.set_status_message(f"Decompressing {file_}")
                    subprocess.run(
                        [
                            "zstdmt",
                            "--force",
                            "-d",
                            fd.name,
                            "-o",
                            # strip the trailing ".zst" to get the ".bin" path
                            str(local_path)[0:-4],
                        ],
                        check=True,
                    )
            else:
                self.set_status_message(f"Downloading {file_}")
                client.get(
                    f"{self.s3_graph_path}/{file_}",
                    str(local_path),
                )

        assert (
            seen_compression_metadata
        ), "did not see meta/compression.json in directory listing"

        # Write it last, to act as a stamp
        client.get(
            f"{self.s3_graph_path}/{compression_metadata_path}",
            self._meta().path,
        )
class LocalGraph(luigi.Task):
    """Task that depends on a local dataset being present -- either directly from
    :class:`ExportGraph` or via :class:`DownloadGraphFromS3`.
    """

    local_graph_path = luigi.PathParameter()
    compression_task_type = luigi.TaskParameter(
        default=DownloadGraphFromS3,
        significant=False,
        description="""The task used to get the compressed graph if it is not present.
        Should be either ``swh.graph.luigi.CompressGraph`` or
        ``swh.graph.luigi.DownloadGraphFromS3``.""",
    )

    def requires(self) -> List[luigi.Task]:
        """Returns an instance of either :class:`CompressGraph` or
        :class:`DownloadGraphFromS3` depending on the value of
        :attr:`compression_task_type`."""
        task_type = self.compression_task_type
        # Dispatch on the (sub)class of the configured task type.
        if issubclass(task_type, CompressGraph):
            dependency = CompressGraph(local_graph_path=self.local_graph_path)
        elif issubclass(task_type, DownloadGraphFromS3):
            dependency = DownloadGraphFromS3(local_graph_path=self.local_graph_path)
        else:
            raise ValueError(
                f"Unexpected compression_task_type: {task_type.__name__}"
            )
        return [dependency]

    def output(self) -> List[luigi.Target]:
        """Returns stamp and meta paths on the local filesystem."""
        return [self._meta()]

    def _meta(self):
        """Local target for the compression metadata/stamp file."""
        return luigi.LocalTarget(self.local_graph_path / "meta" / "compression.json")
def _run_script(script: str, output_path: Path) -> None:
    """Runs ``script`` with bash, redirecting its stdout to ``output_path``.

    The output is written to a ``.tmp`` sibling first, then renamed into place,
    so ``output_path`` only ever appears complete."""
    import os
    import subprocess

    from .config import check_config

    conf: Dict = {}  # TODO: configurable
    conf = check_config(conf)

    # Run with the Java options and classpath from the checked configuration.
    env = dict(os.environ)
    env["JAVA_TOOL_OPTIONS"] = conf["java_tool_options"]
    env["CLASSPATH"] = conf["classpath"]

    tmp_output_path = Path(f"{output_path}.tmp")

    command = f"{script.strip()} > {tmp_output_path}"
    subprocess.run(["bash", "-c", command], env=env, check=True)

    # Atomically write the output file
    tmp_output_path.replace(output_path)
class TopoSort(luigi.Task):
    """Creates a file that contains all SWHIDs in topological order from a compressed
    graph."""

    local_graph_path = luigi.PathParameter()
    topological_order_path = luigi.PathParameter()
    graph_name = luigi.Parameter(default="graph")

    def requires(self) -> List[luigi.Task]:
        """Returns an instance of :class:`LocalGraph`."""
        return [LocalGraph(local_graph_path=self.local_graph_path)]

    def output(self) -> luigi.Target:
        """.csv.zst file that contains the topological order."""
        return luigi.LocalTarget(self.topological_order_path)

    def run(self) -> None:
        """Runs org.softwareheritage.graph.utils.TopoSort and compresses"""
        # Restrict the traversal to the "history" part of the graph
        object_types = "rev,rel,snp,ori"
        class_name = "org.softwareheritage.graph.utils.TopoSort"
        # pv reports line-based progress; zstdmt compresses the result.
        script = f"""
        java {class_name} '{self.local_graph_path}/{self.graph_name}' '{object_types}' \
            | pv --line-mode --wait \
            | zstdmt -19
        """
        _run_script(script, self.topological_order_path)
class ListOriginContributors(luigi.Task):
    """Creates a file that lists the ids of the persons who contributed to each
    origin, from a compressed graph and its topological order."""

    local_graph_path = luigi.PathParameter()
    topological_order_path = luigi.PathParameter()
    origin_contributors_path = luigi.PathParameter()
    graph_name = luigi.Parameter(default="graph")

    def requires(self) -> List[luigi.Task]:
        """Returns an instance of :class:`LocalGraph` and :class:`TopoSort`."""
        return [
            LocalGraph(local_graph_path=self.local_graph_path),
            TopoSort(
                local_graph_path=self.local_graph_path,
                topological_order_path=self.topological_order_path,
                graph_name=self.graph_name,
            ),
        ]

    def output(self) -> luigi.Target:
        """.csv.zst file that contains the origin contributors."""
        return luigi.LocalTarget(self.origin_contributors_path)

    def run(self) -> None:
        """Runs org.softwareheritage.graph.utils.ListOriginContributors and
        compresses its output."""
        class_name = "org.softwareheritage.graph.utils.ListOriginContributors"
        # Feed the topological order on stdin; pv reports line-based progress,
        # zstdmt compresses the result.
        script = f"""
        zstdcat {self.topological_order_path} \
            | java {class_name} '{self.local_graph_path}/{self.graph_name}' \
            | pv --line-mode --wait \
            | zstdmt -19
        """
        _run_script(script, self.origin_contributors_path)
class ExportDeanonymizationTable(luigi.Task):
    """Exports (from swh-storage) a .csv.zst file that contains the columns:
    ``base64(sha256(full_name))``, ``base64(full_name)``, and ``escape(full_name)``.

    The first column is the anonymized full name found in :file:`graph.persons.csv.zst`
    in the compressed graph, and the latter two are the original name."""

    storage_dsn = luigi.Parameter(
        default="service=swh",
        description="postgresql DSN of the swh-storage database to read from.",
    )
    deanonymization_table_path = luigi.PathParameter()

    def output(self) -> luigi.Target:
        """.csv.zst file that contains the table."""
        return luigi.LocalTarget(self.deanonymization_table_path)

    def run(self) -> None:
        """Runs a postgresql query to compute the table."""
        # NOTE(review): digest() presumably requires the pgcrypto extension on
        # the swh-storage database -- confirm.
        _run_script(
            f"""
            psql '{self.storage_dsn}' -c "COPY (select encode(digest(fullname, 'sha256'), 'base64') as sha256_base64, encode(fullname, 'base64') as base64, encode(fullname, 'escape') as escaped from person) TO STDOUT CSV HEADER" | zstdmt -19
            """,  # noqa
            self.deanonymization_table_path,
        )
class DeanonymizeOriginContributors(luigi.Task):
    """Generates a .csv.zst file similar to :class:`ListOriginContributors`'s,
    but with ``person_base64`` and ``person_escaped`` columns in addition to
    ``person_id``.

    This assumes that :file:`graph.persons.csv.zst` is anonymized (SHA256 of names
    instead of names); which may not be true depending on how the swh-dataset export
    was configured.
    """

    local_graph_path = luigi.PathParameter()
    graph_name = luigi.Parameter(default="graph")
    origin_contributors_path = luigi.PathParameter()
    deanonymization_table_path = luigi.PathParameter()
    deanonymized_origin_contributors_path = luigi.PathParameter()

    def requires(self) -> List[luigi.Task]:
        """Returns instances of :class:`LocalGraph`, :class:`ListOriginContributors`,
        and :class:`ExportDeanonymizationTable`."""
        return [
            LocalGraph(local_graph_path=self.local_graph_path),
            ListOriginContributors(
                local_graph_path=self.local_graph_path,
                origin_contributors_path=self.origin_contributors_path,
            ),
            ExportDeanonymizationTable(
                deanonymization_table_path=self.deanonymization_table_path,
            ),
        ]

    def output(self) -> luigi.Target:
        """.csv.zst file similar to :meth:`ListOriginContributors.output`'s,
        but with ``person_base64`` and ``person_escaped`` columns in addition to
        ``person_id``"""
        return luigi.LocalTarget(self.deanonymized_origin_contributors_path)

    def run(self) -> None:
        """Loads the list of persons (``graph.persons.csv.zst`` in the graph dataset)
        and the deanonymization table in memory, then uses them to map each row
        in the original (anonymized) contributors list to the deanonymized one."""
        # TODO: .persons.csv.zst may be already deanonymized (if the swh-dataset export
        # was configured to do so); this should add support for it.
        import base64
        import csv

        import pyzstd

        # Load the deanonymization table, to map sha256(name) to base64(name)
        # and escape(name)
        sha256_to_names: Dict[bytes, Tuple[bytes, str]] = {}
        with pyzstd.open(self.deanonymization_table_path, "rt") as fd:
            csv_reader = csv.reader(fd)
            header = next(csv_reader)
            assert header == ["sha256_base64", "base64", "escaped"], header
            for line in csv_reader:
                (base64_sha256_name, base64_name, escaped_name) = line
                sha256_name = base64.b64decode(base64_sha256_name)
                name = base64.b64decode(base64_name)
                sha256_to_names[sha256_name] = (name, escaped_name)

        # Combine with the list of sha256(name), to get the list of base64(name)
        # and escape(name)
        # NOTE(review): .pop() consumes each table entry as it is matched
        # (presumably to free memory); a person hash appearing twice in the
        # persons file would fall back to (b"", "") the second time -- confirm
        # that persons are unique.
        persons_path = self.local_graph_path / f"{self.graph_name}.persons.csv.zst"
        with pyzstd.open(persons_path, "rb") as fd:
            person_id_to_names: List[Tuple[bytes, str]] = [
                sha256_to_names.pop(base64.b64decode(line.strip()), (b"", ""))
                for line in fd
            ]

        tmp_output_path = Path(f"{self.deanonymized_origin_contributors_path}.tmp")

        # Finally, write a new table of origin_contributors, by reading the anonymized
        # table line-by-line and deanonymizing each id
        # Open temporary output for writes as CSV
        with pyzstd.open(tmp_output_path, "wt") as output_fd:
            csv_writer = csv.writer(output_fd, lineterminator="\n")
            # write header
            csv_writer.writerow(("origin_SWHID", "person_base64", "person_escaped"))

            # Open input for reads as CSV
            with pyzstd.open(self.origin_contributors_path, "rt") as input_fd:
                csv_reader = csv.reader(input_fd)
                header = next(csv_reader)
                assert header == ["origin_SWHID", "person_id"], header
                for (origin_swhid, person_id) in csv_reader:
                    if person_id == "null":
                        # FIXME: workaround for a bug in contribution graphs generated
                        # before 2022-12-01. Those were only used in tests and never
                        # published, so the conditional can be removed when this is
                        # productionized
                        continue
                    (name, escaped_name) = person_id_to_names[int(person_id)]
                    base64_name = base64.b64encode(name).decode("ascii")
                    csv_writer.writerow((origin_swhid, base64_name, escaped_name))

        # Atomically move the finished file into place.
        tmp_output_path.replace(self.deanonymized_origin_contributors_path)
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labelobl b/swh/graph/tests/dataset/compressed/example-labelled.labelobl
new file mode 100644
index 0000000..d4a6621
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example-labelled.labelobl differ
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets b/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets
index fbb7a5a..a87f20a 100644
--- a/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets
+++ b/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets
@@ -1,2 +1 @@
-„í
-Âpæ)í š
\ No newline at end of file
+ úh*…¸~±tÐV
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labels b/swh/graph/tests/dataset/compressed/example-labelled.labels
index 1b876ec..935dd46 100644
--- a/swh/graph/tests/dataset/compressed/example-labelled.labels
+++ b/swh/graph/tests/dataset/compressed/example-labelled.labels
@@ -1 +1 @@
-D¤º%B](P(iõ‚¢
\ No newline at end of file
+§ BaéÂQ@RB@RiПD
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.properties b/swh/graph/tests/dataset/compressed/example-labelled.properties
index 4f4c55a..4c6856d 100644
--- a/swh/graph/tests/dataset/compressed/example-labelled.properties
+++ b/swh/graph/tests/dataset/compressed/example-labelled.properties
@@ -1,3 +1,3 @@
graphclass = it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph
-labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,6)
+labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,7)
underlyinggraph = example
diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labelobl b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labelobl
new file mode 100644
index 0000000..b734d0d
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labelobl differ
diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets
index 7726435..603a32f 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets
+++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets
@@ -1,2 +1 @@
- šB•!B…
-(P¡‚”
\ No newline at end of file
+Ô.I ,*0Z…‹èX
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels
index 9448e72..9375cc7 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels
+++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels
@@ -1,2 +1 @@
- P:¢RH•
-jºP u‚¢
\ No newline at end of file
+§â”%!P£I ¢HJaА
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties b/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties
index 5ee584a..da8e63b 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties
+++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties
@@ -1,3 +1,3 @@
graphclass = it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph
-labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,6)
+labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,7)
underlyinggraph = example-transposed
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.graph b/swh/graph/tests/dataset/compressed/example-transposed.graph
index d8cbf2b..1a5e5be 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed.graph
+++ b/swh/graph/tests/dataset/compressed/example-transposed.graph
@@ -1 +1 @@
-^®—t5Òízèí ½®Ö…zºZá:¨»]À
\ No newline at end of file
+®¥òâ7ö/—Ë¥Úý:ÕÒõt´+Fº[#ê«ÅÐ
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.obl b/swh/graph/tests/dataset/compressed/example-transposed.obl
index 7ad141b..989e9e9 100644
Binary files a/swh/graph/tests/dataset/compressed/example-transposed.obl and b/swh/graph/tests/dataset/compressed/example-transposed.obl differ
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.offsets b/swh/graph/tests/dataset/compressed/example-transposed.offsets
index b3044db..bcd313c 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed.offsets
+++ b/swh/graph/tests/dataset/compressed/example-transposed.offsets
@@ -1 +1,2 @@
-Š) (P€ˆ‘‚†Š8&(R
\ No newline at end of file
+¡H48P¡B…E
+4h¡Ã¡@
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.properties b/swh/graph/tests/dataset/compressed/example-transposed.properties
index 96fcfba..9f09c32 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed.properties
+++ b/swh/graph/tests/dataset/compressed/example-transposed.properties
@@ -1,35 +1,35 @@
#BVGraph properties
-#Wed Mar 30 17:33:29 CEST 2022
-bitsforreferences=28
-avgbitsforintervals=0.762
+#Thu Dec 01 10:50:01 CET 2022
+bitsforreferences=31
+avgbitsforintervals=0.833
graphclass=it.unimi.dsi.big.webgraph.BVGraph
-avgdist=0.429
-successoravggap=4.261
-residualexpstats=5,8,3,2,1
-arcs=23
+avgdist=0.417
+successoravggap=6.518
+residualexpstats=8,5,8,3,0,1
+arcs=28
minintervallength=4
-bitsforoutdegrees=61
-residualavgloggap=2.076977934449935
-avgbitsforoutdegrees=2.905
-bitsforresiduals=85
-successoravgloggap=1.9987119736846723
+bitsforoutdegrees=68
+residualavgloggap=2.2068709506771227
+avgbitsforoutdegrees=2.833
+bitsforresiduals=115
+successoravgloggap=2.3010835643149283
maxrefcount=3
-successorexpstats=7,9,4,2,1
-residualarcs=19
-avgbitsforresiduals=4.048
-avgbitsforblocks=0.19
+successorexpstats=9,5,8,4,1,1
+residualarcs=25
+avgbitsforresiduals=4.792
+avgbitsforblocks=0.125
windowsize=7
-residualavggap=4.632
-copiedarcs=4
-avgbitsforreferences=1.333
+residualavggap=5.860
+copiedarcs=3
+avgbitsforreferences=1.292
version=0
-compratio=1.53
-bitsperlink=8.435
+compratio=1.501
+bitsperlink=8.464
compressionflags=
-nodes=21
-avgref=0.238
+nodes=24
+avgref=0.125
zetak=3
-bitsforintervals=16
+bitsforintervals=20
intervalisedarcs=0
-bitspernode=9.238
-bitsforblocks=4
+bitspernode=9.875
+bitsforblocks=3
diff --git a/swh/graph/tests/dataset/compressed/example.edges.count.txt b/swh/graph/tests/dataset/compressed/example.edges.count.txt
index 4099407..9902f17 100644
--- a/swh/graph/tests/dataset/compressed/example.edges.count.txt
+++ b/swh/graph/tests/dataset/compressed/example.edges.count.txt
@@ -1 +1 @@
-23
+28
diff --git a/swh/graph/tests/dataset/compressed/example.edges.stats.txt b/swh/graph/tests/dataset/compressed/example.edges.stats.txt
index c9b8ac7..d01dba6 100644
--- a/swh/graph/tests/dataset/compressed/example.edges.stats.txt
+++ b/swh/graph/tests/dataset/compressed/example.edges.stats.txt
@@ -1,8 +1,8 @@
dir:cnt 8
dir:dir 3
-ori:snp 1
-rel:rev 2
+ori:snp 2
+rel:rev 3
rev:dir 4
rev:rev 3
-snp:rel 1
-snp:rev 1
+snp:rel 3
+snp:rev 2
diff --git a/swh/graph/tests/dataset/compressed/example.graph b/swh/graph/tests/dataset/compressed/example.graph
index d99357d..3d76ab1 100644
--- a/swh/graph/tests/dataset/compressed/example.graph
+++ b/swh/graph/tests/dataset/compressed/example.graph
@@ -1 +1 @@
-}Ýø º]望˚étô]~[Ô1tޗ@
\ No newline at end of file
+]“؝~ÿåÓz]­ õY>ª¿.õ¤kºíè9Ñt
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example.indegree b/swh/graph/tests/dataset/compressed/example.indegree
index 1ea373e..87022a8 100644
--- a/swh/graph/tests/dataset/compressed/example.indegree
+++ b/swh/graph/tests/dataset/compressed/example.indegree
@@ -1,4 +1,5 @@
-2
+3
16
-2
+4
+0
1
diff --git a/swh/graph/tests/dataset/compressed/example.labels.count.txt b/swh/graph/tests/dataset/compressed/example.labels.count.txt
index 45a4fb7..ec63514 100644
--- a/swh/graph/tests/dataset/compressed/example.labels.count.txt
+++ b/swh/graph/tests/dataset/compressed/example.labels.count.txt
@@ -1 +1 @@
-8
+9
diff --git a/swh/graph/tests/dataset/compressed/example.labels.csv.zst b/swh/graph/tests/dataset/compressed/example.labels.csv.zst
index 1cc8931..50209e7 100644
Binary files a/swh/graph/tests/dataset/compressed/example.labels.csv.zst and b/swh/graph/tests/dataset/compressed/example.labels.csv.zst differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray b/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray
index 01451e0..139b2bc 100644
Binary files a/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray and b/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers b/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers
index 755c4c7..7acff4e 100644
Binary files a/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers and b/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.properties b/swh/graph/tests/dataset/compressed/example.labels.fcl.properties
index deeac3a..e2f298d 100644
--- a/swh/graph/tests/dataset/compressed/example.labels.fcl.properties
+++ b/swh/graph/tests/dataset/compressed/example.labels.fcl.properties
@@ -1,2 +1,2 @@
-n=8
+n=9
ratio=4
diff --git a/swh/graph/tests/dataset/compressed/example.labels.mph b/swh/graph/tests/dataset/compressed/example.labels.mph
index e417aec..60d1007 100644
Binary files a/swh/graph/tests/dataset/compressed/example.labels.mph and b/swh/graph/tests/dataset/compressed/example.labels.mph differ
diff --git a/swh/graph/tests/dataset/compressed/example.mph b/swh/graph/tests/dataset/compressed/example.mph
index f696b19..136bedf 100644
Binary files a/swh/graph/tests/dataset/compressed/example.mph and b/swh/graph/tests/dataset/compressed/example.mph differ
diff --git a/swh/graph/tests/dataset/compressed/example.node2swhid.bin b/swh/graph/tests/dataset/compressed/example.node2swhid.bin
index e86dae4..109a1ac 100644
Binary files a/swh/graph/tests/dataset/compressed/example.node2swhid.bin and b/swh/graph/tests/dataset/compressed/example.node2swhid.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.node2type.map b/swh/graph/tests/dataset/compressed/example.node2type.map
index 1a5b7a7..0a84a00 100644
Binary files a/swh/graph/tests/dataset/compressed/example.node2type.map and b/swh/graph/tests/dataset/compressed/example.node2type.map differ
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.count.txt b/swh/graph/tests/dataset/compressed/example.nodes.count.txt
index aabe6ec..a45fd52 100644
--- a/swh/graph/tests/dataset/compressed/example.nodes.count.txt
+++ b/swh/graph/tests/dataset/compressed/example.nodes.count.txt
@@ -1 +1 @@
-21
+24
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.csv.zst b/swh/graph/tests/dataset/compressed/example.nodes.csv.zst
index 0559f37..24ba056 100644
Binary files a/swh/graph/tests/dataset/compressed/example.nodes.csv.zst and b/swh/graph/tests/dataset/compressed/example.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.stats.txt b/swh/graph/tests/dataset/compressed/example.nodes.stats.txt
index 097e698..704e51e 100644
--- a/swh/graph/tests/dataset/compressed/example.nodes.stats.txt
+++ b/swh/graph/tests/dataset/compressed/example.nodes.stats.txt
@@ -1,6 +1,6 @@
cnt 7
dir 6
-ori 1
-rel 2
+ori 2
+rel 3
rev 4
-snp 1
+snp 2
diff --git a/swh/graph/tests/dataset/compressed/example.obl b/swh/graph/tests/dataset/compressed/example.obl
index 8538d49..36e5d26 100644
Binary files a/swh/graph/tests/dataset/compressed/example.obl and b/swh/graph/tests/dataset/compressed/example.obl differ
diff --git a/swh/graph/tests/dataset/compressed/example.offsets b/swh/graph/tests/dataset/compressed/example.offsets
index 1249e27..4e0eab1 100644
--- a/swh/graph/tests/dataset/compressed/example.offsets
+++ b/swh/graph/tests/dataset/compressed/example.offsets
@@ -1,2 +1,2 @@
-ŽA!B’‚i
-C‚B†
+(`¡¨rAD9E!A
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example.order b/swh/graph/tests/dataset/compressed/example.order
index ff64db4..8d38627 100644
Binary files a/swh/graph/tests/dataset/compressed/example.order and b/swh/graph/tests/dataset/compressed/example.order differ
diff --git a/swh/graph/tests/dataset/compressed/example.outdegree b/swh/graph/tests/dataset/compressed/example.outdegree
index 5b8a720..7533d27 100644
--- a/swh/graph/tests/dataset/compressed/example.outdegree
+++ b/swh/graph/tests/dataset/compressed/example.outdegree
@@ -1,4 +1,4 @@
7
-6
+8
7
-1
+2
diff --git a/swh/graph/tests/dataset/compressed/example.persons.mph b/swh/graph/tests/dataset/compressed/example.persons.mph
index 6787503..1d9d8b6 100644
Binary files a/swh/graph/tests/dataset/compressed/example.persons.mph and b/swh/graph/tests/dataset/compressed/example.persons.mph differ
diff --git a/swh/graph/tests/dataset/compressed/example.properties b/swh/graph/tests/dataset/compressed/example.properties
index 11d426e..75581f6 100644
--- a/swh/graph/tests/dataset/compressed/example.properties
+++ b/swh/graph/tests/dataset/compressed/example.properties
@@ -1,35 +1,35 @@
#BVGraph properties
-#Wed Mar 30 17:33:28 CEST 2022
-bitsforreferences=15
+#Thu Dec 01 10:50:00 CET 2022
+bitsforreferences=20
avgbitsforintervals=0.667
graphclass=it.unimi.dsi.big.webgraph.BVGraph
-avgdist=0.048
-successoravggap=3.935
-residualexpstats=8,9,2,2,1
-arcs=23
+avgdist=0.125
+successoravggap=5.125
+residualexpstats=4,9,8,4,1
+arcs=28
minintervallength=4
-bitsforoutdegrees=51
-residualavgloggap=1.8895225435666037
-avgbitsforoutdegrees=2.429
-bitsforresiduals=98
-successoravgloggap=1.8859500382836039
+bitsforoutdegrees=62
+residualavgloggap=2.3484556402638956
+avgbitsforoutdegrees=2.583
+bitsforresiduals=122
+successoravgloggap=2.280971484604246
maxrefcount=3
-successorexpstats=8,10,2,2,1
-residualarcs=22
-avgbitsforresiduals=4.667
-avgbitsforblocks=0.048
+successorexpstats=5,10,8,4,1
+residualarcs=26
+avgbitsforresiduals=5.083
+avgbitsforblocks=0.083
windowsize=7
-residualavggap=4.000
-copiedarcs=1
-avgbitsforreferences=0.714
+residualavggap=5.385
+copiedarcs=2
+avgbitsforreferences=0.833
version=0
-compratio=1.412
-bitsperlink=7.783
+compratio=1.406
+bitsperlink=7.929
compressionflags=
-nodes=21
-avgref=0.048
+nodes=24
+avgref=0.083
zetak=3
-bitsforintervals=14
+bitsforintervals=16
intervalisedarcs=0
-bitspernode=8.524
-bitsforblocks=1
+bitspernode=9.25
+bitsforblocks=2
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_id.bin b/swh/graph/tests/dataset/compressed/example.property.author_id.bin
index 7072382..49fa8cf 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.author_id.bin and b/swh/graph/tests/dataset/compressed/example.property.author_id.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin b/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin
index 18ae5fa..fcae808 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin and b/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin b/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin
index ab8222e..d84999f 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin and b/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_id.bin b/swh/graph/tests/dataset/compressed/example.property.committer_id.bin
index 693c904..257d0c3 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.committer_id.bin and b/swh/graph/tests/dataset/compressed/example.property.committer_id.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin
index 4c00061..81899c6 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin and b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin
index 9c4f149..7ce5005 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin and b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin b/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin
index 274f279..08bd265 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin and b/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.content.length.bin b/swh/graph/tests/dataset/compressed/example.property.content.length.bin
index 4848e0e..b6881ea 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.content.length.bin and b/swh/graph/tests/dataset/compressed/example.property.content.length.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.message.bin b/swh/graph/tests/dataset/compressed/example.property.message.bin
index 5d50ccf..76b5371 100644
--- a/swh/graph/tests/dataset/compressed/example.property.message.bin
+++ b/swh/graph/tests/dataset/compressed/example.property.message.bin
@@ -1,7 +1,9 @@
VmVyc2lvbiAxLjA=
VmVyc2lvbiAyLjA=
+VmVyc2lvbiAyLjAgYnV0IHdpdGggbm8gYXV0aG9y
SW5pdGlhbCBjb21taXQ=
QWRkIHBhcnNlcg==
QWRkIHRlc3Rz
UmVmYWN0b3IgY29kZWJhc2U=
aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg=
+aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGgy
diff --git a/swh/graph/tests/dataset/compressed/example.property.message.offset.bin b/swh/graph/tests/dataset/compressed/example.property.message.offset.bin
index a452a83..f92396f 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.message.offset.bin and b/swh/graph/tests/dataset/compressed/example.property.message.offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.tag_name.bin b/swh/graph/tests/dataset/compressed/example.property.tag_name.bin
index ba37d43..70cc465 100644
--- a/swh/graph/tests/dataset/compressed/example.property.tag_name.bin
+++ b/swh/graph/tests/dataset/compressed/example.property.tag_name.bin
@@ -1,2 +1,3 @@
djEuMA==
djIuMA==
+djIuMC1hbm9ueW1vdXM=
diff --git a/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin b/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin
index f6f589d..784d82d 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin and b/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.stats b/swh/graph/tests/dataset/compressed/example.stats
index 541f39a..e66eab4 100644
--- a/swh/graph/tests/dataset/compressed/example.stats
+++ b/swh/graph/tests/dataset/compressed/example.stats
@@ -1,20 +1,20 @@
-nodes=21
-arcs=23
+nodes=24
+arcs=28
loops=0
-successoravggap=4.588
-avglocality=2.522
+successoravggap=5.900
+avglocality=3.143
minoutdegree=0
maxoutdegree=3
-minoutdegreenode=1
-maxoutdegreenode=9
+minoutdegreenode=8
+maxoutdegreenode=1
dangling=7
terminal=7
-percdangling=33.333333333333336
-avgoutdegree=1.0952380952380953
-successorlogdeltastats=13,5,3,2
-successoravglogdelta=0.814
+percdangling=29.166666666666668
+avgoutdegree=1.1666666666666667
+successorlogdeltastats=13,9,3,2,1
+successoravglogdelta=0.880
minindegree=0
-maxindegree=3
-minindegreenode=20
-maxindegreenode=17
-avgindegree=1.0952380952380953
+maxindegree=4
+minindegreenode=21
+maxindegreenode=3
+avgindegree=1.1666666666666667
diff --git a/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst
index 11bf2e2..b604c27 100644
Binary files a/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst and b/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst
index 850e058..fb13661 100644
Binary files a/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst and b/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst
index 59b5b0e..c071723 100644
Binary files a/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst and b/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst
index 11bfce7..b85a6a2 100644
Binary files a/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst and b/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst
index 97db59f..4fc1630 100644
Binary files a/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst and b/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst
index 5cd8295..b6c660a 100644
Binary files a/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst and b/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/generate_dataset.py b/swh/graph/tests/dataset/generate_dataset.py
index c6abc00..9ff900c 100755
--- a/swh/graph/tests/dataset/generate_dataset.py
+++ b/swh/graph/tests/dataset/generate_dataset.py
@@ -1,358 +1,402 @@
#!/usr/bin/env python3
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# type: ignore
import argparse
import datetime
import logging
from pathlib import Path
import shutil
from swh.dataset.exporters.edges import GraphEdgesExporter
from swh.dataset.exporters.orc import ORCExporter
from swh.graph.webgraph import compress
from swh.model.model import (
Content,
Directory,
DirectoryEntry,
ObjectType,
Origin,
OriginVisit,
OriginVisitStatus,
Person,
Release,
Revision,
RevisionType,
SkippedContent,
Snapshot,
SnapshotBranch,
TargetType,
Timestamp,
TimestampWithTimezone,
)
def h(id: int, width=40) -> bytes:
return bytes.fromhex(f"{id:0{width}}")
PERSONS = [
Person(fullname=b"foo", name=b"foo", email=b""),
Person(fullname=b"bar", name=b"bar", email=b""),
Person(fullname=b"baz", name=b"baz", email=b""),
]
TEST_DATASET = [
Content(sha1_git=h(1), sha1=h(1), sha256=h(1, 64), blake2s256=h(1, 64), length=42),
Content(sha1_git=h(4), sha1=h(4), sha256=h(4, 64), blake2s256=h(4, 64), length=404),
Content(
sha1_git=h(5), sha1=h(5), sha256=h(5, 64), blake2s256=h(5, 64), length=1337
),
Content(sha1_git=h(7), sha1=h(7), sha256=h(7, 64), blake2s256=h(7, 64), length=666),
Content(
sha1_git=h(11), sha1=h(11), sha256=h(11, 64), blake2s256=h(11, 64), length=313
),
Content(
sha1_git=h(14), sha1=h(14), sha256=h(14, 64), blake2s256=h(14, 64), length=14
),
SkippedContent(
sha1_git=h(15),
sha1=h(15),
sha256=h(15, 64),
blake2s256=h(15, 64),
length=404,
status="absent",
reason="Not found",
),
Directory(
id=h(2),
entries=(
DirectoryEntry(
name=b"README.md",
perms=0o100644,
type="file",
target=h(1),
),
),
),
Directory(
id=h(6),
entries=(
DirectoryEntry(
name=b"README.md",
perms=0o100644,
type="file",
target=h(4),
),
DirectoryEntry(
name=b"parser.c",
perms=0o100644,
type="file",
target=h(5),
),
),
),
Directory(
id=h(8),
entries=(
DirectoryEntry(
name=b"README.md",
perms=0o100644,
type="file",
target=h(1),
),
DirectoryEntry(
name=b"parser.c",
perms=0o100644,
type="file",
target=h(7),
),
DirectoryEntry(
name=b"tests",
perms=0o100755,
type="dir",
target=h(6),
),
),
),
Directory(
id=h(12),
entries=(
DirectoryEntry(
name=b"README.md",
perms=0o100644,
type="file",
target=h(11),
),
DirectoryEntry(
name=b"oldproject",
perms=0o100755,
type="dir",
target=h(8),
),
),
),
Directory(
id=h(16),
entries=(
DirectoryEntry(
name=b"TODO.txt",
perms=0o100644,
type="file",
target=h(15),
),
),
),
Directory(
id=h(17),
entries=(
DirectoryEntry(
name=b"TODO.txt",
perms=0o100644,
type="file",
target=h(14),
),
DirectoryEntry(
name=b"old",
perms=0o100755,
type="dir",
target=h(16),
),
),
),
Revision(
id=h(3),
message=b"Initial commit",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111122220,
microseconds=0,
),
offset_bytes=b"+0200",
),
committer=PERSONS[0],
author=PERSONS[0],
committer_date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111122220,
microseconds=0,
),
offset_bytes=b"+0200",
),
type=RevisionType.GIT,
directory=h(2),
synthetic=False,
metadata=None,
parents=(),
),
Revision(
id=h(9),
message=b"Add parser",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111144440,
microseconds=0,
),
offset_bytes=b"+0200",
),
committer=PERSONS[1],
author=PERSONS[1],
committer_date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111155550,
microseconds=0,
),
offset_bytes=b"+0200",
),
type=RevisionType.GIT,
directory=h(8),
synthetic=False,
metadata=None,
parents=(h(3),),
),
Revision(
id=h(13),
message=b"Add tests",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111166660,
microseconds=0,
),
offset_bytes=b"+0200",
),
committer=PERSONS[1],
author=PERSONS[0],
committer_date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111166660,
microseconds=0,
),
offset_bytes=b"+0200",
),
type=RevisionType.GIT,
directory=h(12),
synthetic=False,
metadata=None,
parents=(h(9),),
),
Revision(
id=h(18),
message=b"Refactor codebase",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111177770,
microseconds=0,
),
offset_bytes=b"+0000",
),
committer=PERSONS[0],
author=PERSONS[2],
committer_date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111177770,
microseconds=0,
),
offset_bytes=b"+0000",
),
type=RevisionType.GIT,
directory=h(17),
synthetic=False,
metadata=None,
parents=(h(13),),
),
Release(
id=h(10),
name=b"v1.0",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1234567890,
microseconds=0,
),
offset_bytes=b"+0200",
),
author=PERSONS[0],
target_type=ObjectType.REVISION,
target=h(9),
message=b"Version 1.0",
synthetic=False,
),
Release(
id=h(19),
name=b"v2.0",
date=None,
author=PERSONS[1],
target_type=ObjectType.REVISION,
target=h(18),
message=b"Version 2.0",
synthetic=False,
),
+ Release(
+ id=h(21),
+ name=b"v2.0-anonymous",
+ date=None,
+ author=None,
+ target_type=ObjectType.REVISION,
+ target=h(18),
+ message=b"Version 2.0 but with no author",
+ synthetic=False,
+ ),
Snapshot(
id=h(20),
branches={
b"refs/heads/master": SnapshotBranch(
target=h(9), target_type=TargetType.REVISION
),
b"refs/tags/v1.0": SnapshotBranch(
target=h(10), target_type=TargetType.RELEASE
),
},
),
OriginVisit(
origin="https://example.com/swh/graph",
date=datetime.datetime(
2013, 5, 7, 4, 20, 39, 369271, tzinfo=datetime.timezone.utc
),
visit=1,
type="git",
),
OriginVisitStatus(
origin="https://example.com/swh/graph",
date=datetime.datetime(
2013, 5, 7, 4, 20, 41, 369271, tzinfo=datetime.timezone.utc
),
visit=1,
type="git",
status="full",
snapshot=h(20),
metadata=None,
),
Origin(url="https://example.com/swh/graph"),
+ Snapshot(
+ id=h(22),
+ branches={
+ b"refs/heads/master": SnapshotBranch(
+ target=h(9), target_type=TargetType.REVISION
+ ),
+ b"refs/tags/v1.0": SnapshotBranch(
+ target=h(10), target_type=TargetType.RELEASE
+ ),
+ b"refs/tags/v2.0-anonymous": SnapshotBranch(
+ target=h(21), target_type=TargetType.RELEASE
+ ),
+ },
+ ),
+ OriginVisit(
+ origin="https://example.com/swh/graph2",
+ date=datetime.datetime(
+ 2013, 5, 7, 4, 20, 39, 369271, tzinfo=datetime.timezone.utc
+ ),
+ visit=1,
+ type="git",
+ ),
+ OriginVisitStatus(
+ origin="https://example.com/swh/graph2",
+ date=datetime.datetime(
+ 2013, 5, 7, 4, 20, 41, 369271, tzinfo=datetime.timezone.utc
+ ),
+ visit=1,
+ type="git",
+ status="full",
+ snapshot=h(22),
+ metadata=None,
+ ),
+ Origin(url="https://example.com/swh/graph2"),
]
def main():
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="Generate a test dataset")
parser.add_argument(
"--compress",
action="store_true",
default=False,
help="Also compress the dataset",
)
parser.add_argument("output", help="output directory", nargs="?", default=".")
args = parser.parse_args()
exporters = {"edges": GraphEdgesExporter, "orc": ORCExporter}
config = {"test_unique_file_id": "all"}
output_path = Path(args.output)
for name, exporter in exporters.items():
if (output_path / name).exists():
shutil.rmtree(output_path / name)
with exporter(config, output_path / name) as e:
for obj in TEST_DATASET:
e.process_object(obj.object_type, obj.to_dict())
if args.compress:
if (output_path / "compressed").exists():
shutil.rmtree(output_path / "compressed")
compress("example", output_path / "orc", output_path / "compressed")
if __name__ == "__main__":
main()
diff --git a/swh/graph/tests/dataset/img/example.dot b/swh/graph/tests/dataset/img/example.dot
index d1bdb1f..2963627 100644
--- a/swh/graph/tests/dataset/img/example.dot
+++ b/swh/graph/tests/dataset/img/example.dot
@@ -1,82 +1,91 @@
digraph "Software Heritage mini DAG" {
ranksep=1;
nodesep=0.5;
subgraph cnt {
01 [label="cnt:0x01"];
04 [label="cnt:0x04"];
05 [label="cnt:0x05"];
07 [label="cnt:0x07"];
11 [label="cnt:0x11"];
14 [label="cnt:0x14"];
15 [label="cnt:0x15"];
}
subgraph cluster_dir {
label="File contents";
node [shape=folder];
02 [label="dir:0x02"];
06 [label="dir:0x06"];
08 [label="dir:0x08"];
12 [label="dir:0x12"];
16 [label="dir:0x16"];
17 [label="dir:0x17"];
02 -> 01;
06 -> 04;
06 -> 05;
08 -> 01;
08 -> 06;
08 -> 07;
12 -> 08;
12 -> 11;
16 -> 15;
17 -> 14;
17 -> 16;
}
subgraph cluster_rev {
label="Revisions";
node [shape=diamond];
03 [label="rev:0x03"];
09 [label="rev:0x09"];
13 [label="rev:0x13"];
18 [label="rev:0x18"];
03 -> 02;
09 -> 08;
13 -> 12;
18 -> 17;
// horizontal rev -> rev edges
09 -> 03 [constraint=false];
13 -> 09 [constraint=false];
18 -> 13 [constraint=false];
}
subgraph cluster_rel {
label="Releases";
node [shape=octagon];
10 [label="rel:0x10"];
19 [label="rel:0x19"];
+ 21 [label="rel:0x21"];
10 -> 09;
19 -> 18;
+ 21 -> 18;
}
subgraph cluster_snp {
label="Snapshots";
node [shape=doubleoctagon];
20 [label="snp:0x20"];
+ 22 [label="snp:0x22"];
20 -> 09;
20 -> 10;
+
+ 22 -> 09;
+ 22 -> 10;
+ 22 -> 21;
}
subgraph cluster_ori {
label="Origins";
node [shape=egg];
- 21 [label="ori:0x21"];
+ ori1 [label="ori:8340"];
+ ori2 [label="ori:8f50"];
- 21 -> 20;
+ ori1 -> 20;
+ ori2 -> 22;
}
}
diff --git a/swh/graph/tests/dataset/orc/content/content-all.orc b/swh/graph/tests/dataset/orc/content/content-all.orc
index b038074..68f2677 100644
Binary files a/swh/graph/tests/dataset/orc/content/content-all.orc and b/swh/graph/tests/dataset/orc/content/content-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/directory/directory-all.orc b/swh/graph/tests/dataset/orc/directory/directory-all.orc
index 2df504e..cb74397 100644
Binary files a/swh/graph/tests/dataset/orc/directory/directory-all.orc and b/swh/graph/tests/dataset/orc/directory/directory-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc b/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc
index 1a3d9f4..6d54b41 100644
Binary files a/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc and b/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin/origin-all.orc b/swh/graph/tests/dataset/orc/origin/origin-all.orc
index cec803a..fd49daa 100644
Binary files a/swh/graph/tests/dataset/orc/origin/origin-all.orc and b/swh/graph/tests/dataset/orc/origin/origin-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc b/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc
index c7965bb..d338904 100644
Binary files a/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc and b/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc b/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc
index 0a19cb1..9b9ef8c 100644
Binary files a/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc and b/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/release/release-all.orc b/swh/graph/tests/dataset/orc/release/release-all.orc
index 888fa82..b947717 100644
Binary files a/swh/graph/tests/dataset/orc/release/release-all.orc and b/swh/graph/tests/dataset/orc/release/release-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/revision/revision-all.orc b/swh/graph/tests/dataset/orc/revision/revision-all.orc
index 8c186d1..39e383d 100644
Binary files a/swh/graph/tests/dataset/orc/revision/revision-all.orc and b/swh/graph/tests/dataset/orc/revision/revision-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc b/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc
index 05a6b8d..66f46a7 100644
Binary files a/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc and b/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc b/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc
index 92f1748..df5742d 100644
Binary files a/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc and b/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc b/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc
index ed19277..6e80a38 100644
Binary files a/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc and b/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc b/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc
index 41bee79..5dffb82 100644
Binary files a/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc and b/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc b/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc
index c3a11b6..d7bd487 100644
Binary files a/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc and b/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc differ
diff --git a/swh/graph/tests/test_cli.py b/swh/graph/tests/test_cli.py
index eceb164..b9c2250 100644
--- a/swh/graph/tests/test_cli.py
+++ b/swh/graph/tests/test_cli.py
@@ -1,58 +1,58 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Dict
from click.testing import CliRunner
import yaml
from swh.graph.cli import graph_cli_group
DATA_DIR = Path(__file__).parents[0] / "dataset"
def read_properties(properties_fname) -> Dict[str, str]:
"""read a Java .properties file"""
with open(properties_fname) as f:
keyvalues = (
line.split("=", maxsplit=1)
for line in f
if not line.strip().startswith("#")
)
return dict((k.strip(), v.strip()) for (k, v) in keyvalues)
def test_pipeline():
"""run full compression pipeline"""
# bare bone configuration, to allow testing the compression pipeline
# with minimum RAM requirements on trivial graphs
config = {"graph": {"compress": {"batch_size": 1000}}}
runner = CliRunner()
with TemporaryDirectory(suffix=".swh-graph-test") as tmpdir:
config_path = Path(tmpdir, "config.yml")
config_path.write_text(yaml.dump(config))
result = runner.invoke(
graph_cli_group,
[
"--config-file",
config_path,
"compress",
"--input-dataset",
DATA_DIR / "orc",
"--output-directory",
tmpdir,
"--graph-name",
"example",
],
)
assert result.exit_code == 0, result
properties = read_properties(Path(tmpdir) / "example.properties")
- assert int(properties["nodes"]) == 21
- assert int(properties["arcs"]) == 23
+ assert int(properties["nodes"]) == 24
+ assert int(properties["arcs"]) == 28
diff --git a/swh/graph/tests/test_grpc.py b/swh/graph/tests/test_grpc.py
index a98c549..73ae271 100644
--- a/swh/graph/tests/test_grpc.py
+++ b/swh/graph/tests/test_grpc.py
@@ -1,129 +1,130 @@
# Copyright (c) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
from google.protobuf.field_mask_pb2 import FieldMask
from swh.graph.grpc.swhgraph_pb2 import (
GraphDirection,
NodeFilter,
StatsRequest,
TraversalRequest,
)
TEST_ORIGIN_ID = "swh:1:ori:{}".format(
hashlib.sha1(b"https://example.com/swh/graph").hexdigest()
)
def test_stats(graph_grpc_stub):
stats = graph_grpc_stub.Stats(StatsRequest())
- assert stats.num_nodes == 21
- assert stats.num_edges == 23
+ assert stats.num_nodes == 24
+ assert stats.num_edges == 28
assert isinstance(stats.compression_ratio, float)
assert isinstance(stats.bits_per_node, float)
assert isinstance(stats.bits_per_edge, float)
assert isinstance(stats.avg_locality, float)
assert stats.indegree_min == 0
- assert stats.indegree_max == 3
+ assert stats.indegree_max == 4
assert isinstance(stats.indegree_avg, float)
assert stats.outdegree_min == 0
assert stats.outdegree_max == 3
assert isinstance(stats.outdegree_avg, float)
def test_leaves(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=[TEST_ORIGIN_ID],
mask=FieldMask(paths=["swhid"]),
return_nodes=NodeFilter(types="cnt"),
)
)
actual = [node.swhid for node in request]
expected = [
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
"swh:1:cnt:0000000000000000000000000000000000000007",
]
assert set(actual) == set(expected)
def test_neighbors(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=["swh:1:rev:0000000000000000000000000000000000000009"],
direction=GraphDirection.BACKWARD,
mask=FieldMask(paths=["swhid"]),
min_depth=1,
max_depth=1,
)
)
actual = [node.swhid for node in request]
expected = [
+ "swh:1:snp:0000000000000000000000000000000000000022",
"swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000013",
]
assert set(actual) == set(expected)
def test_visit_nodes(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=["swh:1:rel:0000000000000000000000000000000000000010"],
mask=FieldMask(paths=["swhid"]),
edges="rel:rev,rev:rev",
)
)
actual = [node.swhid for node in request]
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
]
assert set(actual) == set(expected)
def test_visit_nodes_filtered(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=["swh:1:rel:0000000000000000000000000000000000000010"],
mask=FieldMask(paths=["swhid"]),
return_nodes=NodeFilter(types="dir"),
)
)
actual = [node.swhid for node in request]
expected = [
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
]
assert set(actual) == set(expected)
def test_visit_nodes_filtered_star(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=["swh:1:rel:0000000000000000000000000000000000000010"],
mask=FieldMask(paths=["swhid"]),
)
)
actual = [node.swhid for node in request]
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
]
assert set(actual) == set(expected)
diff --git a/swh/graph/tests/test_http_client.py b/swh/graph/tests/test_http_client.py
index 1b8cb6e..8b5a6d0 100644
--- a/swh/graph/tests/test_http_client.py
+++ b/swh/graph/tests/test_http_client.py
@@ -1,450 +1,456 @@
# Copyright (c) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
import pytest
from pytest import raises
from swh.core.api import RemoteException
from swh.graph.http_client import GraphArgumentException
TEST_ORIGIN_ID = "swh:1:ori:{}".format(
hashlib.sha1(b"https://example.com/swh/graph").hexdigest()
)
def test_stats(graph_client):
stats = graph_client.stats()
- assert stats["num_nodes"] == 21
- assert stats["num_edges"] == 23
+ assert stats["num_nodes"] == 24
+ assert stats["num_edges"] == 28
assert isinstance(stats["compression_ratio"], float)
assert isinstance(stats["bits_per_node"], float)
assert isinstance(stats["bits_per_edge"], float)
assert isinstance(stats["avg_locality"], float)
assert stats["indegree_min"] == 0
- assert stats["indegree_max"] == 3
+ assert stats["indegree_max"] == 4
assert isinstance(stats["indegree_avg"], float)
assert stats["outdegree_min"] == 0
assert stats["outdegree_max"] == 3
assert isinstance(stats["outdegree_avg"], float)
def test_leaves(graph_client):
actual = list(graph_client.leaves(TEST_ORIGIN_ID))
expected = [
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
"swh:1:cnt:0000000000000000000000000000000000000007",
]
assert set(actual) == set(expected)
@pytest.mark.parametrize("max_matching_nodes", [0, 1, 2, 3, 4, 5, 10, 1 << 31])
def test_leaves_with_limit(graph_client, max_matching_nodes):
actual = list(
graph_client.leaves(TEST_ORIGIN_ID, max_matching_nodes=max_matching_nodes)
)
expected = [
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
"swh:1:cnt:0000000000000000000000000000000000000007",
]
if max_matching_nodes == 0:
assert set(actual) == set(expected)
else:
assert set(actual) <= set(expected)
assert len(actual) == min(4, max_matching_nodes)
def test_neighbors(graph_client):
actual = list(
graph_client.neighbors(
"swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
)
)
expected = [
+ "swh:1:snp:0000000000000000000000000000000000000022",
"swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000013",
]
assert set(actual) == set(expected)
def test_visit_nodes(graph_client):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010",
edges="rel:rev,rev:rev",
)
)
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
]
assert set(actual) == set(expected)
@pytest.mark.parametrize("max_matching_nodes", [0, 1, 2, 3, 4, 5, 10, 1 << 31])
def test_visit_nodes_limit(graph_client, max_matching_nodes):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010",
edges="rel:rev,rev:rev",
max_matching_nodes=max_matching_nodes,
)
)
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
]
if max_matching_nodes == 0:
assert set(actual) == set(expected)
else:
assert set(actual) <= set(expected)
assert len(actual) == min(3, max_matching_nodes)
def test_visit_nodes_filtered(graph_client):
    """return_types="dir" restricts the reported nodes to directories."""
    nodes = graph_client.visit_nodes(
        "swh:1:rel:0000000000000000000000000000000000000010",
        return_types="dir",
    )
    assert set(nodes) == {
        "swh:1:dir:0000000000000000000000000000000000000002",
        "swh:1:dir:0000000000000000000000000000000000000008",
        "swh:1:dir:0000000000000000000000000000000000000006",
    }
@pytest.mark.parametrize("max_matching_nodes", [0, 1, 2, 3, 4, 5, 10, 1 << 31])
def test_visit_nodes_filtered_limit(graph_client, max_matching_nodes):
    """return_types="dir" combined with max_matching_nodes caps how many
    directories are returned (0 meaning no cap)."""
    expected = {
        "swh:1:dir:0000000000000000000000000000000000000002",
        "swh:1:dir:0000000000000000000000000000000000000008",
        "swh:1:dir:0000000000000000000000000000000000000006",
    }
    result = list(
        graph_client.visit_nodes(
            "swh:1:rel:0000000000000000000000000000000000000010",
            return_types="dir",
            max_matching_nodes=max_matching_nodes,
        )
    )
    if max_matching_nodes:
        assert set(result) <= expected
        assert len(result) == min(3, max_matching_nodes)
    else:
        # Unlimited: all three directories reachable from the release.
        assert set(result) == expected
def test_visit_nodes_filtered_star(graph_client):
    """return_types="*" reports every node encountered during the visit,
    regardless of type."""
    nodes = graph_client.visit_nodes(
        "swh:1:rel:0000000000000000000000000000000000000010",
        return_types="*",
    )
    assert set(nodes) == {
        "swh:1:rel:0000000000000000000000000000000000000010",
        "swh:1:rev:0000000000000000000000000000000000000009",
        "swh:1:rev:0000000000000000000000000000000000000003",
        "swh:1:dir:0000000000000000000000000000000000000002",
        "swh:1:cnt:0000000000000000000000000000000000000001",
        "swh:1:dir:0000000000000000000000000000000000000008",
        "swh:1:cnt:0000000000000000000000000000000000000007",
        "swh:1:dir:0000000000000000000000000000000000000006",
        "swh:1:cnt:0000000000000000000000000000000000000004",
        "swh:1:cnt:0000000000000000000000000000000000000005",
    }
def test_visit_edges(graph_client):
    """visit_edges() from rel 10 with rel:rev,rev:rev,rev:dir must report each
    traversed (src, dst) edge exactly once."""
    actual = list(
        graph_client.visit_edges(
            "swh:1:rel:0000000000000000000000000000000000000010",
            edges="rel:rev,rev:rev,rev:dir",
        )
    )
    rel10 = "swh:1:rel:0000000000000000000000000000000000000010"
    rev09 = "swh:1:rev:0000000000000000000000000000000000000009"
    rev03 = "swh:1:rev:0000000000000000000000000000000000000003"
    dir08 = "swh:1:dir:0000000000000000000000000000000000000008"
    dir02 = "swh:1:dir:0000000000000000000000000000000000000002"
    assert set(actual) == {
        (rel10, rev09),
        (rev09, rev03),
        (rev09, dir08),
        (rev03, dir02),
    }
def test_visit_edges_limited(graph_client):
    """visit_edges() with max_edges=4 must not yield edges beyond the budget."""
    actual = list(
        graph_client.visit_edges(
            "swh:1:rel:0000000000000000000000000000000000000010",
            max_edges=4,
            edges="rel:rev,rev:rev,rev:dir",
        )
    )
    expected = [
        (
            "swh:1:rel:0000000000000000000000000000000000000010",
            "swh:1:rev:0000000000000000000000000000000000000009",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rev:0000000000000000000000000000000000000003",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:dir:0000000000000000000000000000000000000008",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000003",
            "swh:1:dir:0000000000000000000000000000000000000002",
        ),
    ]

    # As there are four valid answers (up to reordering), we cannot check for
    # equality. Instead, we check the client returned either
    # * all edges but one, or
    # * all edges
    # and the right answer depends on which edges were traversed, which is
    # non-deterministic
    assert set(actual).issubset(set(expected))
    assert 3 <= len(actual) <= 4
def test_visit_edges_diamond_pattern(graph_client):
    """An unrestricted visit from rev 9 must report every edge of the
    diamond-shaped subgraph below it, each exactly once (compared as a set)."""
    actual = list(
        graph_client.visit_edges(
            "swh:1:rev:0000000000000000000000000000000000000009",
            edges="*",
        )
    )

    def swhid(kind, n):
        # Build a test SWHID such as "swh:1:rev:00...09" from type and number.
        return f"swh:1:{kind}:{n:040d}"

    expected = {
        (swhid("rev", 9), swhid("rev", 3)),
        (swhid("rev", 9), swhid("dir", 8)),
        (swhid("rev", 3), swhid("dir", 2)),
        (swhid("dir", 2), swhid("cnt", 1)),
        (swhid("dir", 8), swhid("cnt", 1)),
        (swhid("dir", 8), swhid("cnt", 7)),
        (swhid("dir", 8), swhid("dir", 6)),
        (swhid("dir", 6), swhid("cnt", 4)),
        (swhid("dir", 6), swhid("cnt", 5)),
    }
    assert set(actual) == expected
@pytest.mark.skip(reason="currently disabled due to T1969")
def test_walk(graph_client):
    """walk() returns a path from the source to a node of the requested type;
    a positive limit keeps the head of the path, a negative one its tail."""
    src = "swh:1:dir:0000000000000000000000000000000000000016"
    common_kwargs = {
        "edges": "dir:dir,dir:rev,rev:*",
        "direction": "backward",
        "traversal": "bfs",
    }
    full_path = [
        "swh:1:dir:0000000000000000000000000000000000000016",
        "swh:1:dir:0000000000000000000000000000000000000017",
        "swh:1:rev:0000000000000000000000000000000000000018",
        "swh:1:rel:0000000000000000000000000000000000000019",
    ]

    # No limit: the whole path is returned.
    result = list(graph_client.walk(src, "rel", **common_kwargs))
    assert set(result) == set(full_path)

    # limit=-1: only the final node of the walk.
    result = list(graph_client.walk(src, "rel", limit=-1, **common_kwargs))
    assert set(result) == {full_path[-1]}

    # limit=2: only the first two nodes.
    result = list(graph_client.walk(src, "rel", limit=2, **common_kwargs))
    assert set(result) == set(full_path[:2])
@pytest.mark.skip(reason="Random walk is deprecated")
def test_random_walk_dst_is_type(graph_client):
    """as the walk is random, we test a visit from a cnt node to a release
    reachable from every single path in the backward graph, and only check the
    final node of the path (i.e., the release)
    """
    src = "swh:1:cnt:0000000000000000000000000000000000000015"
    expected_root = "swh:1:rel:0000000000000000000000000000000000000019"

    walk = list(graph_client.random_walk(src, "rel", direction="backward"))
    assert len(walk) > 1  # no release directly links to a content
    assert walk[0] == src
    assert walk[-1] == expected_root

    # limit=-1: only the final node is kept.
    walk = list(graph_client.random_walk(src, "rel", direction="backward", limit=-1))
    assert walk == [expected_root]

    # limit=-2: the last two nodes.
    walk = list(graph_client.random_walk(src, "rel", direction="backward", limit=-2))
    assert len(walk) == 2
    assert walk[-1] == expected_root

    # limit=3: the first three nodes.
    walk = list(graph_client.random_walk(src, "rel", direction="backward", limit=3))
    assert len(walk) == 3
@pytest.mark.skip(reason="Random walk is deprecated")
def test_random_walk_dst_is_node(graph_client):
    """Same as test_random_walk_dst_is_type, but we target the specific release
    node instead of a type
    """
    src = "swh:1:cnt:0000000000000000000000000000000000000015"
    dst = "swh:1:rel:0000000000000000000000000000000000000019"

    walk = list(graph_client.random_walk(src, dst, direction="backward"))
    assert len(walk) > 1  # no origin directly links to a content
    assert walk[0] == src
    assert walk[-1] == dst

    # limit=-1: only the destination node is kept.
    walk = list(graph_client.random_walk(src, dst, direction="backward", limit=-1))
    assert walk == [dst]

    # limit=-2: the last two nodes.
    walk = list(graph_client.random_walk(src, dst, direction="backward", limit=-2))
    assert len(walk) == 2
    assert walk[-1] == dst

    # limit=3: the first three nodes.
    walk = list(graph_client.random_walk(src, dst, direction="backward", limit=3))
    assert len(walk) == 3
def test_count(graph_client):
    """Check the count_* endpoints against known sizes in the test graph."""
    actual = graph_client.count_leaves(TEST_ORIGIN_ID)
    assert actual == 4

    actual = graph_client.count_visit_nodes(
        "swh:1:rel:0000000000000000000000000000000000000010", edges="rel:rev,rev:rev"
    )
    assert actual == 3

    actual = graph_client.count_neighbors(
        "swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
    )
    # rev 9 has four backward neighbors: snp 22, snp 20, rel 10 and rev 13
    # (see test_neighbors).
    assert actual == 4
@pytest.mark.parametrize("max_matching_nodes", [0, 1, 2, 3, 4, 5, 10, 1 << 31])
def test_count_with_limit(graph_client, max_matching_nodes):
    """count_leaves() caps its result at max_matching_nodes, 0 meaning no cap."""
    count = graph_client.count_leaves(
        TEST_ORIGIN_ID, max_matching_nodes=max_matching_nodes
    )
    if max_matching_nodes:
        assert count == min(4, max_matching_nodes)
    else:
        # The test origin has exactly 4 leaves.
        assert count == 4
def test_param_validation(graph_client):
    """Server-side errors surface as GraphArgumentException, carrying the HTTP
    status: 404 for unknown SWHIDs, 400 for malformed input."""
    # Well-formed but nonexistent SWHID -> 404
    with raises(GraphArgumentException) as e:
        list(graph_client.leaves("swh:1:rel:00ffffffff000000000000000000000000000010"))
    if e.value.response:
        assert e.value.response.status_code == 404

    # Malformed SWHID -> 400
    with raises(GraphArgumentException) as e:
        list(
            graph_client.neighbors("swh:1:rel:00ffffffff00000000zzzzzzz000000000000010")
        )
    if e.value.response:
        assert e.value.response.status_code == 400

    # Malformed edge specification -> 400
    with raises(GraphArgumentException) as e:
        list(
            graph_client.visit_nodes(
                "swh:1:dir:0000000000000000000000000000000000000016",
                edges="dir:notanodetype,dir:rev,rev:*",
                direction="backward",
            )
        )
    if e.value.response:
        assert e.value.response.status_code == 400

    # Invalid traversal direction -> 400
    with raises(GraphArgumentException) as e:
        list(
            graph_client.visit_nodes(
                "swh:1:dir:0000000000000000000000000000000000000016",
                edges="dir:dir,dir:rev,rev:*",
                direction="notadirection",
            )
        )
    if e.value.response:
        assert e.value.response.status_code == 400
@pytest.mark.skip(reason="currently disabled due to T1969")
def test_param_validation_walk(graph_client):
    """test validation of walk-specific parameters only"""
    # An unknown traversal order must be rejected by the server with a 400.
    with raises(RemoteException) as excinfo:
        list(
            graph_client.walk(
                "swh:1:dir:0000000000000000000000000000000000000016",
                "rel",
                edges="dir:dir,dir:rev,rev:*",
                direction="backward",
                traversal="notatraversalorder",
            )
        )
    assert excinfo.value.response.status_code == 400
diff --git a/swh/graph/tests/test_luigi.py b/swh/graph/tests/test_luigi.py
index 232479e..1cce95c 100644
--- a/swh/graph/tests/test_luigi.py
+++ b/swh/graph/tests/test_luigi.py
@@ -1,36 +1,36 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
from pathlib import Path
from swh.graph.luigi import CompressGraph
from .test_cli import read_properties
DATA_DIR = Path(__file__).parents[0] / "dataset"
def test_compressgraph(tmpdir):
    """Run CompressGraph on the example ORC dataset and check the resulting
    graph size and metadata files."""
    tmpdir = Path(tmpdir)

    task = CompressGraph(
        local_export_path=DATA_DIR,
        local_graph_path=tmpdir / "compressed_graph",
        batch_size=1000,  # go fast on the trivial dataset
    )

    task.run()

    # Node/arc counts of the example dataset.
    properties = read_properties(tmpdir / "compressed_graph" / "graph.properties")
    assert int(properties["nodes"]) == 24
    assert int(properties["arcs"]) == 28

    # export.json must be copied verbatim from the dataset.
    export_meta_path = tmpdir / "compressed_graph/meta/export.json"
    assert export_meta_path.read_bytes() == (DATA_DIR / "meta/export.json").read_bytes()

    # compression.json records the configuration used for this run.
    compression_meta_path = tmpdir / "compressed_graph/meta/compression.json"
    assert json.load(compression_meta_path.open())[0]["conf"] == {"batch_size": 1000}
diff --git a/swh/graph/tests/test_origin_contributors.py b/swh/graph/tests/test_origin_contributors.py
index b696af3..dccb45c 100644
--- a/swh/graph/tests/test_origin_contributors.py
+++ b/swh/graph/tests/test_origin_contributors.py
@@ -1,180 +1,187 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
from pathlib import Path
import subprocess
from swh.graph.luigi import (
DeanonymizeOriginContributors,
ExportDeanonymizationTable,
ListOriginContributors,
)
from swh.model.model import (
ObjectType,
Person,
Release,
Revision,
RevisionType,
TimestampWithTimezone,
)
from .test_toposort import EXPECTED as TOPOLOGICAL_ORDER
DATA_DIR = Path(__file__).parents[0] / "dataset"
# FIXME: do not hardcode ids here; they should be dynamically loaded
# from the test graph
# NOTE: stray diff "+" markers that had leaked into these string literals
# (corrupting the CSV fixtures) have been removed.
ORIGIN_CONTRIBUTORS = """\
origin_SWHID,person_id
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,0
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,2
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,0
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,null
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,2
"""

DEANONYMIZATION_TABLE = """\
sha256_base64,base64,escaped
8qhF7WQ2bmeoRbZipAaqtNw6QdOCDcpggLWCQLzITsI=,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe <jdoe@example.org>
aZA9TeLhVzqVDQHQOd53UABAZYyek0tY3vTo6VSlA4U=,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5jb20+,Jane Doe <jdoe@example.com>
UaCrgAZBvn1LBd2sAinmdNvAX/G4sjo1aJA9GDd9UUs=,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5uZXQ+,Jane Doe <jdoe@example.net>
"""  # noqa

PERSONS = """\
aZA9TeLhVzqVDQHQOd53UABAZYyek0tY3vTo6VSlA4U=
UaCrgAZBvn1LBd2sAinmdNvAX/G4sjo1aJA9GDd9UUs=
8qhF7WQ2bmeoRbZipAaqtNw6QdOCDcpggLWCQLzITsI=
"""

DEANONYMIZED_ORIGIN_CONTRIBUTORS = """\
origin_SWHID,person_base64,person_escaped
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5jb20+,Jane Doe <jdoe@example.com>
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe <jdoe@example.org>
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5jb20+,Jane Doe <jdoe@example.com>
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5uZXQ+,Jane Doe <jdoe@example.net>
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe <jdoe@example.org>
"""  # noqa
def test_list_origin_contributors(tmpdir):
    """Run ListOriginContributors on the example graph and compare its CSV
    output with the expected contributors table."""
    tmpdir = Path(tmpdir)
    topological_order_path = tmpdir / "topo_order.csv.zst"
    origin_contributors_path = tmpdir / "origin_contributors.csv.zst"

    # The task consumes a zstd-compressed topological order; build it from
    # the fixture shared with test_toposort.
    subprocess.run(
        ["zstdmt", "-o", topological_order_path],
        input=TOPOLOGICAL_ORDER.encode(),
        check=True,
    )

    ListOriginContributors(
        local_graph_path=DATA_DIR / "compressed",
        topological_order_path=topological_order_path,
        origin_contributors_path=origin_contributors_path,
        graph_name="example",
    ).run()

    output = subprocess.check_output(["zstdcat", origin_contributors_path]).decode()
    assert output == ORIGIN_CONTRIBUTORS
def test_export_deanonymization_table(tmpdir, swh_storage_postgresql, swh_storage):
    """Populate a storage with one release and one revision, export the
    person-hash-to-fullname table, and check it against DEANONYMIZATION_TABLE."""
    tmpdir = Path(tmpdir)
    now = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc)
    )

    release = Release(
        name=b"v1.0",
        message=b"first release",
        author=Person.from_fullname(b"John Doe <jdoe@example.org>"),
        target=b"\x00" * 20,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    revision = Revision(
        message=b"first commit",
        author=Person.from_fullname(b"Jane Doe <jdoe@example.com>"),
        committer=Person.from_fullname(b"Jane Doe <jdoe@example.net>"),
        date=now,
        committer_date=now,
        directory=b"\x00" * 20,
        type=RevisionType.GIT,
        synthetic=True,
    )
    swh_storage.release_add([release])
    swh_storage.revision_add([revision])

    deanonymization_table_path = tmpdir / "person_sha256_to_names.csv.zst"
    ExportDeanonymizationTable(
        storage_dsn=swh_storage_postgresql.dsn,
        deanonymization_table_path=deanonymization_table_path,
    ).run()

    csv_text = subprocess.check_output(["zstdcat", deanonymization_table_path]).decode()
    (header, *rows) = csv_text.split("\n")
    (expected_header, *expected_rows) = DEANONYMIZATION_TABLE.split("\n")
    assert header == expected_header
    # Export order is unspecified: compare as sets after dropping the empty
    # element produced by the trailing newline.
    assert rows.pop() == "", "Missing trailing newline"
    expected_rows.pop()
    assert set(rows) == set(expected_rows)
def test_deanonymize_origin_contributors(tmpdir):
    """Compress the three input CSV fixtures, run DeanonymizeOriginContributors,
    and check the joined (deanonymized) output."""
    tmpdir = Path(tmpdir)
    persons_path = tmpdir / "example.persons.csv.zst"
    origin_contributors_path = tmpdir / "origin_contributors.csv.zst"
    deanonymization_table_path = tmpdir / "person_sha256_to_names.csv.zst"
    deanonymized_origin_contributors_path = (
        tmpdir / "origin_contributors.deanonymized.csv.zst"
    )

    # zstd-compress each fixture to the path the task expects.
    for path, content in [
        (origin_contributors_path, ORIGIN_CONTRIBUTORS),
        (persons_path, PERSONS),
        (deanonymization_table_path, DEANONYMIZATION_TABLE),
    ]:
        subprocess.run(["zstdmt", "-o", path], input=content.encode(), check=True)

    DeanonymizeOriginContributors(
        local_graph_path=tmpdir,
        origin_contributors_path=origin_contributors_path,
        deanonymization_table_path=deanonymization_table_path,
        deanonymized_origin_contributors_path=deanonymized_origin_contributors_path,
        graph_name="example",
    ).run()

    csv_text = subprocess.check_output(
        ["zstdcat", deanonymized_origin_contributors_path]
    ).decode()
    assert csv_text == DEANONYMIZED_ORIGIN_CONTRIBUTORS
diff --git a/swh/graph/tests/test_toposort.py b/swh/graph/tests/test_toposort.py
index 1c5e2ed..6d35628 100644
--- a/swh/graph/tests/test_toposort.py
+++ b/swh/graph/tests/test_toposort.py
@@ -1,59 +1,67 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
import subprocess
from swh.graph.luigi import TopoSort
DATA_DIR = Path(__file__).parents[0] / "dataset"
# FIXME: the order of sample ancestors should not be hardcoded
# FIXME: swh:1:snp:0000000000000000000000000000000000000022,3,1,swh has three possible
# sample ancestors; they should not be hardcoded here
EXPECTED = """\
SWHID,ancestors,successors,sample_ancestor1,sample_ancestor2
swh:1:rev:0000000000000000000000000000000000000003,0,1,,
swh:1:rev:0000000000000000000000000000000000000009,1,4,swh:1:rev:0000000000000000000000000000000000000003,
swh:1:rel:0000000000000000000000000000000000000010,1,2,swh:1:rev:0000000000000000000000000000000000000009,
swh:1:snp:0000000000000000000000000000000000000020,2,1,swh:1:rev:0000000000000000000000000000000000000009,swh:1:rel:0000000000000000000000000000000000000010
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,1,0,swh:1:snp:0000000000000000000000000000000000000020,
swh:1:rev:0000000000000000000000000000000000000013,1,1,swh:1:rev:0000000000000000000000000000000000000009,
swh:1:rev:0000000000000000000000000000000000000018,1,2,swh:1:rev:0000000000000000000000000000000000000013,
swh:1:rel:0000000000000000000000000000000000000019,1,0,swh:1:rev:0000000000000000000000000000000000000018,
swh:1:rel:0000000000000000000000000000000000000021,1,1,swh:1:rev:0000000000000000000000000000000000000018,
swh:1:snp:0000000000000000000000000000000000000022,3,1,swh:1:rev:0000000000000000000000000000000000000009,swh:1:rel:0000000000000000000000000000000000000010
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1,0,swh:1:snp:0000000000000000000000000000000000000022,
"""
def test_toposort(tmpdir):
    """Run the TopoSort task on the example graph and check the output CSV.

    The topological order is not unique, so rows are compared as a set; only
    the first row (the sole node with no ancestor) and the last row (one of
    the nodes with no successor) are constrained positionally."""
    tmpdir = Path(tmpdir)
    topological_order_path = tmpdir / "topo_order.csv.zst"
    task = TopoSort(
        local_graph_path=DATA_DIR / "compressed",
        topological_order_path=topological_order_path,
        graph_name="example",
    )
    task.run()
    csv_text = subprocess.check_output(["zstdcat", topological_order_path]).decode()
    (header, *rows) = csv_text.split("\n")
    (expected_header, *expected_lines) = EXPECTED.split("\n")
    assert header == expected_header
    # The only possible first line
    assert rows[0] == "swh:1:rev:0000000000000000000000000000000000000003,0,1,,"
    assert set(rows) == set(expected_lines)
    assert rows.pop() == "", "Missing trailing newline"
    # The only three possible last lines
    assert rows[-1] in [
        "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,1,0"
        ",swh:1:snp:0000000000000000000000000000000000000020,",
        "swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1,0"
        ",swh:1:snp:0000000000000000000000000000000000000022,",
        "swh:1:rel:0000000000000000000000000000000000000019,1,0"
        ",swh:1:rev:0000000000000000000000000000000000000018,",
    ]

File Metadata

Mime Type
application/octet-stream
Expires
Fri, Apr 26, 2:27 PM (2 d)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3146652

Event Timeline