Page MenuHomeSoftware Heritage

No OneTemporary

This document is not UTF8. It was detected as ISO-8859-1 (Latin 1) and converted to UTF8 for display.
diff --git a/swh/graph/luigi.py b/swh/graph/luigi.py
index b162244..a190a80 100644
--- a/swh/graph/luigi.py
+++ b/swh/graph/luigi.py
@@ -1,648 +1,654 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Luigi tasks
===========
This module contains `Luigi <https://luigi.readthedocs.io/>`_ tasks,
as an alternative to the CLI that can be composed with other tasks,
such as swh-dataset's.
Unlike the CLI, this requires the graph to be named `graph`.
File layout
-----------
In addition to files documented in :ref:`graph-compression` (eg. :file:`graph.graph`,
:file:`graph.mph`, ...), tasks in this module produce this directory structure::
swh_<date>[_<flavor>]/
graph.graph
graph.mph
...
meta/
export.json
compression.json
``graph.meta/export.json`` is copied from the ORC dataset exported by
:mod:`swh.dataset.luigi`.
``graph.meta/compression.json`` contains information about the compression itself,
for provenance tracking.
For example:
.. code-block:: json
[
{
"steps": null,
"export_start": "2022-11-08T11:00:54.998799+00:00",
"export_end": "2022-11-08T11:05:53.105519+00:00",
"object_type": [
"origin",
"origin_visit"
],
"hostname": "desktop5",
"conf": {},
"tool": {
"name": "swh.graph",
"version": "2.2.0"
}
}
]
When the compression pipeline is run in separate steps, each of the steps is recorded
as an object in the root list.
S3 layout
---------
As ``.bin`` files are meant to be accessed randomly, they are uncompressed on disk.
However, this is undesirable on at-rest/long-term storage like on S3, because
some are very sparse (eg. :file:`graph.property.committer_timestamp.bin` can be
quickly compressed from 300GB to 1GB).
Therefore, these files are compressed to ``.bin.zst``, and need to be decompressed
when downloading.
The layout is otherwise the same as the file layout.
"""
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
from pathlib import Path
from typing import Dict, List, Tuple
import luigi
from swh.dataset.luigi import Format, LocalExport, ObjectType, S3PathParameter
class CompressGraph(luigi.Task):
    """Compresses a local ORC dataset export into a compressed graph, and writes
    provenance metadata to :file:`meta/export.json` (copied from the export) and
    :file:`meta/compression.json` (describing this compression run)."""

    local_export_path = luigi.PathParameter(significant=False)
    local_graph_path = luigi.PathParameter()
    batch_size = luigi.IntParameter(
        default=0,
        significant=False,
        description="""
        Size of work batches to use while compressing.
        Larger is faster, but consumes more resources.
        """,
    )

    object_types = list(ObjectType)
    # To make this configurable, we could use this:
    #   object_types = luigi.EnumListParameter(
    #       enum=ObjectType, default=list(ObjectType), batch_method=merge_lists
    #   )
    # then use swh.dataset.luigi._export_metadata_has_object_types to check in
    # .meta/export.json that all objects are present before skipping the task

    def requires(self) -> List[luigi.Task]:
        """Returns a :class:`LocalExport` task."""
        return [
            LocalExport(
                local_export_path=self.local_export_path,
                formats=[Format.orc],  # type: ignore[attr-defined]
                object_types=self.object_types,
            )
        ]

    def output(self) -> List[luigi.LocalTarget]:
        """Returns the ``meta/*.json`` targets"""
        return [self._export_meta(), self._compression_meta()]

    def _export_meta(self) -> luigi.Target:
        """Returns the metadata on the dataset export"""
        return luigi.LocalTarget(self.local_graph_path / "meta/export.json")

    def _compression_meta(self) -> luigi.Target:
        """Returns the metadata on the compression pipeline"""
        return luigi.LocalTarget(self.local_graph_path / "meta/compression.json")

    def run(self):
        """Runs the full compression pipeline, then writes :file:`meta/compression.json`

        This does not support running individual steps yet."""
        import datetime
        import json
        import shutil
        import socket

        import pkg_resources

        from swh.graph import webgraph

        conf = {}  # TODO: make this configurable
        steps = None  # TODO: make this configurable
        if self.batch_size:
            conf["batch_size"] = self.batch_size

        # Delete stamps. Otherwise interrupting this compression pipeline may leave
        # stamps from a previous successful compression
        if self._export_meta().exists():
            self._export_meta().remove()
        if self._compression_meta().exists():
            self._compression_meta().remove()
        # Make sure we don't accidentally append to existing files
        if self.local_graph_path.exists():
            shutil.rmtree(self.local_graph_path)

        output_directory = self.local_graph_path
        graph_name = "graph"

        # NOTE(review): progress_cb is defined but not passed to
        # webgraph.compress() below -- confirm whether it should be.
        def progress_cb(percentage: int, step: webgraph.CompressionStep):
            self.set_progress_percentage(percentage)
            self.set_status_message(f"Running {step.name} (step #{step.value})")

        start_date = datetime.datetime.now(tz=datetime.timezone.utc)
        webgraph.compress(
            graph_name,
            self.local_export_path / "orc",
            output_directory,
            steps,
            conf,
        )
        end_date = datetime.datetime.now(tz=datetime.timezone.utc)

        # Copy dataset export metadata
        with self._export_meta().open("w") as write_fd:
            with (self.local_export_path / "meta" / "export.json").open() as read_fd:
                write_fd.write(read_fd.read())

        # Append metadata about this compression pipeline to any pre-existing
        # metadata (eg. from earlier steps of a multi-step compression).
        if self._compression_meta().exists():
            # BUGFIX: this used to open the file in "w" mode, which truncated
            # it before json.load() could read it; open it for reading instead.
            with self._compression_meta().open() as fd:
                meta = json.load(fd)
        else:
            meta = []

        meta.append(
            {
                "steps": steps,
                "compression_start": start_date.isoformat(),
                "compression_end": end_date.isoformat(),
                "object_type": [object_type.name for object_type in self.object_types],
                "hostname": socket.getfqdn(),
                "conf": conf,
                "tool": {
                    "name": "swh.graph",
                    "version": pkg_resources.get_distribution("swh.graph").version,
                },
            }
        )
        with self._compression_meta().open("w") as fd:
            json.dump(meta, fd, indent=4)
class UploadGraphToS3(luigi.Task):
    """Uploads a local compressed graph to S3; creating automatically if it does
    not exist.

    Example invocation::

        luigi --local-scheduler --module swh.graph.luigi UploadGraphToS3 \
            --local-graph-path=graph/ \
            --s3-graph-path=s3://softwareheritage/graph/swh_2022-11-08/compressed/
    """

    local_graph_path = luigi.PathParameter(significant=False)
    s3_graph_path = S3PathParameter()

    def requires(self) -> List[luigi.Task]:
        """Returns a :class:`CompressGraph` task that writes local files at the
        expected location."""
        # CompressGraph's other parameters are insignificant, so luigi fills
        # them from its configuration.
        return [
            CompressGraph(
                local_graph_path=self.local_graph_path,
            )
        ]

    def output(self) -> List[luigi.Target]:
        """Returns stamp and meta paths on S3."""
        return [self._meta()]

    def _meta(self):
        """S3 target for ``meta/compression.json``; uploaded last by :meth:`run`,
        so it doubles as the completion stamp."""
        import luigi.contrib.s3

        return luigi.contrib.s3.S3Target(f"{self.s3_graph_path}/meta/compression.json")

    def run(self) -> None:
        """Copies all files: first the graph itself, then :file:`meta/compression.json`."""
        import subprocess
        import tempfile

        import luigi.contrib.s3
        import tqdm

        compression_metadata_path = self.local_graph_path / "meta" / "compression.json"
        seen_compression_metadata = False

        client = luigi.contrib.s3.S3Client()

        # recursively copy local files to S3, and end with compression metadata
        paths = list(self.local_graph_path.glob("**/*"))
        for (i, path) in tqdm.tqdm(
            list(enumerate(paths)),
            desc="Uploading compressed graph",
        ):
            if path == compression_metadata_path:
                # Write it last
                seen_compression_metadata = True
                continue
            if path.is_dir():
                continue
            relative_path = path.relative_to(self.local_graph_path)
            self.set_progress_percentage(int(i * 100 / len(paths)))
            if path.suffix == ".bin":
                # Large sparse file; store it compressed on S3.
                with tempfile.NamedTemporaryFile(
                    prefix=path.stem, suffix=".bin.zst"
                ) as fd:
                    self.set_status_message(f"Compressing {relative_path}")
                    # --keep preserves the local uncompressed copy
                    subprocess.run(
                        ["zstdmt", "--force", "--keep", path, "-o", fd.name], check=True
                    )
                    self.set_status_message(f"Uploading {relative_path} (compressed)")
                    client.put_multipart(
                        fd.name,
                        f"{self.s3_graph_path}/{relative_path}.zst",
                        ACL="public-read",
                    )
            else:
                self.set_status_message(f"Uploading {relative_path}")
                client.put_multipart(
                    path, f"{self.s3_graph_path}/{relative_path}", ACL="public-read"
                )

        assert (
            seen_compression_metadata
        ), "did not see meta/compression.json in directory listing"

        # Write it last, to act as a stamp
        client.put(
            compression_metadata_path,
            self._meta().path,
            ACL="public-read",
        )
class DownloadGraphFromS3(luigi.Task):
    """Downloads a local dataset graph from S3.

    This performs the inverse operation of :class:`UploadGraphToS3`

    Example invocation::

        luigi --local-scheduler --module swh.graph.luigi DownloadGraphFromS3 \
            --local-graph-path=graph/ \
            --s3-graph-path=s3://softwareheritage/graph/swh_2022-11-08/compressed/
    """

    local_graph_path = luigi.PathParameter()
    s3_graph_path = S3PathParameter(significant=False)

    def requires(self) -> List[luigi.Task]:
        """Returns a :class:`UploadGraphToS3` task that writes the files to S3
        at the expected location."""
        return [
            UploadGraphToS3(
                local_graph_path=self.local_graph_path,
                s3_graph_path=self.s3_graph_path,
            )
        ]

    def output(self) -> List[luigi.Target]:
        """Returns stamp and meta paths on the local filesystem."""
        return [self._meta()]

    def _meta(self):
        """Local target for ``meta/compression.json``; written last by
        :meth:`run`, so it doubles as the completion stamp."""
        # BUGFIX: this used to return meta/export.json, but run() downloads the
        # *compression* metadata into self._meta().path as its final stamp,
        # which clobbered export.json. meta/compression.json is also what
        # LocalGraph._meta() checks.
        return luigi.LocalTarget(self.local_graph_path / "meta" / "compression.json")

    def run(self) -> None:
        """Copies all files: first the graph itself, then :file:`meta/compression.json`."""
        import subprocess
        import tempfile

        import luigi.contrib.s3
        import tqdm

        client = luigi.contrib.s3.S3Client()

        # Keys yielded by S3Client.list() are relative to s3_graph_path (they
        # are re-joined with it before each get() below), so the metadata file
        # must be matched against its *relative* path.
        # BUGFIX: this used to be the absolute S3 URL, which cannot match the
        # relative keys iterated here.
        compression_metadata_path = "meta/compression.json"
        seen_compression_metadata = False

        # recursively copy files from S3, and end with compression metadata
        files = list(client.list(self.s3_graph_path))
        for (i, file_) in tqdm.tqdm(
            list(enumerate(files)),
            desc="Downloading",
        ):
            if file_ == compression_metadata_path:
                # Will copy it last
                seen_compression_metadata = True
                continue
            self.set_progress_percentage(int(i * 100 / len(files)))
            local_path = self.local_graph_path / file_
            local_path.parent.mkdir(parents=True, exist_ok=True)
            if file_.endswith(".bin.zst"):
                # The file was compressed before uploading to S3, we need it
                # to be decompressed locally
                with tempfile.NamedTemporaryFile(
                    prefix=local_path.stem, suffix=".bin.zst"
                ) as fd:
                    self.set_status_message(f"Downloading {file_} (compressed)")
                    client.get(
                        f"{self.s3_graph_path}/{file_}",
                        fd.name,
                    )
                    self.set_status_message(f"Decompressing {file_}")
                    subprocess.run(
                        [
                            "zstdmt",
                            "--force",
                            "-d",
                            fd.name,
                            "-o",
                            # strip the trailing ".zst" to get the ".bin" path
                            str(local_path)[0:-4],
                        ],
                        check=True,
                    )
            else:
                self.set_status_message(f"Downloading {file_}")
                client.get(
                    f"{self.s3_graph_path}/{file_}",
                    str(local_path),
                )

        assert (
            seen_compression_metadata
        ), "did not see meta/compression.json in directory listing"

        # Write it last, to act as a stamp
        client.get(
            f"{self.s3_graph_path}/{compression_metadata_path}",
            self._meta().path,
        )
class LocalGraph(luigi.Task):
    """Task that depends on a local dataset being present -- either directly from
    :class:`ExportGraph` or via :class:`DownloadGraphFromS3`.
    """

    local_graph_path = luigi.PathParameter()
    compression_task_type = luigi.TaskParameter(
        default=DownloadGraphFromS3,
        significant=False,
        description="""The task used to get the compressed graph if it is not present.
        Should be either ``swh.graph.luigi.CompressGraph`` or
        ``swh.graph.luigi.DownloadGraphFromS3``.""",
    )

    def requires(self) -> List[luigi.Task]:
        """Returns an instance of either :class:`CompressGraph` or
        :class:`DownloadGraphFromS3` depending on the value of
        :attr:`compression_task_type`."""
        task_type = self.compression_task_type
        # Dispatch on the (sub)class of the configured task type.
        if issubclass(task_type, CompressGraph):
            dependency = CompressGraph(local_graph_path=self.local_graph_path)
        elif issubclass(task_type, DownloadGraphFromS3):
            dependency = DownloadGraphFromS3(local_graph_path=self.local_graph_path)
        else:
            raise ValueError(
                f"Unexpected compression_task_type: {task_type.__name__}"
            )
        return [dependency]

    def output(self) -> List[luigi.Target]:
        """Returns stamp and meta paths on the local filesystem."""
        return [self._meta()]

    def _meta(self):
        """Local target for the compression metadata/stamp file."""
        return luigi.LocalTarget(self.local_graph_path / "meta" / "compression.json")
def _run_script(script: str, output_path: Path) -> None:
    """Runs ``script`` with bash, redirecting its stdout to ``output_path``.

    The output is written to a ``.tmp`` sibling first, then renamed into place,
    so ``output_path`` only ever appears complete."""
    import os
    import subprocess

    from .config import check_config

    conf: Dict = {}  # TODO: configurable
    conf = check_config(conf)

    # Run with the Java options and classpath from the checked configuration.
    env = dict(os.environ)
    env["JAVA_TOOL_OPTIONS"] = conf["java_tool_options"]
    env["CLASSPATH"] = conf["classpath"]

    tmp_output_path = Path(f"{output_path}.tmp")

    command = f"{script.strip()} > {tmp_output_path}"
    subprocess.run(["bash", "-c", command], env=env, check=True)

    # Atomically write the output file
    tmp_output_path.replace(output_path)
class TopoSort(luigi.Task):
    """Creates a file that contains all SWHIDs in topological order from a compressed
    graph."""

    local_graph_path = luigi.PathParameter()
    topological_order_path = luigi.PathParameter()
    graph_name = luigi.Parameter(default="graph")

    def requires(self) -> List[luigi.Task]:
        """Returns an instance of :class:`LocalGraph`."""
        return [LocalGraph(local_graph_path=self.local_graph_path)]

    def output(self) -> luigi.Target:
        """.csv.zst file that contains the topological order."""
        return luigi.LocalTarget(self.topological_order_path)

    def run(self) -> None:
        """Runs org.softwareheritage.graph.utils.TopoSort and compresses"""
        # Restrict the traversal to the "history" part of the graph
        object_types = "rev,rel,snp,ori"
        class_name = "org.softwareheritage.graph.utils.TopoSort"
        # pv reports line-based progress; zstdmt compresses the result.
        script = f"""
        java {class_name} '{self.local_graph_path}/{self.graph_name}' '{object_types}' \
            | pv --line-mode --wait \
            | zstdmt -19
        """
        _run_script(script, self.topological_order_path)
class ListOriginContributors(luigi.Task):
    """Creates a file that lists the ids of the persons who contributed to each
    origin, from a compressed graph and its topological order."""

    local_graph_path = luigi.PathParameter()
    topological_order_path = luigi.PathParameter()
    origin_contributors_path = luigi.PathParameter()
    graph_name = luigi.Parameter(default="graph")

    def requires(self) -> List[luigi.Task]:
        """Returns an instance of :class:`LocalGraph` and :class:`TopoSort`."""
        return [
            LocalGraph(local_graph_path=self.local_graph_path),
            TopoSort(
                local_graph_path=self.local_graph_path,
                topological_order_path=self.topological_order_path,
                graph_name=self.graph_name,
            ),
        ]

    def output(self) -> luigi.Target:
        """.csv.zst file that contains the origin contributors."""
        return luigi.LocalTarget(self.origin_contributors_path)

    def run(self) -> None:
        """Runs org.softwareheritage.graph.utils.ListOriginContributors and
        compresses its output."""
        class_name = "org.softwareheritage.graph.utils.ListOriginContributors"
        # Feed the topological order on stdin; pv reports line-based progress,
        # zstdmt compresses the result.
        script = f"""
        zstdcat {self.topological_order_path} \
            | java {class_name} '{self.local_graph_path}/{self.graph_name}' \
            | pv --line-mode --wait \
            | zstdmt -19
        """
        _run_script(script, self.origin_contributors_path)
class ExportDeanonymizationTable(luigi.Task):
    """Exports (from swh-storage) a .csv.zst file that contains the columns:
    ``base64(sha256(full_name))``, ``base64(full_name)``, and ``escape(full_name)``.

    The first column is the anonymized full name found in :file:`graph.persons.csv.zst`
    in the compressed graph, and the latter two are the original name."""

    storage_dsn = luigi.Parameter(
        default="service=swh",
        description="postgresql DSN of the swh-storage database to read from.",
    )
    deanonymization_table_path = luigi.PathParameter()

    def output(self) -> luigi.Target:
        """.csv.zst file that contains the table."""
        return luigi.LocalTarget(self.deanonymization_table_path)

    def run(self) -> None:
        """Runs a postgresql query to compute the table."""
        # NOTE(review): digest() presumably requires the pgcrypto extension on
        # the swh-storage database -- confirm.
        _run_script(
            f"""
            psql '{self.storage_dsn}' -c "COPY (select encode(digest(fullname, 'sha256'), 'base64') as sha256_base64, encode(fullname, 'base64') as base64, encode(fullname, 'escape') as escaped from person) TO STDOUT CSV HEADER" | zstdmt -19
            """,  # noqa
            self.deanonymization_table_path,
        )
class DeanonymizeOriginContributors(luigi.Task):
    """Generates a .csv.zst file similar to :class:`ListOriginContributors`'s,
    but with ``person_base64`` and ``person_escaped`` columns in addition to
    ``person_id``.

    This assumes that :file:`graph.persons.csv.zst` is anonymized (SHA256 of names
    instead of names); which may not be true depending on how the swh-dataset export
    was configured.
    """

    local_graph_path = luigi.PathParameter()
    graph_name = luigi.Parameter(default="graph")
    origin_contributors_path = luigi.PathParameter()
    deanonymization_table_path = luigi.PathParameter()
    deanonymized_origin_contributors_path = luigi.PathParameter()

    def requires(self) -> List[luigi.Task]:
        """Returns instances of :class:`LocalGraph`, :class:`ListOriginContributors`,
        and :class:`ExportDeanonymizationTable`."""
        return [
            LocalGraph(local_graph_path=self.local_graph_path),
            ListOriginContributors(
                local_graph_path=self.local_graph_path,
                origin_contributors_path=self.origin_contributors_path,
            ),
            ExportDeanonymizationTable(
                deanonymization_table_path=self.deanonymization_table_path,
            ),
        ]

    def output(self) -> luigi.Target:
        """.csv.zst file similar to :meth:`ListOriginContributors.output`'s,
        but with ``person_base64`` and ``person_escaped`` columns in addition to
        ``person_id``"""
        return luigi.LocalTarget(self.deanonymized_origin_contributors_path)

    def run(self) -> None:
        """Loads the list of persons (``graph.persons.csv.zst`` in the graph dataset)
        and the deanonymization table in memory, then uses them to map each row
        in the original (anonymized) contributors list to the deanonymized one."""
        # TODO: .persons.csv.zst may be already deanonymized (if the swh-dataset export
        # was configured to do so); this should add support for it.
        import base64
        import csv

        import pyzstd

        # Load the deanonymization table, to map sha256(name) to base64(name)
        # and escape(name)
        sha256_to_names: Dict[bytes, Tuple[bytes, str]] = {}
        with pyzstd.open(self.deanonymization_table_path, "rt") as fd:
            csv_reader = csv.reader(fd)
            header = next(csv_reader)
            assert header == ["sha256_base64", "base64", "escaped"], header
            for line in csv_reader:
                (base64_sha256_name, base64_name, escaped_name) = line
                sha256_name = base64.b64decode(base64_sha256_name)
                name = base64.b64decode(base64_name)
                sha256_to_names[sha256_name] = (name, escaped_name)

        # Combine with the list of sha256(name), to get the list of base64(name)
        # and escape(name)
        # NOTE(review): .pop() consumes each table entry as it is matched
        # (presumably to free memory); a person hash appearing twice in the
        # persons file would fall back to (b"", "") the second time -- confirm
        # that persons are unique.
        persons_path = self.local_graph_path / f"{self.graph_name}.persons.csv.zst"
        with pyzstd.open(persons_path, "rb") as fd:
            person_id_to_names: List[Tuple[bytes, str]] = [
                sha256_to_names.pop(base64.b64decode(line.strip()), (b"", ""))
                for line in fd
            ]

        tmp_output_path = Path(f"{self.deanonymized_origin_contributors_path}.tmp")

        # Finally, write a new table of origin_contributors, by reading the anonymized
        # table line-by-line and deanonymizing each id
        # Open temporary output for writes as CSV
        with pyzstd.open(tmp_output_path, "wt") as output_fd:
            csv_writer = csv.writer(output_fd, lineterminator="\n")
            # write header
            csv_writer.writerow(("origin_SWHID", "person_base64", "person_escaped"))

            # Open input for reads as CSV
            with pyzstd.open(self.origin_contributors_path, "rt") as input_fd:
                csv_reader = csv.reader(input_fd)
                header = next(csv_reader)
                assert header == ["origin_SWHID", "person_id"], header
                for (origin_swhid, person_id) in csv_reader:
                    if person_id == "null":
                        # FIXME: workaround for a bug in contribution graphs generated
                        # before 2022-12-01. Those were only used in tests and never
                        # published, so the conditional can be removed when this is
                        # productionized
                        continue
                    (name, escaped_name) = person_id_to_names[int(person_id)]
                    base64_name = base64.b64encode(name).decode("ascii")
                    csv_writer.writerow((origin_swhid, base64_name, escaped_name))

        # Atomically move the finished file into place.
        tmp_output_path.replace(self.deanonymized_origin_contributors_path)
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labelobl b/swh/graph/tests/dataset/compressed/example-labelled.labelobl
new file mode 100644
index 0000000..d4a6621
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example-labelled.labelobl differ
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets b/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets
index fbb7a5a..a87f20a 100644
--- a/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets
+++ b/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets
@@ -1,2 +1 @@
-„í
-Âpæ)í š
\ No newline at end of file
+ úh*…¸~±tÐV
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labels b/swh/graph/tests/dataset/compressed/example-labelled.labels
index 1b876ec..935dd46 100644
--- a/swh/graph/tests/dataset/compressed/example-labelled.labels
+++ b/swh/graph/tests/dataset/compressed/example-labelled.labels
@@ -1 +1 @@
-D¤º%B](P(iõ‚¢
\ No newline at end of file
+§ BaéÂQ@RB@RiПD
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.properties b/swh/graph/tests/dataset/compressed/example-labelled.properties
index 4f4c55a..4c6856d 100644
--- a/swh/graph/tests/dataset/compressed/example-labelled.properties
+++ b/swh/graph/tests/dataset/compressed/example-labelled.properties
@@ -1,3 +1,3 @@
graphclass = it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph
-labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,6)
+labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,7)
underlyinggraph = example
diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labelobl b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labelobl
new file mode 100644
index 0000000..b734d0d
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labelobl differ
diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets
index 7726435..603a32f 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets
+++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets
@@ -1,2 +1 @@
- šB•!B…
-(P¡‚”
\ No newline at end of file
+Ô.I ,*0Z…‹èX
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels
index 9448e72..9375cc7 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels
+++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels
@@ -1,2 +1 @@
- P:¢RH•
-jºP u‚¢
\ No newline at end of file
+§â”%!P£I ¢HJaА
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties b/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties
index 5ee584a..da8e63b 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties
+++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties
@@ -1,3 +1,3 @@
graphclass = it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph
-labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,6)
+labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,7)
underlyinggraph = example-transposed
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.graph b/swh/graph/tests/dataset/compressed/example-transposed.graph
index d8cbf2b..1a5e5be 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed.graph
+++ b/swh/graph/tests/dataset/compressed/example-transposed.graph
@@ -1 +1 @@
-^®—t5Òízèí ½®Ö…zºZá:¨»]À
\ No newline at end of file
+®¥òâ7ö/—Ë¥Úý:ÕÒõt´+Fº[#ê«ÅÐ
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.obl b/swh/graph/tests/dataset/compressed/example-transposed.obl
index 7ad141b..989e9e9 100644
Binary files a/swh/graph/tests/dataset/compressed/example-transposed.obl and b/swh/graph/tests/dataset/compressed/example-transposed.obl differ
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.offsets b/swh/graph/tests/dataset/compressed/example-transposed.offsets
index b3044db..bcd313c 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed.offsets
+++ b/swh/graph/tests/dataset/compressed/example-transposed.offsets
@@ -1 +1,2 @@
-Š) (P€ˆ‘‚†Š8&(R
\ No newline at end of file
+¡H48P¡B…E
+4h¡Ã¡@
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.properties b/swh/graph/tests/dataset/compressed/example-transposed.properties
index 96fcfba..9f09c32 100644
--- a/swh/graph/tests/dataset/compressed/example-transposed.properties
+++ b/swh/graph/tests/dataset/compressed/example-transposed.properties
@@ -1,35 +1,35 @@
#BVGraph properties
-#Wed Mar 30 17:33:29 CEST 2022
-bitsforreferences=28
-avgbitsforintervals=0.762
+#Thu Dec 01 10:50:01 CET 2022
+bitsforreferences=31
+avgbitsforintervals=0.833
graphclass=it.unimi.dsi.big.webgraph.BVGraph
-avgdist=0.429
-successoravggap=4.261
-residualexpstats=5,8,3,2,1
-arcs=23
+avgdist=0.417
+successoravggap=6.518
+residualexpstats=8,5,8,3,0,1
+arcs=28
minintervallength=4
-bitsforoutdegrees=61
-residualavgloggap=2.076977934449935
-avgbitsforoutdegrees=2.905
-bitsforresiduals=85
-successoravgloggap=1.9987119736846723
+bitsforoutdegrees=68
+residualavgloggap=2.2068709506771227
+avgbitsforoutdegrees=2.833
+bitsforresiduals=115
+successoravgloggap=2.3010835643149283
maxrefcount=3
-successorexpstats=7,9,4,2,1
-residualarcs=19
-avgbitsforresiduals=4.048
-avgbitsforblocks=0.19
+successorexpstats=9,5,8,4,1,1
+residualarcs=25
+avgbitsforresiduals=4.792
+avgbitsforblocks=0.125
windowsize=7
-residualavggap=4.632
-copiedarcs=4
-avgbitsforreferences=1.333
+residualavggap=5.860
+copiedarcs=3
+avgbitsforreferences=1.292
version=0
-compratio=1.53
-bitsperlink=8.435
+compratio=1.501
+bitsperlink=8.464
compressionflags=
-nodes=21
-avgref=0.238
+nodes=24
+avgref=0.125
zetak=3
-bitsforintervals=16
+bitsforintervals=20
intervalisedarcs=0
-bitspernode=9.238
-bitsforblocks=4
+bitspernode=9.875
+bitsforblocks=3
diff --git a/swh/graph/tests/dataset/compressed/example.edges.count.txt b/swh/graph/tests/dataset/compressed/example.edges.count.txt
index 4099407..9902f17 100644
--- a/swh/graph/tests/dataset/compressed/example.edges.count.txt
+++ b/swh/graph/tests/dataset/compressed/example.edges.count.txt
@@ -1 +1 @@
-23
+28
diff --git a/swh/graph/tests/dataset/compressed/example.edges.stats.txt b/swh/graph/tests/dataset/compressed/example.edges.stats.txt
index c9b8ac7..d01dba6 100644
--- a/swh/graph/tests/dataset/compressed/example.edges.stats.txt
+++ b/swh/graph/tests/dataset/compressed/example.edges.stats.txt
@@ -1,8 +1,8 @@
dir:cnt 8
dir:dir 3
-ori:snp 1
-rel:rev 2
+ori:snp 2
+rel:rev 3
rev:dir 4
rev:rev 3
-snp:rel 1
-snp:rev 1
+snp:rel 3
+snp:rev 2
diff --git a/swh/graph/tests/dataset/compressed/example.graph b/swh/graph/tests/dataset/compressed/example.graph
index d99357d..3d76ab1 100644
--- a/swh/graph/tests/dataset/compressed/example.graph
+++ b/swh/graph/tests/dataset/compressed/example.graph
@@ -1 +1 @@
-}Ýø º]望˚étô]~[Ô1tޗ@
\ No newline at end of file
+]“؝~ÿåÓz]­ õY>ª¿.õ¤kºíè9Ñt
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example.indegree b/swh/graph/tests/dataset/compressed/example.indegree
index 1ea373e..87022a8 100644
--- a/swh/graph/tests/dataset/compressed/example.indegree
+++ b/swh/graph/tests/dataset/compressed/example.indegree
@@ -1,4 +1,5 @@
-2
+3
16
-2
+4
+0
1
diff --git a/swh/graph/tests/dataset/compressed/example.labels.count.txt b/swh/graph/tests/dataset/compressed/example.labels.count.txt
index 45a4fb7..ec63514 100644
--- a/swh/graph/tests/dataset/compressed/example.labels.count.txt
+++ b/swh/graph/tests/dataset/compressed/example.labels.count.txt
@@ -1 +1 @@
-8
+9
diff --git a/swh/graph/tests/dataset/compressed/example.labels.csv.zst b/swh/graph/tests/dataset/compressed/example.labels.csv.zst
index 1cc8931..50209e7 100644
Binary files a/swh/graph/tests/dataset/compressed/example.labels.csv.zst and b/swh/graph/tests/dataset/compressed/example.labels.csv.zst differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray b/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray
index 01451e0..139b2bc 100644
Binary files a/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray and b/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers b/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers
index 755c4c7..7acff4e 100644
Binary files a/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers and b/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.properties b/swh/graph/tests/dataset/compressed/example.labels.fcl.properties
index deeac3a..e2f298d 100644
--- a/swh/graph/tests/dataset/compressed/example.labels.fcl.properties
+++ b/swh/graph/tests/dataset/compressed/example.labels.fcl.properties
@@ -1,2 +1,2 @@
-n=8
+n=9
ratio=4
diff --git a/swh/graph/tests/dataset/compressed/example.labels.mph b/swh/graph/tests/dataset/compressed/example.labels.mph
index e417aec..60d1007 100644
Binary files a/swh/graph/tests/dataset/compressed/example.labels.mph and b/swh/graph/tests/dataset/compressed/example.labels.mph differ
diff --git a/swh/graph/tests/dataset/compressed/example.mph b/swh/graph/tests/dataset/compressed/example.mph
index f696b19..136bedf 100644
Binary files a/swh/graph/tests/dataset/compressed/example.mph and b/swh/graph/tests/dataset/compressed/example.mph differ
diff --git a/swh/graph/tests/dataset/compressed/example.node2swhid.bin b/swh/graph/tests/dataset/compressed/example.node2swhid.bin
index e86dae4..109a1ac 100644
Binary files a/swh/graph/tests/dataset/compressed/example.node2swhid.bin and b/swh/graph/tests/dataset/compressed/example.node2swhid.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.node2type.map b/swh/graph/tests/dataset/compressed/example.node2type.map
index 1a5b7a7..0a84a00 100644
Binary files a/swh/graph/tests/dataset/compressed/example.node2type.map and b/swh/graph/tests/dataset/compressed/example.node2type.map differ
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.count.txt b/swh/graph/tests/dataset/compressed/example.nodes.count.txt
index aabe6ec..a45fd52 100644
--- a/swh/graph/tests/dataset/compressed/example.nodes.count.txt
+++ b/swh/graph/tests/dataset/compressed/example.nodes.count.txt
@@ -1 +1 @@
-21
+24
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.csv.zst b/swh/graph/tests/dataset/compressed/example.nodes.csv.zst
index 0559f37..24ba056 100644
Binary files a/swh/graph/tests/dataset/compressed/example.nodes.csv.zst and b/swh/graph/tests/dataset/compressed/example.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.stats.txt b/swh/graph/tests/dataset/compressed/example.nodes.stats.txt
index 097e698..704e51e 100644
--- a/swh/graph/tests/dataset/compressed/example.nodes.stats.txt
+++ b/swh/graph/tests/dataset/compressed/example.nodes.stats.txt
@@ -1,6 +1,6 @@
cnt 7
dir 6
-ori 1
-rel 2
+ori 2
+rel 3
rev 4
-snp 1
+snp 2
diff --git a/swh/graph/tests/dataset/compressed/example.obl b/swh/graph/tests/dataset/compressed/example.obl
index 8538d49..36e5d26 100644
Binary files a/swh/graph/tests/dataset/compressed/example.obl and b/swh/graph/tests/dataset/compressed/example.obl differ
diff --git a/swh/graph/tests/dataset/compressed/example.offsets b/swh/graph/tests/dataset/compressed/example.offsets
index 1249e27..4e0eab1 100644
--- a/swh/graph/tests/dataset/compressed/example.offsets
+++ b/swh/graph/tests/dataset/compressed/example.offsets
@@ -1,2 +1,2 @@
-ŽA!B’‚i
-C‚B†
+(`¡¨rAD9E!A
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example.order b/swh/graph/tests/dataset/compressed/example.order
index ff64db4..8d38627 100644
Binary files a/swh/graph/tests/dataset/compressed/example.order and b/swh/graph/tests/dataset/compressed/example.order differ
diff --git a/swh/graph/tests/dataset/compressed/example.outdegree b/swh/graph/tests/dataset/compressed/example.outdegree
index 5b8a720..7533d27 100644
--- a/swh/graph/tests/dataset/compressed/example.outdegree
+++ b/swh/graph/tests/dataset/compressed/example.outdegree
@@ -1,4 +1,4 @@
7
-6
+8
7
-1
+2
diff --git a/swh/graph/tests/dataset/compressed/example.persons.mph b/swh/graph/tests/dataset/compressed/example.persons.mph
index 6787503..1d9d8b6 100644
Binary files a/swh/graph/tests/dataset/compressed/example.persons.mph and b/swh/graph/tests/dataset/compressed/example.persons.mph differ
diff --git a/swh/graph/tests/dataset/compressed/example.properties b/swh/graph/tests/dataset/compressed/example.properties
index 11d426e..75581f6 100644
--- a/swh/graph/tests/dataset/compressed/example.properties
+++ b/swh/graph/tests/dataset/compressed/example.properties
@@ -1,35 +1,35 @@
#BVGraph properties
-#Wed Mar 30 17:33:28 CEST 2022
-bitsforreferences=15
+#Thu Dec 01 10:50:00 CET 2022
+bitsforreferences=20
avgbitsforintervals=0.667
graphclass=it.unimi.dsi.big.webgraph.BVGraph
-avgdist=0.048
-successoravggap=3.935
-residualexpstats=8,9,2,2,1
-arcs=23
+avgdist=0.125
+successoravggap=5.125
+residualexpstats=4,9,8,4,1
+arcs=28
minintervallength=4
-bitsforoutdegrees=51
-residualavgloggap=1.8895225435666037
-avgbitsforoutdegrees=2.429
-bitsforresiduals=98
-successoravgloggap=1.8859500382836039
+bitsforoutdegrees=62
+residualavgloggap=2.3484556402638956
+avgbitsforoutdegrees=2.583
+bitsforresiduals=122
+successoravgloggap=2.280971484604246
maxrefcount=3
-successorexpstats=8,10,2,2,1
-residualarcs=22
-avgbitsforresiduals=4.667
-avgbitsforblocks=0.048
+successorexpstats=5,10,8,4,1
+residualarcs=26
+avgbitsforresiduals=5.083
+avgbitsforblocks=0.083
windowsize=7
-residualavggap=4.000
-copiedarcs=1
-avgbitsforreferences=0.714
+residualavggap=5.385
+copiedarcs=2
+avgbitsforreferences=0.833
version=0
-compratio=1.412
-bitsperlink=7.783
+compratio=1.406
+bitsperlink=7.929
compressionflags=
-nodes=21
-avgref=0.048
+nodes=24
+avgref=0.083
zetak=3
-bitsforintervals=14
+bitsforintervals=16
intervalisedarcs=0
-bitspernode=8.524
-bitsforblocks=1
+bitspernode=9.25
+bitsforblocks=2
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_id.bin b/swh/graph/tests/dataset/compressed/example.property.author_id.bin
index 7072382..49fa8cf 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.author_id.bin and b/swh/graph/tests/dataset/compressed/example.property.author_id.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin b/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin
index 18ae5fa..fcae808 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin and b/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin b/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin
index ab8222e..d84999f 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin and b/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_id.bin b/swh/graph/tests/dataset/compressed/example.property.committer_id.bin
index 693c904..257d0c3 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.committer_id.bin and b/swh/graph/tests/dataset/compressed/example.property.committer_id.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin
index 4c00061..81899c6 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin and b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin
index 9c4f149..7ce5005 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin and b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin b/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin
index 274f279..08bd265 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin and b/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.content.length.bin b/swh/graph/tests/dataset/compressed/example.property.content.length.bin
index 4848e0e..b6881ea 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.content.length.bin and b/swh/graph/tests/dataset/compressed/example.property.content.length.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.message.bin b/swh/graph/tests/dataset/compressed/example.property.message.bin
index 5d50ccf..76b5371 100644
--- a/swh/graph/tests/dataset/compressed/example.property.message.bin
+++ b/swh/graph/tests/dataset/compressed/example.property.message.bin
@@ -1,7 +1,9 @@
VmVyc2lvbiAxLjA=
VmVyc2lvbiAyLjA=
+VmVyc2lvbiAyLjAgYnV0IHdpdGggbm8gYXV0aG9y
SW5pdGlhbCBjb21taXQ=
QWRkIHBhcnNlcg==
QWRkIHRlc3Rz
UmVmYWN0b3IgY29kZWJhc2U=
aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg=
+aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGgy
diff --git a/swh/graph/tests/dataset/compressed/example.property.message.offset.bin b/swh/graph/tests/dataset/compressed/example.property.message.offset.bin
index a452a83..f92396f 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.message.offset.bin and b/swh/graph/tests/dataset/compressed/example.property.message.offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.tag_name.bin b/swh/graph/tests/dataset/compressed/example.property.tag_name.bin
index ba37d43..70cc465 100644
--- a/swh/graph/tests/dataset/compressed/example.property.tag_name.bin
+++ b/swh/graph/tests/dataset/compressed/example.property.tag_name.bin
@@ -1,2 +1,3 @@
djEuMA==
djIuMA==
+djIuMC1hbm9ueW1vdXM=
diff --git a/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin b/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin
index f6f589d..784d82d 100644
Binary files a/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin and b/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.stats b/swh/graph/tests/dataset/compressed/example.stats
index 541f39a..e66eab4 100644
--- a/swh/graph/tests/dataset/compressed/example.stats
+++ b/swh/graph/tests/dataset/compressed/example.stats
@@ -1,20 +1,20 @@
-nodes=21
-arcs=23
+nodes=24
+arcs=28
loops=0
-successoravggap=4.588
-avglocality=2.522
+successoravggap=5.900
+avglocality=3.143
minoutdegree=0
maxoutdegree=3
-minoutdegreenode=1
-maxoutdegreenode=9
+minoutdegreenode=8
+maxoutdegreenode=1
dangling=7
terminal=7
-percdangling=33.333333333333336
-avgoutdegree=1.0952380952380953
-successorlogdeltastats=13,5,3,2
-successoravglogdelta=0.814
+percdangling=29.166666666666668
+avgoutdegree=1.1666666666666667
+successorlogdeltastats=13,9,3,2,1
+successoravglogdelta=0.880
minindegree=0
-maxindegree=3
-minindegreenode=20
-maxindegreenode=17
-avgindegree=1.0952380952380953
+maxindegree=4
+minindegreenode=21
+maxindegreenode=3
+avgindegree=1.1666666666666667
diff --git a/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst
index 11bf2e2..b604c27 100644
Binary files a/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst and b/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst
index 850e058..fb13661 100644
Binary files a/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst and b/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst
index 59b5b0e..c071723 100644
Binary files a/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst and b/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst
index 11bfce7..b85a6a2 100644
Binary files a/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst and b/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst
index 97db59f..4fc1630 100644
Binary files a/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst and b/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst
index 5cd8295..b6c660a 100644
Binary files a/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst and b/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/generate_dataset.py b/swh/graph/tests/dataset/generate_dataset.py
index c6abc00..9ff900c 100755
--- a/swh/graph/tests/dataset/generate_dataset.py
+++ b/swh/graph/tests/dataset/generate_dataset.py
@@ -1,358 +1,402 @@
#!/usr/bin/env python3
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# type: ignore
import argparse
import datetime
import logging
from pathlib import Path
import shutil
from swh.dataset.exporters.edges import GraphEdgesExporter
from swh.dataset.exporters.orc import ORCExporter
from swh.graph.webgraph import compress
from swh.model.model import (
Content,
Directory,
DirectoryEntry,
ObjectType,
Origin,
OriginVisit,
OriginVisitStatus,
Person,
Release,
Revision,
RevisionType,
SkippedContent,
Snapshot,
SnapshotBranch,
TargetType,
Timestamp,
TimestampWithTimezone,
)
def h(id: int, width=40) -> bytes:
return bytes.fromhex(f"{id:0{width}}")
PERSONS = [
Person(fullname=b"foo", name=b"foo", email=b""),
Person(fullname=b"bar", name=b"bar", email=b""),
Person(fullname=b"baz", name=b"baz", email=b""),
]
TEST_DATASET = [
Content(sha1_git=h(1), sha1=h(1), sha256=h(1, 64), blake2s256=h(1, 64), length=42),
Content(sha1_git=h(4), sha1=h(4), sha256=h(4, 64), blake2s256=h(4, 64), length=404),
Content(
sha1_git=h(5), sha1=h(5), sha256=h(5, 64), blake2s256=h(5, 64), length=1337
),
Content(sha1_git=h(7), sha1=h(7), sha256=h(7, 64), blake2s256=h(7, 64), length=666),
Content(
sha1_git=h(11), sha1=h(11), sha256=h(11, 64), blake2s256=h(11, 64), length=313
),
Content(
sha1_git=h(14), sha1=h(14), sha256=h(14, 64), blake2s256=h(14, 64), length=14
),
SkippedContent(
sha1_git=h(15),
sha1=h(15),
sha256=h(15, 64),
blake2s256=h(15, 64),
length=404,
status="absent",
reason="Not found",
),
Directory(
id=h(2),
entries=(
DirectoryEntry(
name=b"README.md",
perms=0o100644,
type="file",
target=h(1),
),
),
),
Directory(
id=h(6),
entries=(
DirectoryEntry(
name=b"README.md",
perms=0o100644,
type="file",
target=h(4),
),
DirectoryEntry(
name=b"parser.c",
perms=0o100644,
type="file",
target=h(5),
),
),
),
Directory(
id=h(8),
entries=(
DirectoryEntry(
name=b"README.md",
perms=0o100644,
type="file",
target=h(1),
),
DirectoryEntry(
name=b"parser.c",
perms=0o100644,
type="file",
target=h(7),
),
DirectoryEntry(
name=b"tests",
perms=0o100755,
type="dir",
target=h(6),
),
),
),
Directory(
id=h(12),
entries=(
DirectoryEntry(
name=b"README.md",
perms=0o100644,
type="file",
target=h(11),
),
DirectoryEntry(
name=b"oldproject",
perms=0o100755,
type="dir",
target=h(8),
),
),
),
Directory(
id=h(16),
entries=(
DirectoryEntry(
name=b"TODO.txt",
perms=0o100644,
type="file",
target=h(15),
),
),
),
Directory(
id=h(17),
entries=(
DirectoryEntry(
name=b"TODO.txt",
perms=0o100644,
type="file",
target=h(14),
),
DirectoryEntry(
name=b"old",
perms=0o100755,
type="dir",
target=h(16),
),
),
),
Revision(
id=h(3),
message=b"Initial commit",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111122220,
microseconds=0,
),
offset_bytes=b"+0200",
),
committer=PERSONS[0],
author=PERSONS[0],
committer_date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111122220,
microseconds=0,
),
offset_bytes=b"+0200",
),
type=RevisionType.GIT,
directory=h(2),
synthetic=False,
metadata=None,
parents=(),
),
Revision(
id=h(9),
message=b"Add parser",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111144440,
microseconds=0,
),
offset_bytes=b"+0200",
),
committer=PERSONS[1],
author=PERSONS[1],
committer_date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111155550,
microseconds=0,
),
offset_bytes=b"+0200",
),
type=RevisionType.GIT,
directory=h(8),
synthetic=False,
metadata=None,
parents=(h(3),),
),
Revision(
id=h(13),
message=b"Add tests",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111166660,
microseconds=0,
),
offset_bytes=b"+0200",
),
committer=PERSONS[1],
author=PERSONS[0],
committer_date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111166660,
microseconds=0,
),
offset_bytes=b"+0200",
),
type=RevisionType.GIT,
directory=h(12),
synthetic=False,
metadata=None,
parents=(h(9),),
),
Revision(
id=h(18),
message=b"Refactor codebase",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111177770,
microseconds=0,
),
offset_bytes=b"+0000",
),
committer=PERSONS[0],
author=PERSONS[2],
committer_date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1111177770,
microseconds=0,
),
offset_bytes=b"+0000",
),
type=RevisionType.GIT,
directory=h(17),
synthetic=False,
metadata=None,
parents=(h(13),),
),
Release(
id=h(10),
name=b"v1.0",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1234567890,
microseconds=0,
),
offset_bytes=b"+0200",
),
author=PERSONS[0],
target_type=ObjectType.REVISION,
target=h(9),
message=b"Version 1.0",
synthetic=False,
),
Release(
id=h(19),
name=b"v2.0",
date=None,
author=PERSONS[1],
target_type=ObjectType.REVISION,
target=h(18),
message=b"Version 2.0",
synthetic=False,
),
+ Release(
+ id=h(21),
+ name=b"v2.0-anonymous",
+ date=None,
+ author=None,
+ target_type=ObjectType.REVISION,
+ target=h(18),
+ message=b"Version 2.0 but with no author",
+ synthetic=False,
+ ),
Snapshot(
id=h(20),
branches={
b"refs/heads/master": SnapshotBranch(
target=h(9), target_type=TargetType.REVISION
),
b"refs/tags/v1.0": SnapshotBranch(
target=h(10), target_type=TargetType.RELEASE
),
},
),
OriginVisit(
origin="https://example.com/swh/graph",
date=datetime.datetime(
2013, 5, 7, 4, 20, 39, 369271, tzinfo=datetime.timezone.utc
),
visit=1,
type="git",
),
OriginVisitStatus(
origin="https://example.com/swh/graph",
date=datetime.datetime(
2013, 5, 7, 4, 20, 41, 369271, tzinfo=datetime.timezone.utc
),
visit=1,
type="git",
status="full",
snapshot=h(20),
metadata=None,
),
Origin(url="https://example.com/swh/graph"),
+ Snapshot(
+ id=h(22),
+ branches={
+ b"refs/heads/master": SnapshotBranch(
+ target=h(9), target_type=TargetType.REVISION
+ ),
+ b"refs/tags/v1.0": SnapshotBranch(
+ target=h(10), target_type=TargetType.RELEASE
+ ),
+ b"refs/tags/v2.0-anonymous": SnapshotBranch(
+ target=h(21), target_type=TargetType.RELEASE
+ ),
+ },
+ ),
+ OriginVisit(
+ origin="https://example.com/swh/graph2",
+ date=datetime.datetime(
+ 2013, 5, 7, 4, 20, 39, 369271, tzinfo=datetime.timezone.utc
+ ),
+ visit=1,
+ type="git",
+ ),
+ OriginVisitStatus(
+ origin="https://example.com/swh/graph2",
+ date=datetime.datetime(
+ 2013, 5, 7, 4, 20, 41, 369271, tzinfo=datetime.timezone.utc
+ ),
+ visit=1,
+ type="git",
+ status="full",
+ snapshot=h(22),
+ metadata=None,
+ ),
+ Origin(url="https://example.com/swh/graph2"),
]
def main():
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="Generate a test dataset")
parser.add_argument(
"--compress",
action="store_true",
default=False,
help="Also compress the dataset",
)
parser.add_argument("output", help="output directory", nargs="?", default=".")
args = parser.parse_args()
exporters = {"edges": GraphEdgesExporter, "orc": ORCExporter}
config = {"test_unique_file_id": "all"}
output_path = Path(args.output)
for name, exporter in exporters.items():
if (output_path / name).exists():
shutil.rmtree(output_path / name)
with exporter(config, output_path / name) as e:
for obj in TEST_DATASET:
e.process_object(obj.object_type, obj.to_dict())
if args.compress:
if (output_path / "compressed").exists():
shutil.rmtree(output_path / "compressed")
compress("example", output_path / "orc", output_path / "compressed")
if __name__ == "__main__":
main()
diff --git a/swh/graph/tests/dataset/img/example.dot b/swh/graph/tests/dataset/img/example.dot
index d1bdb1f..2963627 100644
--- a/swh/graph/tests/dataset/img/example.dot
+++ b/swh/graph/tests/dataset/img/example.dot
@@ -1,82 +1,91 @@
digraph "Software Heritage mini DAG" {
ranksep=1;
nodesep=0.5;
subgraph cnt {
01 [label="cnt:0x01"];
04 [label="cnt:0x04"];
05 [label="cnt:0x05"];
07 [label="cnt:0x07"];
11 [label="cnt:0x11"];
14 [label="cnt:0x14"];
15 [label="cnt:0x15"];
}
subgraph cluster_dir {
label="File contents";
node [shape=folder];
02 [label="dir:0x02"];
06 [label="dir:0x06"];
08 [label="dir:0x08"];
12 [label="dir:0x12"];
16 [label="dir:0x16"];
17 [label="dir:0x17"];
02 -> 01;
06 -> 04;
06 -> 05;
08 -> 01;
08 -> 06;
08 -> 07;
12 -> 08;
12 -> 11;
16 -> 15;
17 -> 14;
17 -> 16;
}
subgraph cluster_rev {
label="Revisions";
node [shape=diamond];
03 [label="rev:0x03"];
09 [label="rev:0x09"];
13 [label="rev:0x13"];
18 [label="rev:0x18"];
03 -> 02;
09 -> 08;
13 -> 12;
18 -> 17;
// horizontal rev -> rev edges
09 -> 03 [constraint=false];
13 -> 09 [constraint=false];
18 -> 13 [constraint=false];
}
subgraph cluster_rel {
label="Releases";
node [shape=octagon];
10 [label="rel:0x10"];
19 [label="rel:0x19"];
+ 21 [label="rel:0x21"];
10 -> 09;
19 -> 18;
+ 21 -> 18;
}
subgraph cluster_snp {
label="Snapshots";
node [shape=doubleoctagon];
20 [label="snp:0x20"];
+ 22 [label="snp:0x22"];
20 -> 09;
20 -> 10;
+
+ 22 -> 09;
+ 22 -> 10;
+ 22 -> 21;
}
subgraph cluster_ori {
label="Origins";
node [shape=egg];
- 21 [label="ori:0x21"];
+ ori1 [label="ori:8340"];
+ ori2 [label="ori:8f50"];
- 21 -> 20;
+ ori1 -> 20;
+ ori2 -> 22;
}
}
diff --git a/swh/graph/tests/dataset/orc/content/content-all.orc b/swh/graph/tests/dataset/orc/content/content-all.orc
index b038074..68f2677 100644
Binary files a/swh/graph/tests/dataset/orc/content/content-all.orc and b/swh/graph/tests/dataset/orc/content/content-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/directory/directory-all.orc b/swh/graph/tests/dataset/orc/directory/directory-all.orc
index 2df504e..cb74397 100644
Binary files a/swh/graph/tests/dataset/orc/directory/directory-all.orc and b/swh/graph/tests/dataset/orc/directory/directory-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc b/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc
index 1a3d9f4..6d54b41 100644
Binary files a/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc and b/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin/origin-all.orc b/swh/graph/tests/dataset/orc/origin/origin-all.orc
index cec803a..fd49daa 100644
Binary files a/swh/graph/tests/dataset/orc/origin/origin-all.orc and b/swh/graph/tests/dataset/orc/origin/origin-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc b/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc
index c7965bb..d338904 100644
Binary files a/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc and b/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc b/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc
index 0a19cb1..9b9ef8c 100644
Binary files a/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc and b/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/release/release-all.orc b/swh/graph/tests/dataset/orc/release/release-all.orc
index 888fa82..b947717 100644
Binary files a/swh/graph/tests/dataset/orc/release/release-all.orc and b/swh/graph/tests/dataset/orc/release/release-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/revision/revision-all.orc b/swh/graph/tests/dataset/orc/revision/revision-all.orc
index 8c186d1..39e383d 100644
Binary files a/swh/graph/tests/dataset/orc/revision/revision-all.orc and b/swh/graph/tests/dataset/orc/revision/revision-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc b/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc
index 05a6b8d..66f46a7 100644
Binary files a/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc and b/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc b/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc
index 92f1748..df5742d 100644
Binary files a/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc and b/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc b/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc
index ed19277..6e80a38 100644
Binary files a/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc and b/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc b/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc
index 41bee79..5dffb82 100644
Binary files a/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc and b/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc b/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc
index c3a11b6..d7bd487 100644
Binary files a/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc and b/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc differ
diff --git a/swh/graph/tests/test_cli.py b/swh/graph/tests/test_cli.py
index eceb164..b9c2250 100644
--- a/swh/graph/tests/test_cli.py
+++ b/swh/graph/tests/test_cli.py
@@ -1,58 +1,58 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Dict
from click.testing import CliRunner
import yaml
from swh.graph.cli import graph_cli_group
DATA_DIR = Path(__file__).parents[0] / "dataset"
def read_properties(properties_fname) -> Dict[str, str]:
"""read a Java .properties file"""
with open(properties_fname) as f:
keyvalues = (
line.split("=", maxsplit=1)
for line in f
if not line.strip().startswith("#")
)
return dict((k.strip(), v.strip()) for (k, v) in keyvalues)
def test_pipeline():
"""run full compression pipeline"""
# bare bone configuration, to allow testing the compression pipeline
# with minimum RAM requirements on trivial graphs
config = {"graph": {"compress": {"batch_size": 1000}}}
runner = CliRunner()
with TemporaryDirectory(suffix=".swh-graph-test") as tmpdir:
config_path = Path(tmpdir, "config.yml")
config_path.write_text(yaml.dump(config))
result = runner.invoke(
graph_cli_group,
[
"--config-file",
config_path,
"compress",
"--input-dataset",
DATA_DIR / "orc",
"--output-directory",
tmpdir,
"--graph-name",
"example",
],
)
assert result.exit_code == 0, result
properties = read_properties(Path(tmpdir) / "example.properties")
- assert int(properties["nodes"]) == 21
- assert int(properties["arcs"]) == 23
+ assert int(properties["nodes"]) == 24
+ assert int(properties["arcs"]) == 28
diff --git a/swh/graph/tests/test_grpc.py b/swh/graph/tests/test_grpc.py
index a98c549..73ae271 100644
--- a/swh/graph/tests/test_grpc.py
+++ b/swh/graph/tests/test_grpc.py
@@ -1,129 +1,130 @@
# Copyright (c) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
from google.protobuf.field_mask_pb2 import FieldMask
from swh.graph.grpc.swhgraph_pb2 import (
GraphDirection,
NodeFilter,
StatsRequest,
TraversalRequest,
)
TEST_ORIGIN_ID = "swh:1:ori:{}".format(
hashlib.sha1(b"https://example.com/swh/graph").hexdigest()
)
def test_stats(graph_grpc_stub):
stats = graph_grpc_stub.Stats(StatsRequest())
- assert stats.num_nodes == 21
- assert stats.num_edges == 23
+ assert stats.num_nodes == 24
+ assert stats.num_edges == 28
assert isinstance(stats.compression_ratio, float)
assert isinstance(stats.bits_per_node, float)
assert isinstance(stats.bits_per_edge, float)
assert isinstance(stats.avg_locality, float)
assert stats.indegree_min == 0
- assert stats.indegree_max == 3
+ assert stats.indegree_max == 4
assert isinstance(stats.indegree_avg, float)
assert stats.outdegree_min == 0
assert stats.outdegree_max == 3
assert isinstance(stats.outdegree_avg, float)
def test_leaves(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=[TEST_ORIGIN_ID],
mask=FieldMask(paths=["swhid"]),
return_nodes=NodeFilter(types="cnt"),
)
)
actual = [node.swhid for node in request]
expected = [
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
"swh:1:cnt:0000000000000000000000000000000000000007",
]
assert set(actual) == set(expected)
def test_neighbors(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=["swh:1:rev:0000000000000000000000000000000000000009"],
direction=GraphDirection.BACKWARD,
mask=FieldMask(paths=["swhid"]),
min_depth=1,
max_depth=1,
)
)
actual = [node.swhid for node in request]
expected = [
+ "swh:1:snp:0000000000000000000000000000000000000022",
"swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000013",
]
assert set(actual) == set(expected)
def test_visit_nodes(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=["swh:1:rel:0000000000000000000000000000000000000010"],
mask=FieldMask(paths=["swhid"]),
edges="rel:rev,rev:rev",
)
)
actual = [node.swhid for node in request]
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
]
assert set(actual) == set(expected)
def test_visit_nodes_filtered(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=["swh:1:rel:0000000000000000000000000000000000000010"],
mask=FieldMask(paths=["swhid"]),
return_nodes=NodeFilter(types="dir"),
)
)
actual = [node.swhid for node in request]
expected = [
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
]
assert set(actual) == set(expected)
def test_visit_nodes_filtered_star(graph_grpc_stub):
request = graph_grpc_stub.Traverse(
TraversalRequest(
src=["swh:1:rel:0000000000000000000000000000000000000010"],
mask=FieldMask(paths=["swhid"]),
)
)
actual = [node.swhid for node in request]
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
]
assert set(actual) == set(expected)
diff --git a/swh/graph/tests/test_http_client.py b/swh/graph/tests/test_http_client.py
index 1b8cb6e..8b5a6d0 100644
--- a/swh/graph/tests/test_http_client.py
+++ b/swh/graph/tests/test_http_client.py
@@ -1,450 +1,456 @@
# Copyright (c) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
import pytest
from pytest import raises
from swh.core.api import RemoteException
from swh.graph.http_client import GraphArgumentException
TEST_ORIGIN_ID = "swh:1:ori:{}".format(
hashlib.sha1(b"https://example.com/swh/graph").hexdigest()
)
def test_stats(graph_client):
stats = graph_client.stats()
- assert stats["num_nodes"] == 21
- assert stats["num_edges"] == 23
+ assert stats["num_nodes"] == 24
+ assert stats["num_edges"] == 28
assert isinstance(stats["compression_ratio"], float)
assert isinstance(stats["bits_per_node"], float)
assert isinstance(stats["bits_per_edge"], float)
assert isinstance(stats["avg_locality"], float)
assert stats["indegree_min"] == 0
- assert stats["indegree_max"] == 3
+ assert stats["indegree_max"] == 4
assert isinstance(stats["indegree_avg"], float)
assert stats["outdegree_min"] == 0
assert stats["outdegree_max"] == 3
assert isinstance(stats["outdegree_avg"], float)
def test_leaves(graph_client):
actual = list(graph_client.leaves(TEST_ORIGIN_ID))
expected = [
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
"swh:1:cnt:0000000000000000000000000000000000000007",
]
assert set(actual) == set(expected)
@pytest.mark.parametrize("max_matching_nodes", [0, 1, 2, 3, 4, 5, 10, 1 << 31])
def test_leaves_with_limit(graph_client, max_matching_nodes):
actual = list(
graph_client.leaves(TEST_ORIGIN_ID, max_matching_nodes=max_matching_nodes)
)
expected = [
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
"swh:1:cnt:0000000000000000000000000000000000000007",
]
if max_matching_nodes == 0:
assert set(actual) == set(expected)
else:
assert set(actual) <= set(expected)
assert len(actual) == min(4, max_matching_nodes)
def test_neighbors(graph_client):
actual = list(
graph_client.neighbors(
"swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
)
)
expected = [
+ "swh:1:snp:0000000000000000000000000000000000000022",
"swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000013",
]
assert set(actual) == set(expected)
def test_visit_nodes(graph_client):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010",
edges="rel:rev,rev:rev",
)
)
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
]
assert set(actual) == set(expected)
@pytest.mark.parametrize("max_matching_nodes", [0, 1, 2, 3, 4, 5, 10, 1 << 31])
def test_visit_nodes_limit(graph_client, max_matching_nodes):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010",
edges="rel:rev,rev:rev",
max_matching_nodes=max_matching_nodes,
)
)
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
]
if max_matching_nodes == 0:
assert set(actual) == set(expected)
else:
assert set(actual) <= set(expected)
assert len(actual) == min(3, max_matching_nodes)
def test_visit_nodes_filtered(graph_client):
    """return_types="dir" restricts the reported nodes to directories."""
    nodes = graph_client.visit_nodes(
        "swh:1:rel:0000000000000000000000000000000000000010",
        return_types="dir",
    )
    assert set(nodes) == {
        "swh:1:dir:0000000000000000000000000000000000000002",
        "swh:1:dir:0000000000000000000000000000000000000008",
        "swh:1:dir:0000000000000000000000000000000000000006",
    }
@pytest.mark.parametrize("max_matching_nodes", [0, 1, 2, 3, 4, 5, 10, 1 << 31])
def test_visit_nodes_filtered_limit(graph_client, max_matching_nodes):
    """return_types="dir" combined with max_matching_nodes caps how many
    directories are returned (0 meaning no cap)."""
    expected = {
        "swh:1:dir:0000000000000000000000000000000000000002",
        "swh:1:dir:0000000000000000000000000000000000000008",
        "swh:1:dir:0000000000000000000000000000000000000006",
    }
    result = list(
        graph_client.visit_nodes(
            "swh:1:rel:0000000000000000000000000000000000000010",
            return_types="dir",
            max_matching_nodes=max_matching_nodes,
        )
    )
    if max_matching_nodes:
        assert set(result) <= expected
        assert len(result) == min(3, max_matching_nodes)
    else:
        # Unlimited: all three directories reachable from the release.
        assert set(result) == expected
def test_visit_nodes_filtered_star(graph_client):
    """return_types="*" reports every node encountered during the visit,
    regardless of type."""
    nodes = graph_client.visit_nodes(
        "swh:1:rel:0000000000000000000000000000000000000010",
        return_types="*",
    )
    assert set(nodes) == {
        "swh:1:rel:0000000000000000000000000000000000000010",
        "swh:1:rev:0000000000000000000000000000000000000009",
        "swh:1:rev:0000000000000000000000000000000000000003",
        "swh:1:dir:0000000000000000000000000000000000000002",
        "swh:1:cnt:0000000000000000000000000000000000000001",
        "swh:1:dir:0000000000000000000000000000000000000008",
        "swh:1:cnt:0000000000000000000000000000000000000007",
        "swh:1:dir:0000000000000000000000000000000000000006",
        "swh:1:cnt:0000000000000000000000000000000000000004",
        "swh:1:cnt:0000000000000000000000000000000000000005",
    }
def test_visit_edges(graph_client):
    """visit_edges() from rel 10 with rel:rev,rev:rev,rev:dir must report each
    traversed (src, dst) edge exactly once."""
    actual = list(
        graph_client.visit_edges(
            "swh:1:rel:0000000000000000000000000000000000000010",
            edges="rel:rev,rev:rev,rev:dir",
        )
    )
    rel10 = "swh:1:rel:0000000000000000000000000000000000000010"
    rev09 = "swh:1:rev:0000000000000000000000000000000000000009"
    rev03 = "swh:1:rev:0000000000000000000000000000000000000003"
    dir08 = "swh:1:dir:0000000000000000000000000000000000000008"
    dir02 = "swh:1:dir:0000000000000000000000000000000000000002"
    assert set(actual) == {
        (rel10, rev09),
        (rev09, rev03),
        (rev09, dir08),
        (rev03, dir02),
    }
def test_visit_edges_limited(graph_client):
    """visit_edges() with max_edges=4 must not yield edges beyond the budget."""
    actual = list(
        graph_client.visit_edges(
            "swh:1:rel:0000000000000000000000000000000000000010",
            max_edges=4,
            edges="rel:rev,rev:rev,rev:dir",
        )
    )
    expected = [
        (
            "swh:1:rel:0000000000000000000000000000000000000010",
            "swh:1:rev:0000000000000000000000000000000000000009",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rev:0000000000000000000000000000000000000003",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:dir:0000000000000000000000000000000000000008",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000003",
            "swh:1:dir:0000000000000000000000000000000000000002",
        ),
    ]

    # As there are four valid answers (up to reordering), we cannot check for
    # equality. Instead, we check the client returned either
    # * all edges but one, or
    # * all edges
    # and the right answer depends on which edges were traversed, which is
    # non-deterministic
    assert set(actual).issubset(set(expected))
    assert 3 <= len(actual) <= 4
def test_visit_edges_diamond_pattern(graph_client):
    """An unrestricted visit from rev 9 must report every edge of the
    diamond-shaped subgraph below it, each exactly once (compared as a set)."""
    actual = list(
        graph_client.visit_edges(
            "swh:1:rev:0000000000000000000000000000000000000009",
            edges="*",
        )
    )

    def swhid(kind, n):
        # Build a test SWHID such as "swh:1:rev:00...09" from type and number.
        return f"swh:1:{kind}:{n:040d}"

    expected = {
        (swhid("rev", 9), swhid("rev", 3)),
        (swhid("rev", 9), swhid("dir", 8)),
        (swhid("rev", 3), swhid("dir", 2)),
        (swhid("dir", 2), swhid("cnt", 1)),
        (swhid("dir", 8), swhid("cnt", 1)),
        (swhid("dir", 8), swhid("cnt", 7)),
        (swhid("dir", 8), swhid("dir", 6)),
        (swhid("dir", 6), swhid("cnt", 4)),
        (swhid("dir", 6), swhid("cnt", 5)),
    }
    assert set(actual) == expected
@pytest.mark.skip(reason="currently disabled due to T1969")
def test_walk(graph_client):
    """walk() returns a path from the source to a node of the requested type;
    a positive limit keeps the head of the path, a negative one its tail."""
    src = "swh:1:dir:0000000000000000000000000000000000000016"
    common_kwargs = {
        "edges": "dir:dir,dir:rev,rev:*",
        "direction": "backward",
        "traversal": "bfs",
    }
    full_path = [
        "swh:1:dir:0000000000000000000000000000000000000016",
        "swh:1:dir:0000000000000000000000000000000000000017",
        "swh:1:rev:0000000000000000000000000000000000000018",
        "swh:1:rel:0000000000000000000000000000000000000019",
    ]

    # No limit: the whole path is returned.
    result = list(graph_client.walk(src, "rel", **common_kwargs))
    assert set(result) == set(full_path)

    # limit=-1: only the final node of the walk.
    result = list(graph_client.walk(src, "rel", limit=-1, **common_kwargs))
    assert set(result) == {full_path[-1]}

    # limit=2: only the first two nodes.
    result = list(graph_client.walk(src, "rel", limit=2, **common_kwargs))
    assert set(result) == set(full_path[:2])
@pytest.mark.skip(reason="Random walk is deprecated")
def test_random_walk_dst_is_type(graph_client):
    """as the walk is random, we test a visit from a cnt node to a release
    reachable from every single path in the backward graph, and only check the
    final node of the path (i.e., the release)
    """
    src = "swh:1:cnt:0000000000000000000000000000000000000015"
    expected_root = "swh:1:rel:0000000000000000000000000000000000000019"

    walk = list(graph_client.random_walk(src, "rel", direction="backward"))
    assert len(walk) > 1  # no release directly links to a content
    assert walk[0] == src
    assert walk[-1] == expected_root

    # limit=-1: only the final node is kept.
    walk = list(graph_client.random_walk(src, "rel", direction="backward", limit=-1))
    assert walk == [expected_root]

    # limit=-2: the last two nodes.
    walk = list(graph_client.random_walk(src, "rel", direction="backward", limit=-2))
    assert len(walk) == 2
    assert walk[-1] == expected_root

    # limit=3: the first three nodes.
    walk = list(graph_client.random_walk(src, "rel", direction="backward", limit=3))
    assert len(walk) == 3
@pytest.mark.skip(reason="Random walk is deprecated")
def test_random_walk_dst_is_node(graph_client):
    """Same as test_random_walk_dst_is_type, but we target the specific release
    node instead of a type
    """
    src = "swh:1:cnt:0000000000000000000000000000000000000015"
    dst = "swh:1:rel:0000000000000000000000000000000000000019"

    walk = list(graph_client.random_walk(src, dst, direction="backward"))
    assert len(walk) > 1  # no origin directly links to a content
    assert walk[0] == src
    assert walk[-1] == dst

    # limit=-1: only the destination node is kept.
    walk = list(graph_client.random_walk(src, dst, direction="backward", limit=-1))
    assert walk == [dst]

    # limit=-2: the last two nodes.
    walk = list(graph_client.random_walk(src, dst, direction="backward", limit=-2))
    assert len(walk) == 2
    assert walk[-1] == dst

    # limit=3: the first three nodes.
    walk = list(graph_client.random_walk(src, dst, direction="backward", limit=3))
    assert len(walk) == 3
def test_count(graph_client):
    """Check the count_* endpoints against known sizes in the test graph."""
    actual = graph_client.count_leaves(TEST_ORIGIN_ID)
    assert actual == 4

    actual = graph_client.count_visit_nodes(
        "swh:1:rel:0000000000000000000000000000000000000010", edges="rel:rev,rev:rev"
    )
    assert actual == 3

    actual = graph_client.count_neighbors(
        "swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
    )
    # rev 9 has four backward neighbors: snp 22, snp 20, rel 10 and rev 13
    # (see test_neighbors).
    assert actual == 4
@pytest.mark.parametrize("max_matching_nodes", [0, 1, 2, 3, 4, 5, 10, 1 << 31])
def test_count_with_limit(graph_client, max_matching_nodes):
    """count_leaves() caps its result at max_matching_nodes, 0 meaning no cap."""
    count = graph_client.count_leaves(
        TEST_ORIGIN_ID, max_matching_nodes=max_matching_nodes
    )
    if max_matching_nodes:
        assert count == min(4, max_matching_nodes)
    else:
        # The test origin has exactly 4 leaves.
        assert count == 4
def test_param_validation(graph_client):
    """Server-side errors surface as GraphArgumentException, carrying the HTTP
    status: 404 for unknown SWHIDs, 400 for malformed input."""
    # Well-formed but nonexistent SWHID -> 404
    with raises(GraphArgumentException) as e:
        list(graph_client.leaves("swh:1:rel:00ffffffff000000000000000000000000000010"))
    if e.value.response:
        assert e.value.response.status_code == 404

    # Malformed SWHID -> 400
    with raises(GraphArgumentException) as e:
        list(
            graph_client.neighbors("swh:1:rel:00ffffffff00000000zzzzzzz000000000000010")
        )
    if e.value.response:
        assert e.value.response.status_code == 400

    # Malformed edge specification -> 400
    with raises(GraphArgumentException) as e:
        list(
            graph_client.visit_nodes(
                "swh:1:dir:0000000000000000000000000000000000000016",
                edges="dir:notanodetype,dir:rev,rev:*",
                direction="backward",
            )
        )
    if e.value.response:
        assert e.value.response.status_code == 400

    # Invalid traversal direction -> 400
    with raises(GraphArgumentException) as e:
        list(
            graph_client.visit_nodes(
                "swh:1:dir:0000000000000000000000000000000000000016",
                edges="dir:dir,dir:rev,rev:*",
                direction="notadirection",
            )
        )
    if e.value.response:
        assert e.value.response.status_code == 400
@pytest.mark.skip(reason="currently disabled due to T1969")
def test_param_validation_walk(graph_client):
    """test validation of walk-specific parameters only"""
    # An unknown traversal order must be rejected by the server with a 400.
    with raises(RemoteException) as excinfo:
        list(
            graph_client.walk(
                "swh:1:dir:0000000000000000000000000000000000000016",
                "rel",
                edges="dir:dir,dir:rev,rev:*",
                direction="backward",
                traversal="notatraversalorder",
            )
        )
    assert excinfo.value.response.status_code == 400
diff --git a/swh/graph/tests/test_luigi.py b/swh/graph/tests/test_luigi.py
index 232479e..1cce95c 100644
--- a/swh/graph/tests/test_luigi.py
+++ b/swh/graph/tests/test_luigi.py
@@ -1,36 +1,36 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
from pathlib import Path
from swh.graph.luigi import CompressGraph
from .test_cli import read_properties
DATA_DIR = Path(__file__).parents[0] / "dataset"
def test_compressgraph(tmpdir):
    """Run CompressGraph on the example ORC dataset and check the resulting
    graph size and metadata files."""
    tmpdir = Path(tmpdir)

    task = CompressGraph(
        local_export_path=DATA_DIR,
        local_graph_path=tmpdir / "compressed_graph",
        batch_size=1000,  # go fast on the trivial dataset
    )

    task.run()

    # Node/arc counts of the example dataset.
    properties = read_properties(tmpdir / "compressed_graph" / "graph.properties")
    assert int(properties["nodes"]) == 24
    assert int(properties["arcs"]) == 28

    # export.json must be copied verbatim from the dataset.
    export_meta_path = tmpdir / "compressed_graph/meta/export.json"
    assert export_meta_path.read_bytes() == (DATA_DIR / "meta/export.json").read_bytes()

    # compression.json records the configuration used for this run.
    compression_meta_path = tmpdir / "compressed_graph/meta/compression.json"
    assert json.load(compression_meta_path.open())[0]["conf"] == {"batch_size": 1000}
diff --git a/swh/graph/tests/test_origin_contributors.py b/swh/graph/tests/test_origin_contributors.py
index b696af3..dccb45c 100644
--- a/swh/graph/tests/test_origin_contributors.py
+++ b/swh/graph/tests/test_origin_contributors.py
@@ -1,180 +1,187 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
from pathlib import Path
import subprocess
from swh.graph.luigi import (
DeanonymizeOriginContributors,
ExportDeanonymizationTable,
ListOriginContributors,
)
from swh.model.model import (
ObjectType,
Person,
Release,
Revision,
RevisionType,
TimestampWithTimezone,
)
from .test_toposort import EXPECTED as TOPOLOGICAL_ORDER
DATA_DIR = Path(__file__).parents[0] / "dataset"
# FIXME: do not hardcode ids here; they should be dynamically loaded
# from the test graph
# NOTE: stray diff "+" markers that had leaked into these string literals
# (corrupting the CSV fixtures) have been removed.
ORIGIN_CONTRIBUTORS = """\
origin_SWHID,person_id
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,0
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,2
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,0
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,null
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,2
"""

DEANONYMIZATION_TABLE = """\
sha256_base64,base64,escaped
8qhF7WQ2bmeoRbZipAaqtNw6QdOCDcpggLWCQLzITsI=,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe <jdoe@example.org>
aZA9TeLhVzqVDQHQOd53UABAZYyek0tY3vTo6VSlA4U=,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5jb20+,Jane Doe <jdoe@example.com>
UaCrgAZBvn1LBd2sAinmdNvAX/G4sjo1aJA9GDd9UUs=,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5uZXQ+,Jane Doe <jdoe@example.net>
"""  # noqa

PERSONS = """\
aZA9TeLhVzqVDQHQOd53UABAZYyek0tY3vTo6VSlA4U=
UaCrgAZBvn1LBd2sAinmdNvAX/G4sjo1aJA9GDd9UUs=
8qhF7WQ2bmeoRbZipAaqtNw6QdOCDcpggLWCQLzITsI=
"""

DEANONYMIZED_ORIGIN_CONTRIBUTORS = """\
origin_SWHID,person_base64,person_escaped
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5jb20+,Jane Doe <jdoe@example.com>
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe <jdoe@example.org>
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5jb20+,Jane Doe <jdoe@example.com>
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5uZXQ+,Jane Doe <jdoe@example.net>
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe <jdoe@example.org>
"""  # noqa
def test_list_origin_contributors(tmpdir):
    """Run ListOriginContributors on the example graph and compare its CSV
    output with the expected contributors table."""
    tmpdir = Path(tmpdir)
    topological_order_path = tmpdir / "topo_order.csv.zst"
    origin_contributors_path = tmpdir / "origin_contributors.csv.zst"

    # The task consumes a zstd-compressed topological order; build it from
    # the fixture shared with test_toposort.
    subprocess.run(
        ["zstdmt", "-o", topological_order_path],
        input=TOPOLOGICAL_ORDER.encode(),
        check=True,
    )

    ListOriginContributors(
        local_graph_path=DATA_DIR / "compressed",
        topological_order_path=topological_order_path,
        origin_contributors_path=origin_contributors_path,
        graph_name="example",
    ).run()

    output = subprocess.check_output(["zstdcat", origin_contributors_path]).decode()
    assert output == ORIGIN_CONTRIBUTORS
def test_export_deanonymization_table(tmpdir, swh_storage_postgresql, swh_storage):
    """Populate a storage with one release and one revision, export the
    person-hash-to-fullname table, and check it against DEANONYMIZATION_TABLE."""
    tmpdir = Path(tmpdir)
    now = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc)
    )

    release = Release(
        name=b"v1.0",
        message=b"first release",
        author=Person.from_fullname(b"John Doe <jdoe@example.org>"),
        target=b"\x00" * 20,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    revision = Revision(
        message=b"first commit",
        author=Person.from_fullname(b"Jane Doe <jdoe@example.com>"),
        committer=Person.from_fullname(b"Jane Doe <jdoe@example.net>"),
        date=now,
        committer_date=now,
        directory=b"\x00" * 20,
        type=RevisionType.GIT,
        synthetic=True,
    )
    swh_storage.release_add([release])
    swh_storage.revision_add([revision])

    deanonymization_table_path = tmpdir / "person_sha256_to_names.csv.zst"
    ExportDeanonymizationTable(
        storage_dsn=swh_storage_postgresql.dsn,
        deanonymization_table_path=deanonymization_table_path,
    ).run()

    csv_text = subprocess.check_output(["zstdcat", deanonymization_table_path]).decode()
    (header, *rows) = csv_text.split("\n")
    (expected_header, *expected_rows) = DEANONYMIZATION_TABLE.split("\n")
    assert header == expected_header
    # Export order is unspecified: compare as sets after dropping the empty
    # element produced by the trailing newline.
    assert rows.pop() == "", "Missing trailing newline"
    expected_rows.pop()
    assert set(rows) == set(expected_rows)
def test_deanonymize_origin_contributors(tmpdir):
    """Compress the three input CSV fixtures, run DeanonymizeOriginContributors,
    and check the joined (deanonymized) output."""
    tmpdir = Path(tmpdir)
    persons_path = tmpdir / "example.persons.csv.zst"
    origin_contributors_path = tmpdir / "origin_contributors.csv.zst"
    deanonymization_table_path = tmpdir / "person_sha256_to_names.csv.zst"
    deanonymized_origin_contributors_path = (
        tmpdir / "origin_contributors.deanonymized.csv.zst"
    )

    # zstd-compress each fixture to the path the task expects.
    for path, content in [
        (origin_contributors_path, ORIGIN_CONTRIBUTORS),
        (persons_path, PERSONS),
        (deanonymization_table_path, DEANONYMIZATION_TABLE),
    ]:
        subprocess.run(["zstdmt", "-o", path], input=content.encode(), check=True)

    DeanonymizeOriginContributors(
        local_graph_path=tmpdir,
        origin_contributors_path=origin_contributors_path,
        deanonymization_table_path=deanonymization_table_path,
        deanonymized_origin_contributors_path=deanonymized_origin_contributors_path,
        graph_name="example",
    ).run()

    csv_text = subprocess.check_output(
        ["zstdcat", deanonymized_origin_contributors_path]
    ).decode()
    assert csv_text == DEANONYMIZED_ORIGIN_CONTRIBUTORS
diff --git a/swh/graph/tests/test_toposort.py b/swh/graph/tests/test_toposort.py
index 1c5e2ed..6d35628 100644
--- a/swh/graph/tests/test_toposort.py
+++ b/swh/graph/tests/test_toposort.py
@@ -1,59 +1,67 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
import subprocess
from swh.graph.luigi import TopoSort
DATA_DIR = Path(__file__).parents[0] / "dataset"
# FIXME: the order of sample ancestors should not be hardcoded
# FIXME: swh:1:snp:0000000000000000000000000000000000000022,3,1,swh has three possible
# sample ancestors; they should not be hardcoded here
EXPECTED = """\
SWHID,ancestors,successors,sample_ancestor1,sample_ancestor2
swh:1:rev:0000000000000000000000000000000000000003,0,1,,
swh:1:rev:0000000000000000000000000000000000000009,1,4,swh:1:rev:0000000000000000000000000000000000000003,
swh:1:rel:0000000000000000000000000000000000000010,1,2,swh:1:rev:0000000000000000000000000000000000000009,
swh:1:snp:0000000000000000000000000000000000000020,2,1,swh:1:rev:0000000000000000000000000000000000000009,swh:1:rel:0000000000000000000000000000000000000010
swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,1,0,swh:1:snp:0000000000000000000000000000000000000020,
swh:1:rev:0000000000000000000000000000000000000013,1,1,swh:1:rev:0000000000000000000000000000000000000009,
swh:1:rev:0000000000000000000000000000000000000018,1,2,swh:1:rev:0000000000000000000000000000000000000013,
swh:1:rel:0000000000000000000000000000000000000019,1,0,swh:1:rev:0000000000000000000000000000000000000018,
swh:1:rel:0000000000000000000000000000000000000021,1,1,swh:1:rev:0000000000000000000000000000000000000018,
swh:1:snp:0000000000000000000000000000000000000022,3,1,swh:1:rev:0000000000000000000000000000000000000009,swh:1:rel:0000000000000000000000000000000000000010
swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1,0,swh:1:snp:0000000000000000000000000000000000000022,
"""
def test_toposort(tmpdir):
    """Run the TopoSort task on the example graph and check the output CSV.

    The topological order is not unique, so rows are compared as a set; only
    the first row (the sole node with no ancestor) and the last row (one of
    the nodes with no successor) are constrained positionally."""
    tmpdir = Path(tmpdir)
    topological_order_path = tmpdir / "topo_order.csv.zst"
    task = TopoSort(
        local_graph_path=DATA_DIR / "compressed",
        topological_order_path=topological_order_path,
        graph_name="example",
    )
    task.run()
    csv_text = subprocess.check_output(["zstdcat", topological_order_path]).decode()
    (header, *rows) = csv_text.split("\n")
    (expected_header, *expected_lines) = EXPECTED.split("\n")
    assert header == expected_header
    # The only possible first line
    assert rows[0] == "swh:1:rev:0000000000000000000000000000000000000003,0,1,,"
    assert set(rows) == set(expected_lines)
    assert rows.pop() == "", "Missing trailing newline"
    # The only three possible last lines
    assert rows[-1] in [
        "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,1,0"
        ",swh:1:snp:0000000000000000000000000000000000000020,",
        "swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1,0"
        ",swh:1:snp:0000000000000000000000000000000000000022,",
        "swh:1:rel:0000000000000000000000000000000000000019,1,0"
        ",swh:1:rev:0000000000000000000000000000000000000018,",
    ]

File Metadata

Mime Type
application/octet-stream
Expires
Fri, Apr 26, 2:27 PM (2 d)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3146652

Event Timeline