Page MenuHomeSoftware Heritage

No OneTemporary

This document is not UTF8. It was detected as ISO-8859-1 (Latin 1) and converted to UTF8 for display.
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java b/java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java
new file mode 100644
index 0000000..3e094e8
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java
@@ -0,0 +1,51 @@
package org.softwareheritage.graph.utils;

import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.io.BinIO;

import java.io.File;
import java.io.IOException;

/**
 * CLI program used to compose two on-disk permutations.
 *
 * It takes two on-disk permutations as parameters, p1 and p2, and writes on disk (p1 o p2) at the
 * given location. This is useful for multi-step compression (e.g. Unordered -> BFS -> LLP), as it
 * can be used to merge all the intermediate permutations.
 */
public class ComposePermutations {
    /**
     * Parses the command-line arguments.
     *
     * @param args raw CLI arguments: firstPermutation, secondPermutation, outputPermutation
     * @return the parsed JSAP configuration; never {@code null} — on any parse error or
     *         JSAP setup failure this method terminates the JVM with exit code 1 instead of
     *         returning, so {@code main} can use the result unconditionally
     */
    private static JSAPResult parseArgs(String[] args) {
        try {
            SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
                    new UnflaggedOption("firstPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "The first permutation"),
                    new UnflaggedOption("secondPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "The second permutation"),
                    new UnflaggedOption("outputPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "The output permutation"),});

            JSAPResult config = jsap.parse(args);
            if (jsap.messagePrinted()) {
                System.exit(1);
            }
            return config;
        } catch (JSAPException e) {
            // Previously this fell through and returned null, which made main() fail
            // later with a NullPointerException on config.getString(...). Exit here
            // with a clear error instead.
            e.printStackTrace();
            System.exit(1);
            return null; // unreachable; keeps the compiler happy
        }
    }

    /**
     * Loads the two permutations given on the command line, composes them in place
     * (first o second) and stores the result at the output location.
     *
     * @param args CLI arguments, see {@link #parseArgs(String[])}
     * @throws IOException if any of the permutation files cannot be read or written
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        JSAPResult config = parseArgs(args);
        String firstPermFilename = config.getString("firstPermutation");
        String secondPermFilename = config.getString("secondPermutation");
        String outputPermFilename = config.getString("outputPermutation");

        // Permutations are stored as big (two-dimensional) long arrays on disk.
        long[][] firstPerm = BinIO.loadLongsBig(new File(firstPermFilename));
        long[][] secondPerm = BinIO.loadLongsBig(new File(secondPermFilename));

        // composePermutationsInPlace reuses the input arrays' storage for the result.
        long[][] outputPerm = Util.composePermutationsInPlace(firstPerm, secondPerm);

        BinIO.storeLongs(outputPerm, outputPermFilename);
    }
}
diff --git a/swh/graph/cli.py b/swh/graph/cli.py
index 8bd11d3..7d399ac 100644
--- a/swh/graph/cli.py
+++ b/swh/graph/cli.py
@@ -1,446 +1,447 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from pathlib import Path
import sys
from typing import TYPE_CHECKING, Any, Dict, Set, Tuple
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
import click
from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup
from swh.core.cli import swh as swh_cli_group
if TYPE_CHECKING:
from swh.graph.webgraph import CompressionStep # noqa
class StepOption(click.ParamType):
"""click type for specifying a compression step on the CLI
parse either individual steps, specified as step names or integers, or step
ranges
"""
name = "compression step"
def convert(self, value, param, ctx): # type: (...) -> Set[CompressionStep]
from swh.graph.webgraph import COMP_SEQ, CompressionStep # noqa
steps: Set[CompressionStep] = set()
specs = value.split(",")
for spec in specs:
if "-" in spec: # step range
(raw_l, raw_r) = spec.split("-", maxsplit=1)
if raw_l == "": # no left endpoint
raw_l = COMP_SEQ[0].name
if raw_r == "": # no right endpoint
raw_r = COMP_SEQ[-1].name
l_step = self.convert(raw_l, param, ctx)
r_step = self.convert(raw_r, param, ctx)
if len(l_step) != 1 or len(r_step) != 1:
self.fail(f"invalid step specification: {value}, " f"see --help")
l_idx = l_step.pop()
r_idx = r_step.pop()
steps = steps.union(
set(CompressionStep(i) for i in range(l_idx.value, r_idx.value + 1))
)
else: # singleton step
try:
steps.add(CompressionStep(int(spec))) # integer step
except ValueError:
try:
steps.add(CompressionStep[spec.upper()]) # step name
except KeyError:
self.fail(
f"invalid step specification: {value}, " f"see --help"
)
return steps
class PathlibPath(click.Path):
"""A Click path argument that returns a pathlib Path, not a string"""
def convert(self, value, param, ctx):
return Path(super().convert(value, param, ctx))
DEFAULT_CONFIG: Dict[str, Tuple[str, Any]] = {"graph": ("dict", {})}
@swh_cli_group.group(name="graph", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup)
@click.option(
"--config-file",
"-C",
default=None,
type=click.Path(exists=True, dir_okay=False,),
help="YAML configuration file",
)
@click.pass_context
def graph_cli_group(ctx, config_file):
"""Software Heritage graph tools."""
from swh.core import config
ctx.ensure_object(dict)
conf = config.read(config_file, DEFAULT_CONFIG)
if "graph" not in conf:
raise ValueError(
'no "graph" stanza found in configuration file %s' % config_file
)
ctx.obj["config"] = conf
@graph_cli_group.command("api-client")
@click.option("--host", default="localhost", help="Graph server host")
@click.option("--port", default="5009", help="Graph server port")
@click.pass_context
def api_client(ctx, host, port):
"""client for the graph RPC service"""
from swh.graph import client
url = "http://{}:{}".format(host, port)
app = client.RemoteGraphClient(url)
# TODO: run web app
print(app.stats())
@graph_cli_group.group("map")
@click.pass_context
def map(ctx):
"""Manage swh-graph on-disk maps"""
pass
def dump_swhid2node(filename):
from swh.graph.swhid import SwhidToNodeMap
for (swhid, int) in SwhidToNodeMap(filename):
print("{}\t{}".format(swhid, int))
def dump_node2swhid(filename):
from swh.graph.swhid import NodeToSwhidMap
for (int, swhid) in NodeToSwhidMap(filename):
print("{}\t{}".format(int, swhid))
def restore_swhid2node(filename):
"""read a textual SWHID->int map from stdin and write its binary version to
filename
"""
from swh.graph.swhid import SwhidToNodeMap
with open(filename, "wb") as dst:
for line in sys.stdin:
(str_swhid, str_int) = line.split()
SwhidToNodeMap.write_record(dst, str_swhid, int(str_int))
def restore_node2swhid(filename, length):
"""read a textual int->SWHID map from stdin and write its binary version to
filename
"""
from swh.graph.swhid import NodeToSwhidMap
node2swhid = NodeToSwhidMap(filename, mode="wb", length=length)
for line in sys.stdin:
(str_int, str_swhid) = line.split()
node2swhid[int(str_int)] = str_swhid
node2swhid.close()
@map.command("dump")
@click.option(
"--type",
"-t",
"map_type",
required=True,
type=click.Choice(["swhid2node", "node2swhid"]),
help="type of map to dump",
)
@click.argument("filename", required=True, type=click.Path(exists=True))
@click.pass_context
def dump_map(ctx, map_type, filename):
"""Dump a binary SWHID<->node map to textual format."""
if map_type == "swhid2node":
dump_swhid2node(filename)
elif map_type == "node2swhid":
dump_node2swhid(filename)
else:
raise ValueError("invalid map type: " + map_type)
pass
@map.command("restore")
@click.option(
"--type",
"-t",
"map_type",
required=True,
type=click.Choice(["swhid2node", "node2swhid"]),
help="type of map to dump",
)
@click.option(
"--length",
"-l",
type=int,
help="""map size in number of logical records
(required for node2swhid maps)""",
)
@click.argument("filename", required=True, type=click.Path())
@click.pass_context
def restore_map(ctx, map_type, length, filename):
"""Restore a binary SWHID<->node map from textual format."""
if map_type == "swhid2node":
restore_swhid2node(filename)
elif map_type == "node2swhid":
if length is None:
raise click.UsageError(
"map length is required when restoring {} maps".format(map_type), ctx
)
restore_node2swhid(filename, length)
else:
raise ValueError("invalid map type: " + map_type)
@map.command("write")
@click.option(
"--type",
"-t",
"map_type",
required=True,
type=click.Choice(["swhid2node", "node2swhid"]),
help="type of map to write",
)
@click.argument("filename", required=True, type=click.Path())
@click.pass_context
def write(ctx, map_type, filename):
"""Write a map to disk sequentially.
read from stdin a textual SWHID->node mapping (for swhid2node, or a simple
sequence of SWHIDs for node2swhid) and write it to disk in the requested binary
map format
note that no sorting is applied, so the input should already be sorted as
required by the chosen map type (by SWHID for swhid2node, by int for node2swhid)
"""
from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap
with open(filename, "wb") as f:
if map_type == "swhid2node":
for line in sys.stdin:
(swhid, int_str) = line.rstrip().split(maxsplit=1)
SwhidToNodeMap.write_record(f, swhid, int(int_str))
elif map_type == "node2swhid":
for line in sys.stdin:
swhid = line.rstrip()
NodeToSwhidMap.write_record(f, swhid)
else:
raise ValueError("invalid map type: " + map_type)
@map.command("lookup")
@click.option(
"--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename"
)
@click.argument("identifiers", nargs=-1)
def map_lookup(graph, identifiers):
"""Lookup identifiers using on-disk maps.
Depending on the identifier type lookup either a SWHID into a SWHID->node (and
return the node integer identifier) or, vice-versa, lookup a node integer
identifier into a node->SWHID (and return the SWHID). The desired behavior is
chosen depending on the syntax of each given identifier.
Identifiers can be passed either directly on the command line or on
standard input, separated by blanks. Logical lines (as returned by
readline()) in stdin will be preserved in stdout.
"""
from swh.graph.backend import NODE2SWHID_EXT, SWHID2NODE_EXT
from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap
import swh.model.exceptions
from swh.model.swhids import ExtendedSWHID
success = True # no identifiers failed to be looked up
swhid2node = SwhidToNodeMap(f"{graph}.{SWHID2NODE_EXT}")
node2swhid = NodeToSwhidMap(f"{graph}.{NODE2SWHID_EXT}")
def lookup(identifier):
nonlocal success, swhid2node, node2swhid
is_swhid = None
try:
int(identifier)
is_swhid = False
except ValueError:
try:
ExtendedSWHID.from_string(identifier)
is_swhid = True
except swh.model.exceptions.ValidationError:
success = False
logging.error(f'invalid identifier: "{identifier}", skipping')
try:
if is_swhid:
return str(swhid2node[identifier])
else:
return node2swhid[int(identifier)]
except KeyError:
success = False
logging.error(f'identifier not found: "{identifier}", skipping')
if identifiers: # lookup identifiers passed via CLI
for identifier in identifiers:
print(lookup(identifier))
else: # lookup identifiers passed via stdin, preserving logical lines
for line in sys.stdin:
results = [lookup(id) for id in line.rstrip().split()]
if results: # might be empty if all IDs on the same line failed
print(" ".join(results))
sys.exit(0 if success else 1)
@graph_cli_group.command(name="rpc-serve")
@click.option(
"--host",
"-h",
default="0.0.0.0",
metavar="IP",
show_default=True,
help="host IP address to bind the server on",
)
@click.option(
"--port",
"-p",
default=5009,
type=click.INT,
metavar="PORT",
show_default=True,
help="port to bind the server on",
)
@click.option(
"--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename"
)
@click.pass_context
def serve(ctx, host, port, graph):
"""run the graph RPC service"""
import aiohttp
from swh.graph.server.app import make_app
config = ctx.obj["config"]
config.setdefault("graph", {})
config["graph"]["path"] = graph
app = make_app(config=config)
aiohttp.web.run_app(app, host=host, port=port)
@graph_cli_group.command()
@click.option(
"--graph",
"-g",
required=True,
metavar="GRAPH",
type=PathlibPath(),
help="input graph basename",
)
@click.option(
"--outdir",
"-o",
"out_dir",
required=True,
metavar="DIR",
type=PathlibPath(),
help="directory where to store compressed graph",
)
@click.option(
"--steps",
"-s",
metavar="STEPS",
type=StepOption(),
help="run only these compression steps (default: all steps)",
)
@click.pass_context
def compress(ctx, graph, out_dir, steps):
"""Compress a graph using WebGraph
Input: a pair of files g.nodes.csv.gz, g.edges.csv.gz
Output: a directory containing a WebGraph compressed graph
- Compression steps are: (1) mph, (2) bv, (3) bv_obl, (4) bfs, (5) permute,
- (6) permute_obl, (7) stats, (8) transpose, (9) transpose_obl, (10) maps,
- (11) clean_tmp. Compression steps can be selected by name or number using
+ Compression steps are: (1) mph, (2) bv, (3) bfs, (4) permute_bfs,
+ (5) transpose_bfs, (6) simplify, (7) llp, (8) permute_llp, (9) obl, (10)
+ compose_orders, (11) stats, (12) transpose, (13) transpose_obl, (14) maps,
+ (15) clean_tmp. Compression steps can be selected by name or number using
--steps, separating them with commas; step ranges (e.g., 3-9, 6-, etc.) are
also supported.
"""
from swh.graph import webgraph
graph_name = graph.name
in_dir = graph.parent
try:
conf = ctx.obj["config"]["graph"]["compress"]
except KeyError:
conf = {} # use defaults
webgraph.compress(graph_name, in_dir, out_dir, steps, conf)
@graph_cli_group.command(name="cachemount")
@click.option(
"--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename"
)
@click.option(
"--cache",
"-c",
default="/dev/shm/swh-graph/default",
metavar="CACHE",
type=PathlibPath(),
help="Memory cache path (defaults to /dev/shm/swh-graph/default)",
)
@click.pass_context
def cachemount(ctx, graph, cache):
"""
Cache the mmapped files of the compressed graph in a tmpfs.
This command creates a new directory at the path given by CACHE that has
the same structure as the compressed graph basename, except it copies the
files that require mmap access (:file:`{*}.graph`) but uses symlinks from the source
for all the other files (:file:`{*}.map`, :file:`{*}.bin`, ...).
The command outputs the path to the memory cache directory (particularly
useful when relying on the default value).
"""
import shutil
cache.mkdir(parents=True)
for src in Path(graph).parent.glob("*"):
dst = cache / src.name
if src.suffix == ".graph":
shutil.copy2(src, dst)
else:
dst.symlink_to(src.resolve())
print(cache)
def main():
return graph_cli_group(auto_envvar_prefix="SWH_GRAPH")
if __name__ == "__main__":
main()
diff --git a/swh/graph/config.py b/swh/graph/config.py
index 0d52b3f..f144f26 100644
--- a/swh/graph/config.py
+++ b/swh/graph/config.py
@@ -1,113 +1,115 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from pathlib import Path
import sys
import psutil
def find_graph_jar():
"""find swh-graph.jar, containing the Java part of swh-graph
look both in development directories and installed data (for in-production
deployments that fetched the JAR from pypi)
"""
swh_graph_root = Path(__file__).parents[2]
try_paths = [
swh_graph_root / "java/target/",
Path(sys.prefix) / "share/swh-graph/",
Path(sys.prefix) / "local/share/swh-graph/",
]
for path in try_paths:
glob = list(path.glob("swh-graph-*.jar"))
if glob:
if len(glob) > 1:
logging.warn(
"found multiple swh-graph JARs, " "arbitrarily picking one"
)
logging.info("using swh-graph JAR: {0}".format(glob[0]))
return str(glob[0])
raise RuntimeError("swh-graph JAR not found. Have you run `make java`?")
def check_config(conf):
"""check configuration and propagate defaults"""
conf = conf.copy()
if "batch_size" not in conf:
# Use 0.1% of the RAM as a batch size:
# ~1 billion for big servers, ~10 million for small desktop machines
conf["batch_size"] = int(psutil.virtual_memory().total / 1000)
+ if "llp_gammas" not in conf:
+ conf["llp_gammas"] = "-0,-1,-2,-3,-4"
if "max_ram" not in conf:
conf["max_ram"] = str(psutil.virtual_memory().total)
if "java_tool_options" not in conf:
conf["java_tool_options"] = " ".join(
[
"-Xmx{max_ram}",
"-XX:PretenureSizeThreshold=512M",
"-XX:MaxNewSize=4G",
"-XX:+UseLargePages",
"-XX:+UseTransparentHugePages",
"-XX:+UseNUMA",
"-XX:+UseTLAB",
"-XX:+ResizeTLAB",
]
)
conf["java_tool_options"] = conf["java_tool_options"].format(
max_ram=conf["max_ram"]
)
if "java" not in conf:
conf["java"] = "java"
if "classpath" not in conf:
conf["classpath"] = find_graph_jar()
return conf
def check_config_compress(config, graph_name, in_dir, out_dir):
"""check compression-specific configuration and initialize its execution
environment.
"""
conf = check_config(config)
conf["graph_name"] = graph_name
conf["in_dir"] = str(in_dir)
conf["out_dir"] = str(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
if "tmp_dir" not in conf:
tmp_dir = out_dir / "tmp"
conf["tmp_dir"] = str(tmp_dir)
else:
tmp_dir = Path(conf["tmp_dir"])
tmp_dir.mkdir(parents=True, exist_ok=True)
if "logback" not in conf:
logback_confpath = tmp_dir / "logback.xml"
with open(logback_confpath, "w") as conffile:
conffile.write(
"""
<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d %r %p [%t] %logger{1} - %m%n</pattern>
</encoder>
</appender>
<root level="INFO">
<appender-ref ref="STDOUT"/>
</root>
</configuration>
"""
)
conf["logback"] = str(logback_confpath)
conf["java_tool_options"] += " -Dlogback.configurationFile={logback}"
conf["java_tool_options"] += " -Djava.io.tmpdir={tmp_dir}"
conf["java_tool_options"] = conf["java_tool_options"].format(
logback=conf["logback"], tmp_dir=conf["tmp_dir"],
)
return conf
diff --git a/swh/graph/tests/dataset/output/example-transposed.graph b/swh/graph/tests/dataset/output/example-transposed.graph
index 5460ea4..ad5756e 100644
--- a/swh/graph/tests/dataset/output/example-transposed.graph
+++ b/swh/graph/tests/dataset/output/example-transposed.graph
@@ -1 +1 @@
-[):¤+Åãuâñ6ü¾Mjk¥Òé5Öº
\ No newline at end of file
+zÏ.—hÑ®ëÄ×I®–‰tõÄëµì€{‹ÅÐ
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/output/example-transposed.obl b/swh/graph/tests/dataset/output/example-transposed.obl
index 1291af6..54f0ac8 100644
Binary files a/swh/graph/tests/dataset/output/example-transposed.obl and b/swh/graph/tests/dataset/output/example-transposed.obl differ
diff --git a/swh/graph/tests/dataset/output/example-transposed.offsets b/swh/graph/tests/dataset/output/example-transposed.offsets
index 0b2e742..92c2947 100644
--- a/swh/graph/tests/dataset/output/example-transposed.offsets
+++ b/swh/graph/tests/dataset/output/example-transposed.offsets
@@ -1 +1,2 @@
-Ž Š(ãp¡BERB€
\ No newline at end of file
+„…
+ RqG4PTP¤(
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/output/example-transposed.properties b/swh/graph/tests/dataset/output/example-transposed.properties
index 1f6c525..512ce9d 100644
--- a/swh/graph/tests/dataset/output/example-transposed.properties
+++ b/swh/graph/tests/dataset/output/example-transposed.properties
@@ -1,35 +1,35 @@
#BVGraph properties
-#Fri Apr 02 13:56:41 UTC 2021
-bitsforreferences=27
+#Sat Dec 04 01:37:28 CET 2021
+bitsforreferences=31
avgbitsforintervals=0.714
graphclass=it.unimi.dsi.big.webgraph.BVGraph
-avgdist=0.381
-successoravggap=6.739
-residualexpstats=6,5,2,1,2,1
+avgdist=0.571
+successoravggap=6.478
+residualexpstats=6,6,2,2,2
arcs=23
minintervallength=4
bitsforoutdegrees=61
-residualavgloggap=2.2888731039272048
+residualavgloggap=2.1534522798004265
avgbitsforoutdegrees=2.905
-bitsforresiduals=83
-successoravgloggap=2.2822834512468524
+bitsforresiduals=85
+successoravgloggap=2.3226776741991215
maxrefcount=3
-successorexpstats=7,6,5,3,1,1
-residualarcs=17
-avgbitsforresiduals=3.952
-avgbitsforblocks=0.286
+successorexpstats=7,6,4,3,3
+residualarcs=18
+avgbitsforresiduals=4.048
+avgbitsforblocks=0.238
windowsize=7
-residualavggap=7.971
-copiedarcs=6
-avgbitsforreferences=1.286
+residualavggap=5.667
+copiedarcs=5
+avgbitsforreferences=1.476
version=0
-compratio=1.515
-bitsperlink=8.348
+compratio=1.554
+bitsperlink=8.565
compressionflags=
nodes=21
-avgref=0.333
+avgref=0.238
zetak=3
bitsforintervals=15
intervalisedarcs=0
-bitspernode=9.143
-bitsforblocks=6
+bitspernode=9.381
+bitsforblocks=5
diff --git a/swh/graph/tests/dataset/output/example.graph b/swh/graph/tests/dataset/output/example.graph
index e13c173..621b9b7 100644
--- a/swh/graph/tests/dataset/output/example.graph
+++ b/swh/graph/tests/dataset/output/example.graph
@@ -1 +1 @@
-}Çì]/Iì‹÷ü÷ª#z‚îWu².Þ¥`
\ No newline at end of file
+'t}UûOGϹ]Þ°—].—¯dP}Rð
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/output/example.mph b/swh/graph/tests/dataset/output/example.mph
index 7838165..c6f9e19 100644
Binary files a/swh/graph/tests/dataset/output/example.mph and b/swh/graph/tests/dataset/output/example.mph differ
diff --git a/swh/graph/tests/dataset/output/example.node2swhid.bin b/swh/graph/tests/dataset/output/example.node2swhid.bin
index 63cecba..9cc50b2 100644
Binary files a/swh/graph/tests/dataset/output/example.node2swhid.bin and b/swh/graph/tests/dataset/output/example.node2swhid.bin differ
diff --git a/swh/graph/tests/dataset/output/example.node2type.map b/swh/graph/tests/dataset/output/example.node2type.map
index 0a0a609..6b91c37 100644
Binary files a/swh/graph/tests/dataset/output/example.node2type.map and b/swh/graph/tests/dataset/output/example.node2type.map differ
diff --git a/swh/graph/tests/dataset/output/example.obl b/swh/graph/tests/dataset/output/example.obl
index 456c6ef..1b4fd2e 100644
Binary files a/swh/graph/tests/dataset/output/example.obl and b/swh/graph/tests/dataset/output/example.obl differ
diff --git a/swh/graph/tests/dataset/output/example.offsets b/swh/graph/tests/dataset/output/example.offsets
index f7d2333..407e1a6 100644
--- a/swh/graph/tests/dataset/output/example.offsets
+++ b/swh/graph/tests/dataset/output/example.offsets
@@ -1,2 +1 @@
-(PHPԒ
-…PÔ)
\ No newline at end of file
+…Ò†ŠBU!B…‡
diff --git a/swh/graph/tests/dataset/output/example.order b/swh/graph/tests/dataset/output/example.order
index 5e99fea..2cb5540 100644
Binary files a/swh/graph/tests/dataset/output/example.order and b/swh/graph/tests/dataset/output/example.order differ
diff --git a/swh/graph/tests/dataset/output/example.properties b/swh/graph/tests/dataset/output/example.properties
index 5b48508..cb6975a 100644
--- a/swh/graph/tests/dataset/output/example.properties
+++ b/swh/graph/tests/dataset/output/example.properties
@@ -1,35 +1,35 @@
#BVGraph properties
-#Fri Apr 02 13:56:08 UTC 2021
+#Sat Dec 04 01:37:26 CET 2021
bitsforreferences=14
avgbitsforintervals=0.667
graphclass=it.unimi.dsi.big.webgraph.BVGraph
avgdist=0
-successoravggap=7.652
-residualexpstats=7,5,5,3,2,1
+successoravggap=7.391
+residualexpstats=7,7,3,3,2,1
arcs=23
minintervallength=4
bitsforoutdegrees=51
-residualavgloggap=2.40434236090153
+residualavgloggap=2.32668281341601
avgbitsforoutdegrees=2.429
-bitsforresiduals=108
-successoravgloggap=2.40434236090153
+bitsforresiduals=111
+successoravgloggap=2.32668281341601
maxrefcount=3
-successorexpstats=7,5,5,3,2,1
+successorexpstats=7,7,3,3,2,1
residualarcs=23
-avgbitsforresiduals=5.143
+avgbitsforresiduals=5.286
avgbitsforblocks=0
windowsize=7
-residualavggap=7.652
+residualavggap=7.391
copiedarcs=0
avgbitsforreferences=0.667
version=0
-compratio=1.475
-bitsperlink=8.13
+compratio=1.499
+bitsperlink=8.261
compressionflags=
nodes=21
avgref=0
zetak=3
bitsforintervals=14
intervalisedarcs=0
-bitspernode=8.905
+bitspernode=9.048
bitsforblocks=0
diff --git a/swh/graph/tests/dataset/output/example.stats b/swh/graph/tests/dataset/output/example.stats
index 8b1eb1c..a58d3e2 100644
--- a/swh/graph/tests/dataset/output/example.stats
+++ b/swh/graph/tests/dataset/output/example.stats
@@ -1,20 +1,20 @@
nodes=21
arcs=23
loops=0
-successoravggap=5.765
-avglocality=3.826
+successoravggap=7.765
+avglocality=3.783
minoutdegree=0
maxoutdegree=3
-minoutdegreenode=7
-maxoutdegreenode=4
+minoutdegreenode=1
+maxoutdegreenode=0
dangling=7
terminal=7
percdangling=33.333333333333336
avgoutdegree=1.0952380952380953
-successorlogdeltastats=7,9,3,3,1
-successoravglogdelta=1.020
+successorlogdeltastats=11,7,1,3,1
+successoravglogdelta=0.911
minindegree=0
maxindegree=3
-minindegreenode=19
-maxindegreenode=1
+minindegreenode=17
+maxindegreenode=18
avgindegree=1.0952380952380953
diff --git a/swh/graph/webgraph.py b/swh/graph/webgraph.py
index 87c5341..24bb4b5 100644
--- a/swh/graph/webgraph.py
+++ b/swh/graph/webgraph.py
@@ -1,229 +1,280 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""WebGraph driver
"""
from datetime import datetime
from enum import Enum
import logging
import os
from pathlib import Path
import subprocess
from typing import Dict, List, Set
from swh.graph.config import check_config_compress
class CompressionStep(Enum):
MPH = 1
BV = 2
- BV_OBL = 3
- BFS = 4
- PERMUTE = 5
- PERMUTE_OBL = 6
- STATS = 7
- TRANSPOSE = 8
- TRANSPOSE_OBL = 9
- MAPS = 10
- CLEAN_TMP = 11
+ BFS = 3
+ PERMUTE_BFS = 4
+ TRANSPOSE_BFS = 5
+ SIMPLIFY = 6
+ LLP = 7
+ PERMUTE_LLP = 8
+ OBL = 9
+ COMPOSE_ORDERS = 10
+ STATS = 11
+ TRANSPOSE = 12
+ TRANSPOSE_OBL = 13
+ MAPS = 14
+ CLEAN_TMP = 15
def __str__(self):
return self.name
# full compression pipeline
COMP_SEQ = list(CompressionStep)
# Mapping from compression steps to shell commands implementing them. Commands
# will be executed by the shell, so be careful with meta characters. They are
# specified here as lists of tokens that will be joined together only for ease
# of line splitting. In commands, {tokens} will be interpolated with
# configuration values, see :func:`compress`.
STEP_ARGV: Dict[CompressionStep, List[str]] = {
CompressionStep.MPH: [
"{java}",
"it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction",
"--byte-array",
"--temp-dir",
"{tmp_dir}",
"{out_dir}/{graph_name}.mph",
"<( zstdcat {in_dir}/{graph_name}.nodes.csv.zst )",
],
# use process substitution (and hence FIFO) above as MPH class load the
# entire file in memory when reading from stdin
CompressionStep.BV: [
"zstdcat",
"{in_dir}/{graph_name}.edges.csv.zst",
"|",
"cut -d' ' -f1,2",
"|",
"{java}",
"it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph",
"--byte-array",
"--temp-dir",
"{tmp_dir}",
"--function",
"{out_dir}/{graph_name}.mph",
- "{out_dir}/{graph_name}-bv",
- ],
- CompressionStep.BV_OBL: [
- "{java}",
- "it.unimi.dsi.big.webgraph.BVGraph",
- "--list",
- "{out_dir}/{graph_name}-bv",
+ "{out_dir}/{graph_name}-base",
],
CompressionStep.BFS: [
"{java}",
"it.unimi.dsi.law.big.graph.BFS",
- "{out_dir}/{graph_name}-bv",
- "{out_dir}/{graph_name}.order",
+ "{out_dir}/{graph_name}-base",
+ "{out_dir}/{graph_name}-bfs.order",
+ ],
+ CompressionStep.PERMUTE_BFS: [
+ "{java}",
+ "it.unimi.dsi.big.webgraph.Transform",
+ "mapOffline",
+ "{out_dir}/{graph_name}-base",
+ "{out_dir}/{graph_name}-bfs",
+ "{out_dir}/{graph_name}-bfs.order",
+ "{batch_size}",
+ "{tmp_dir}",
+ ],
+ CompressionStep.TRANSPOSE_BFS: [
+ "{java}",
+ "it.unimi.dsi.big.webgraph.Transform",
+ "transposeOffline",
+ "{out_dir}/{graph_name}-bfs",
+ "{out_dir}/{graph_name}-bfs-transposed",
+ "{batch_size}",
+ "{tmp_dir}",
],
- CompressionStep.PERMUTE: [
+ CompressionStep.SIMPLIFY: [
+ "{java}",
+ "it.unimi.dsi.big.webgraph.Transform",
+ "simplify",
+ "{out_dir}/{graph_name}-bfs",
+ "{out_dir}/{graph_name}-bfs-transposed",
+ "{out_dir}/{graph_name}-bfs-simplified",
+ ],
+ CompressionStep.LLP: [
+ "{java}",
+ "it.unimi.dsi.law.big.graph.LayeredLabelPropagation",
+ "-g",
+ "{llp_gammas}",
+ "{out_dir}/{graph_name}-bfs-simplified",
+ "{out_dir}/{graph_name}-llp.order",
+ ],
+ CompressionStep.PERMUTE_LLP: [
"{java}",
"it.unimi.dsi.big.webgraph.Transform",
"mapOffline",
- "{out_dir}/{graph_name}-bv",
+ "{out_dir}/{graph_name}-bfs",
"{out_dir}/{graph_name}",
- "{out_dir}/{graph_name}.order",
+ "{out_dir}/{graph_name}-llp.order",
"{batch_size}",
"{tmp_dir}",
],
- CompressionStep.PERMUTE_OBL: [
+ CompressionStep.OBL: [
"{java}",
"it.unimi.dsi.big.webgraph.BVGraph",
"--list",
"{out_dir}/{graph_name}",
],
+ CompressionStep.COMPOSE_ORDERS: [
+ "{java}",
+ "org.softwareheritage.graph.utils.ComposePermutations",
+ "{out_dir}/{graph_name}-bfs.order",
+ "{out_dir}/{graph_name}-llp.order",
+ "{out_dir}/{graph_name}.order",
+ ],
CompressionStep.STATS: [
"{java}",
"it.unimi.dsi.big.webgraph.Stats",
"{out_dir}/{graph_name}",
],
CompressionStep.TRANSPOSE: [
"{java}",
"it.unimi.dsi.big.webgraph.Transform",
"transposeOffline",
"{out_dir}/{graph_name}",
"{out_dir}/{graph_name}-transposed",
"{batch_size}",
"{tmp_dir}",
],
CompressionStep.TRANSPOSE_OBL: [
"{java}",
"it.unimi.dsi.big.webgraph.BVGraph",
"--list",
"{out_dir}/{graph_name}-transposed",
],
CompressionStep.MAPS: [
"zstdcat",
"{in_dir}/{graph_name}.nodes.csv.zst",
"|",
"{java}",
"org.softwareheritage.graph.maps.NodeMapBuilder",
"{out_dir}/{graph_name}",
"{tmp_dir}",
],
CompressionStep.CLEAN_TMP: [
"rm",
"-rf",
- "{out_dir}/{graph_name}-bv.graph",
- "{out_dir}/{graph_name}-bv.obl",
- "{out_dir}/{graph_name}-bv.offsets",
+ "{out_dir}/{graph_name}-base.graph",
+ "{out_dir}/{graph_name}-base.offsets",
+ "{out_dir}/{graph_name}-base.properties",
+ "{out_dir}/{graph_name}-bfs-simplified.graph",
+ "{out_dir}/{graph_name}-bfs-simplified.offsets",
+ "{out_dir}/{graph_name}-bfs-simplified.properties",
+ "{out_dir}/{graph_name}-bfs-transposed.graph",
+ "{out_dir}/{graph_name}-bfs-transposed.offsets",
+ "{out_dir}/{graph_name}-bfs-transposed.properties",
+ "{out_dir}/{graph_name}-bfs.graph",
+ "{out_dir}/{graph_name}-bfs.offsets",
+ "{out_dir}/{graph_name}-bfs.order",
+ "{out_dir}/{graph_name}-bfs.properties",
+ "{out_dir}/{graph_name}-llp.order",
"{tmp_dir}",
],
}
def do_step(step, conf):
cmd = " ".join(STEP_ARGV[step]).format(**conf)
cmd_env = os.environ.copy()
cmd_env["JAVA_TOOL_OPTIONS"] = conf["java_tool_options"]
cmd_env["CLASSPATH"] = conf["classpath"]
logging.info(f"running: {cmd}")
process = subprocess.Popen(
["/bin/bash", "-c", cmd],
env=cmd_env,
encoding="utf8",
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
)
with process.stdout as stdout:
for line in stdout:
logging.info(line.rstrip())
rc = process.wait()
if rc != 0:
raise RuntimeError(
f"compression step {step} returned non-zero " f"exit code {rc}"
)
else:
return rc
def compress(
graph_name: str,
in_dir: Path,
out_dir: Path,
steps: Set[CompressionStep] = set(COMP_SEQ),
conf: Dict[str, str] = {},
):
"""graph compression pipeline driver from nodes/edges files to compressed
on-disk representation
Args:
graph_name: graph base name, relative to in_dir
in_dir: input directory, where the uncompressed graph can be found
out_dir: output directory, where the compressed graph will be stored
steps: compression steps to run (default: all steps)
conf: compression configuration, supporting the following keys (all are
optional, so an empty configuration is fine and is the default)
- batch_size: batch size for `WebGraph transformations
<http://webgraph.di.unimi.it/docs/it/unimi/dsi/webgraph/Transform.html>`_;
defaults to 1 billion
- classpath: java classpath, defaults to swh-graph JAR only
- java: command to run java VM, defaults to "java"
- java_tool_options: value for JAVA_TOOL_OPTIONS environment
variable; defaults to various settings for high memory machines
- logback: path to a logback.xml configuration file; if not provided
a temporary one will be created and used
- max_ram: maximum RAM to use for compression; defaults to available
virtual memory
- tmp_dir: temporary directory, defaults to the "tmp" subdir of
out_dir
"""
if not steps:
steps = set(COMP_SEQ)
conf = check_config_compress(conf, graph_name, in_dir, out_dir)
compression_start_time = datetime.now()
logging.info(f"starting compression at {compression_start_time}")
seq_no = 0
for step in COMP_SEQ:
if step not in steps:
logging.debug(f"skipping compression step {step}")
continue
seq_no += 1
step_start_time = datetime.now()
logging.info(
f"starting compression step {step} "
f"({seq_no}/{len(steps)}) at {step_start_time}"
)
do_step(step, conf)
step_end_time = datetime.now()
step_duration = step_end_time - step_start_time
logging.info(
f"completed compression step {step} "
f"({seq_no}/{len(steps)}) "
f"at {step_end_time} in {step_duration}"
)
compression_end_time = datetime.now()
compression_duration = compression_end_time - compression_start_time
logging.info(f"completed compression in {compression_duration}")

File Metadata

Mime Type
application/octet-stream
Expires
Sun, Apr 6, 11:11 PM (2 d)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3282138

Event Timeline