personIdMap = NodeIdMap.loadMph(graphBasename + ".persons.mph");
+ SwhOrcTable releaseTable = dataset.getTable("release");
+ SwhOrcTable revisionTable = dataset.getTable("revision");
+
+ int[][] personArray = IntBigArrays.newBigArray(numNodes);
+
+ // Author IDs
+ BigArrays.fill(personArray, -1);
+ releaseTable.readBytes64Column("author", (swhid, personBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
+ });
+ revisionTable.readBytes64Column("author", (swhid, personBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
+ });
+ BinIO.storeInts(personArray, graphBasename + ".property.author_id.bin");
+
+ // Committer IDs
+ BigArrays.fill(personArray, -1);
+ revisionTable.readBytes64Column("committer", (swhid, personBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
+ });
+ BinIO.storeInts(personArray, graphBasename + ".property.committer_id.bin");
+ }
+
+ public void writeMessages() throws IOException {
+ logger.info("Writing messages for release + revision, and URLs for origins");
+
+ long[][] messageOffsetArray = LongBigArrays.newBigArray(numNodes);
+ BigArrays.fill(messageOffsetArray, -1);
+
+ FastBufferedOutputStream messageStream = new FastBufferedOutputStream(
+ new FileOutputStream(graphBasename + ".property.message.bin"));
+ AtomicLong offset = new AtomicLong(0L);
+
+ SwhOrcTable releaseTable = dataset.getTable("release");
+ releaseTable.readBytes64Column("message", (swhid, messageBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ messageStream.write(messageBase64);
+ messageStream.write('\n');
+ BigArrays.set(messageOffsetArray, id, offset.longValue());
+ offset.addAndGet(messageBase64.length + 1);
+ });
+
+ SwhOrcTable revisionTable = dataset.getTable("revision");
+ revisionTable.readBytes64Column("message", (swhid, messageBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ messageStream.write(messageBase64);
+ messageStream.write('\n');
+ BigArrays.set(messageOffsetArray, id, offset.longValue());
+ offset.addAndGet(messageBase64.length + 1);
+ });
+
+ OriginOrcTable originTable = (OriginOrcTable) dataset.getTable("origin");
+ originTable.readURLs((swhid, messageBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ messageStream.write(messageBase64);
+ messageStream.write('\n');
+ BigArrays.set(messageOffsetArray, id, offset.longValue());
+ offset.addAndGet(messageBase64.length + 1);
+ });
+
+ // TODO: check which one is optimal in terms of memory/disk usage, EF vs mapped file
+ BinIO.storeLongs(messageOffsetArray, graphBasename + ".property.message.offset.bin");
+ // EliasFanoLongBigList messageOffsetEF = new
+ // EliasFanoLongBigList(LongBigArrayBigList.wrap(messageOffsetArray));
+ // BinIO.storeObject(messageOffsetEF, graphBasename + ".property.message.offset.bin");
+ messageStream.close();
+ }
+
+ public void writeTagNames() throws IOException {
+ logger.info("Writing tag names for release");
+
+ long[][] tagNameOffsetArray = LongBigArrays.newBigArray(numNodes);
+ BigArrays.fill(tagNameOffsetArray, -1);
+
+ FastBufferedOutputStream tagNameStream = new FastBufferedOutputStream(
+ new FileOutputStream(graphBasename + ".property.tag_name.bin"));
+ AtomicLong offset = new AtomicLong(0L);
+
+ SwhOrcTable releaseTable = dataset.getTable("release");
+ releaseTable.readBytes64Column("name", (swhid, tagNameBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ tagNameStream.write(tagNameBase64);
+ tagNameStream.write('\n');
+ BigArrays.set(tagNameOffsetArray, id, offset.longValue());
+ offset.addAndGet(tagNameBase64.length + 1);
+ });
+
+ BinIO.storeLongs(tagNameOffsetArray, graphBasename + ".property.tag_name.offset.bin");
+ // EliasFanoLongBigList tagNameOffsetEF = new
+ // EliasFanoLongBigList(LongBigArrayBigList.wrap(tagNameOffsetArray));
+ // BinIO.storeObject(tagNameOffsetEF, graphBasename + ".property.tag_name.offset.bin");
+ tagNameStream.close();
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java
deleted file mode 100644
index d71d7d8..0000000
--- a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java
+++ /dev/null
@@ -1,103 +0,0 @@
-// TODO: should be in webgraph upstream
-// package it.unimi.dsi.big.webgraph.labelling;
-package org.softwareheritage.graph.labels;
-
-/*
- * Copyright (C) 2020 TODO
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
- */
-
-import it.unimi.dsi.big.webgraph.labelling.AbstractLabel;
-import it.unimi.dsi.big.webgraph.labelling.Label;
-
-import java.util.Arrays;
-
-/**
- * An abstract (single-attribute) list-of-longs label.
- *
- * <p>
- * This class provides basic methods for a label holding a list of longs. Concrete implementations
- * may impose further requirements on the long.
- *
- * <p>
- * Implementing subclasses must provide constructors, {@link Label#copy()},
- * {@link Label#fromBitStream(it.unimi.dsi.io.InputBitStream, int)},
- * {@link Label#toBitStream(it.unimi.dsi.io.OutputBitStream, int)} and possibly override
- * {@link #toString()}.
- */
-
-public abstract class AbstractLongListLabel extends AbstractLabel implements Label {
- /** The key of the attribute represented by this label. */
- protected final String key;
- /** The values of the attribute represented by this label. */
- public long[] value;
-
- /**
- * Creates an long label with given key and value.
- *
- * @param key the (only) key of this label.
- * @param value the value of this label.
- */
- public AbstractLongListLabel(String key, long[] value) {
- this.key = key;
- this.value = value;
- }
-
- @Override
- public String wellKnownAttributeKey() {
- return key;
- }
-
- @Override
- public String[] attributeKeys() {
- return new String[]{key};
- }
-
- @Override
- public Class<?>[] attributeTypes() {
- return new Class[]{long[].class};
- }
-
- @Override
- public Object get(String key) {
- if (this.key.equals(key))
- return value;
- throw new IllegalArgumentException();
- }
-
- @Override
- public Object get() {
- return value;
- }
-
- @Override
- public String toString() {
- return key + ":" + Arrays.toString(value);
- }
-
- @Override
- public boolean equals(Object x) {
- if (x instanceof AbstractLongListLabel)
- return Arrays.equals(value, ((AbstractLongListLabel) x).value);
- else
- return false;
- }
-
- @Override
- public int hashCode() {
- return Arrays.hashCode(value);
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java
deleted file mode 100644
index f1672d9..0000000
--- a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java
+++ /dev/null
@@ -1,115 +0,0 @@
-// TODO: should be in webgraph upstream
-// package it.unimi.dsi.big.webgraph.labelling;
-package org.softwareheritage.graph.labels;
-
-/*
- * Copyright (C) 2020 TODO
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
- */
-
-import it.unimi.dsi.big.webgraph.labelling.Label;
-import it.unimi.dsi.fastutil.longs.LongArrays;
-import it.unimi.dsi.io.InputBitStream;
-import it.unimi.dsi.io.OutputBitStream;
-
-import java.io.IOException;
-import java.util.Arrays;
-
-/**
- * A list of longs represented in fixed width. Each list is prefixed by its length written in
- * {@linkplain OutputBitStream#writeGamma(int) γ coding}.
- */
-
-public class FixedWidthLongListLabel extends org.softwareheritage.graph.labels.AbstractLongListLabel {
- /** The bit width used to represent the value of this label. */
- private final int width;
-
- /**
- * Creates a new fixed-width long label.
- *
- * @param key the (only) key of this label.
- * @param width the label width (in bits).
- * @param value the value of this label.
- */
- public FixedWidthLongListLabel(String key, int width, long[] value) {
- super(key, value);
- for (int i = value.length; i-- != 0;)
- if (value[i] < 0 || value[i] >= 1L << width)
- throw new IllegalArgumentException("Value out of range: " + Long.toString(value[i]));
- this.width = width;
- }
-
- /**
- * Creates a new fixed-width label with an empty list.
- *
- * @param key the (only) key of this label.
- * @param width the label width (in bits).
- */
- public FixedWidthLongListLabel(String key, int width) {
- this(key, width, LongArrays.EMPTY_ARRAY);
- }
-
- /**
- * Creates a new fixed-width long label using the given key and width with an empty list.
- *
- * @param arg two strings containing the key and the width of this label.
- */
- public FixedWidthLongListLabel(String... arg) {
- this(arg[0], Integer.parseInt(arg[1]));
- }
-
- @Override
- public Label copy() {
- return new FixedWidthLongListLabel(key, width, value.clone());
- }
-
- @Override
- public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException {
- long readBits = inputBitStream.readBits();
- value = new long[inputBitStream.readGamma()];
- for (int i = 0; i < value.length; i++)
- value[i] = inputBitStream.readLong(width);
- return (int) (inputBitStream.readBits() - readBits);
- }
-
- @Override
- public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException {
- int bits = outputBitStream.writeGamma(value.length);
- for (int i = 0; i < value.length; i++)
- bits += outputBitStream.writeLong(value[i], width);
- return bits;
- }
-
- /**
- * Returns -1 (the fixed width refers to a single long, not to the entire list).
- *
- * @return -1;
- */
- @Override
- public int fixedWidth() {
- return -1;
- }
-
- @Override
- public String toString() {
- return key + ":" + Arrays.toString(value) + " (width:" + width + ")";
- }
-
- @Override
- public String toSpec() {
- return this.getClass().getName() + "(" + key + "," + width + ")";
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
index 41f7d5f..c84cfec 100644
--- a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
+++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
@@ -1,109 +1,110 @@
package org.softwareheritage.graph.labels;
import it.unimi.dsi.big.webgraph.labelling.AbstractLabel;
+import it.unimi.dsi.big.webgraph.labelling.FixedWidthLongListLabel;
import it.unimi.dsi.big.webgraph.labelling.Label;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import java.io.IOException;
import java.util.Arrays;
/**
* Software Heritage graph edge labels following Webgraph labels convention.
*
* @author The Software Heritage developers
*/
public class SwhLabel extends AbstractLabel {
private final String key;
private final int width;
// TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry
// to store branch names)
public DirEntry[] value;
// Use existing Webgraph class to represent a list of DirEntry as a list of encoded long
private final FixedWidthLongListLabel longList;
private static final DirEntry[] EMPTY_ARRAY = {};
public SwhLabel(String key, int width, DirEntry[] value) {
this.key = key;
this.width = width;
this.value = value;
long[] valueEncoded = new long[value.length];
for (int i = 0; i < value.length; i++)
valueEncoded[i] = value[i].toEncoded();
this.longList = new FixedWidthLongListLabel(key, width, valueEncoded);
}
public SwhLabel(String key, int width) {
this(key, width, EMPTY_ARRAY);
}
public SwhLabel(String... arg) {
this(arg[0], Integer.parseInt(arg[1]));
}
@Override
public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException {
int ret = longList.fromBitStream(inputBitStream, sourceUnused);
// Decode values from their internal long representation
value = new DirEntry[longList.value.length];
for (int i = 0; i < value.length; i++)
value[i] = new DirEntry(longList.value[i]);
return ret;
}
@Override
public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException {
// Values have already been encoded in the SwhLabel constructor
return longList.toBitStream(outputBitStream, sourceUnused);
}
@Override
public String wellKnownAttributeKey() {
return key;
}
@Override
public String[] attributeKeys() {
return new String[]{key};
}
@Override
public Class<?>[] attributeTypes() {
return new Class[]{DirEntry[].class};
}
@Override
public Object get(String s) {
- if (this.key.equals(key))
+ if (this.key.equals(s))
return value;
throw new IllegalArgumentException();
}
@Override
public Object get() {
return value;
}
@Override
public Label copy() {
return new SwhLabel(key, width, value.clone());
}
@Override
public int fixedWidth() {
return -1;
}
@Override
public String toString() {
return key + ":" + Arrays.toString(value) + " (width:" + width + ")";
}
@Override
public String toSpec() {
return this.getClass().getName() + "(" + key + "," + width + ")";
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
index 1b4c917..93e1ebb 100644
--- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
@@ -1,183 +1,191 @@
package org.softwareheritage.graph.maps;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.util.ByteBufferLongBigList;
import org.softwareheritage.graph.SWHID;
import org.softwareheritage.graph.compress.NodeMapBuilder;
-import java.io.FileInputStream;
+import java.io.File;
import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
/**
* Mapping between internal long node id and external SWHID.
*
* The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using
* an mmap()-ed .order file containing the graph permutation.
*
* The node -> SWHID reverse mapping is pre-computed and dumped on disk in the
* {@link NodeMapBuilder} class, then it is loaded here using mmap().
*
* @author The Software Heritage developers
* @see NodeMapBuilder
*/
-public class NodeIdMap {
+public class NodeIdMap implements Size64 {
/** Fixed length of binary SWHID buffer */
public static final int SWHID_BIN_SIZE = 22;
/** File extension for the long node id to SWHID map */
public static final String NODE_TO_SWHID = ".node2swhid.bin";
/** Graph path and basename */
String graphPath;
/** mmap()-ed NODE_TO_SWHID file */
MapFile nodeToSwhMap;
/** Minimal perfect hash (MPH) function SWHID -> initial order */
Object2LongFunction<byte[]> mph;
/** mmap()-ed long list with the permutation initial order -> graph order */
LongBigList orderMap;
- /** FileInputStream containing the permutation */
- FileInputStream orderInputStream;
/**
* Constructor.
*
* @param graphPath full graph path
*/
public NodeIdMap(String graphPath) throws IOException {
this.graphPath = graphPath;
// node -> SWHID
this.nodeToSwhMap = new MapFile(graphPath + NODE_TO_SWHID, SWHID_BIN_SIZE);
// SWHID -> node
this.mph = loadMph(graphPath + ".mph");
- this.orderInputStream = new FileInputStream(graphPath + ".order");
- this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel());
+
+ try (RandomAccessFile mapFile = new RandomAccessFile(new File(graphPath + ".order"), "r")) {
+ FileChannel fileChannel = mapFile.getChannel();
+ this.orderMap = ByteBufferLongBigList.map(fileChannel);
+ }
}
@SuppressWarnings("unchecked")
public static Object2LongFunction<byte[]> loadMph(String path) throws IOException {
Object obj;
try {
obj = BinIO.loadObject(path);
} catch (ClassNotFoundException e) {
throw new IOException(e.getMessage());
}
Object2LongFunction<byte[]> res = (Object2LongFunction<byte[]>) obj;
// Backward-compatibility for old maps parametrized with <String>.
// New maps should be parametrized with <byte[]>, which is faster.
try {
// Try to call it with bytes, will fail if it's a O2LF<String>.
res.getLong("42".getBytes(StandardCharsets.UTF_8));
} catch (ClassCastException e) {
class StringCompatibleByteFunction implements Object2LongFunction<byte[]>, Size64 {
private final Object2LongFunction<String> legacyFunction;
public StringCompatibleByteFunction(Object2LongFunction<String> legacyFunction) {
this.legacyFunction = legacyFunction;
}
@Override
public long getLong(Object o) {
byte[] bi = (byte[]) o;
return legacyFunction.getLong(new String(bi, StandardCharsets.UTF_8));
}
@Override
public int size() {
return legacyFunction.size();
}
@Override
public long size64() {
return (legacyFunction instanceof Size64)
? ((Size64) legacyFunction).size64()
: legacyFunction.size();
}
}
Object2LongFunction<String> mphLegacy = (Object2LongFunction<String>) obj;
return new StringCompatibleByteFunction(mphLegacy);
}
// End of backward-compatibility block
return res;
}
/**
* Converts byte-form SWHID to corresponding long node id. Low-level function, does not check if the
* SWHID is valid.
*
* @param swhid node represented as bytes
* @return corresponding node as a long id
*/
public long getNodeId(byte[] swhid) {
// 1. Hash the SWHID with the MPH to get its original ID
long origNodeId = mph.getLong(swhid);
// 2. Use the order permutation to get the position in the permuted graph
return this.orderMap.getLong(origNodeId);
}
/**
* Converts SWHID to corresponding long node id.
*
* @param swhid node represented as a {@link SWHID}
* @param checkExists if true, error if the SWHID is not present in the graph, if false the check
* will be skipped and invalid data will be returned for non-existing SWHIDs.
* @return corresponding node as a long id
* @see SWHID
*/
public long getNodeId(SWHID swhid, boolean checkExists) {
// Convert the SWHID to bytes and call getNodeId()
long nodeId = getNodeId(swhid.toString().getBytes(StandardCharsets.US_ASCII));
// Check that the position effectively corresponds to a real node using the reverse map.
// This is necessary because the MPH makes no guarantees on whether the input SWHID is valid.
if (!checkExists || getSWHID(nodeId).equals(swhid)) {
return nodeId;
} else {
throw new IllegalArgumentException("Unknown SWHID: " + swhid);
}
}
public long getNodeId(SWHID swhid) {
return getNodeId(swhid, true);
}
/**
* Converts a node long id to corresponding SWHID.
*
* @param nodeId node as a long id
* @return corresponding node as a {@link SWHID}
* @see SWHID
*/
public SWHID getSWHID(long nodeId) {
/*
* Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's
* swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid
*/
if (nodeId < 0 || nodeId >= nodeToSwhMap.size()) {
throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nodeToSwhMap.size());
}
return SWHID.fromBytes(nodeToSwhMap.readAtLine(nodeId));
}
/**
* Closes the mapping files.
*/
public void close() throws IOException {
- orderInputStream.close();
nodeToSwhMap.close();
}
+
+ /** Return the number of nodes in the map. */
+ @Override
+ public long size64() {
+ return nodeToSwhMap.size();
+ }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
new file mode 100644
index 0000000..a50f983
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
@@ -0,0 +1,74 @@
+package org.softwareheritage.graph.utils;
+
+import it.unimi.dsi.big.webgraph.NodeIterator;
+import org.softwareheritage.graph.SwhUnidirectionalGraph;
+import org.softwareheritage.graph.labels.DirEntry;
+
+import java.io.IOException;
+
+public class DumpProperties {
+ public static void main(String[] args) throws IOException {
+ String graphPath = args[0];
+
+ SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.loadLabelled(graphPath);
+ graph.loadContentLength();
+ graph.loadContentIsSkipped();
+ graph.loadPersonIds();
+ graph.loadAuthorTimestamps();
+ graph.loadCommitterTimestamps();
+ graph.loadMessages();
+ graph.loadTagNames();
+ graph.loadLabelNames();
+
+ NodeIterator it = graph.nodeIterator();
+ while (it.hasNext()) {
+ long node = it.nextLong();
+ System.out.format("%s: %s\n", node, graph.getSWHID(node));
+
+ var s = graph.labelledSuccessors(node);
+ long succ;
+ System.out.println(" successors:");
+ while ((succ = s.nextLong()) >= 0) {
+ DirEntry[] labels = (DirEntry[]) s.label().get();
+ if (labels.length > 0) {
+ for (DirEntry label : labels) {
+ System.out.format(" %s %s [perms: %s]\n", graph.getSWHID(succ),
+ new String(graph.getLabelName(label.filenameId)), label.permission);
+ }
+ } else {
+ System.out.format(" %s\n", graph.getSWHID(succ));
+ }
+ }
+
+ switch (graph.getNodeType(node)) {
+ case CNT:
+ System.out.format(" length: %s\n", graph.getContentLength(node));
+ System.out.format(" is_skipped: %s\n", graph.isContentSkipped(node));
+ break;
+ case REV:
+ System.out.format(" author: %s\n", graph.getAuthorId(node));
+ System.out.format(" committer: %s\n", graph.getCommitterId(node));
+ System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
+ graph.getAuthorTimestampOffset(node));
+ System.out.format(" committer_date: %s (offset: %s)\n", graph.getCommitterTimestamp(node),
+ graph.getCommitterTimestampOffset(node));
+ byte[] msg = graph.getMessage(node);
+ if (msg != null) {
+ System.out.format(" message: %s\n", (new String(msg)).replace("\n", "\\n"));
+ }
+ break;
+ case REL:
+ System.out.format(" author: %s\n", graph.getAuthorId(node));
+ System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
+ graph.getAuthorTimestampOffset(node));
+ System.out.format(" message: %s\n", (new String(graph.getMessage(node))).replace("\n", "\\n"));
+ System.out.format(" tag name: %s\n", new String(graph.getTagName(node)));
+ break;
+ case ORI:
+ System.out.format(" url: %s\n", graph.getUrl(node));
+ }
+
+ System.out.println();
+ }
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java b/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java
index b7461a3..4a2bb79 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java
@@ -1,113 +1,110 @@
package org.softwareheritage.graph.utils;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import it.unimi.dsi.fastutil.BigArrays;
-import it.unimi.dsi.fastutil.io.BinIO;
import org.softwareheritage.graph.*;
import java.io.IOException;
import java.time.Duration;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Stack;
/* sample invocation on granet.internal.softwareheritage.org for benchmarking
* purposes, with the main swh-graph service already running:
*
* $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-0.3.0.jar -Xmx300G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.FindEarliestRevision --timing /dev/shm/swh-graph/default/graph
*
*/
public class FindEarliestRevision {
public static void main(String[] args) throws IOException, ClassNotFoundException {
String graphPath = args[0];
boolean timing = false;
long ts, elapsedNanos;
Duration elapsed;
if (args.length >= 2 && (args[0].equals("-t") || args[0].equals("--timing"))) {
timing = true;
graphPath = args[1];
System.err.println("started with timing option, will keep track of elapsed time");
}
System.err.println("loading transposed graph...");
ts = System.nanoTime();
SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(graphPath).transpose();
elapsed = Duration.ofNanos(System.nanoTime() - ts);
System.err.println(String.format("transposed graph loaded (duration: %s).", elapsed));
System.err.println("loading revision timestamps...");
ts = System.nanoTime();
- long[][] committerTimestamps = BinIO.loadLongsBig(graphPath + "-rev_committer_timestamps.bin");
+ graph.loadCommitterTimestamps();
elapsed = Duration.ofNanos(System.nanoTime() - ts);
System.err.println(String.format("revision timestamps loaded (duration: %s).", elapsed));
Scanner stdin = new Scanner(System.in);
AllowedEdges edges = new AllowedEdges("cnt:dir,dir:dir,dir:rev");
String rawSWHID = null;
SWHID srcSWHID = null;
long lineCount = 0;
long srcNodeId = -1;
if (timing) {
System.err.println("starting SWHID processing...");
elapsed = Duration.ZERO;
}
while (stdin.hasNextLine()) {
if (timing)
ts = System.nanoTime();
rawSWHID = stdin.nextLine().strip();
lineCount++;
try {
srcSWHID = new SWHID(rawSWHID);
srcNodeId = graph.getNodeId(srcSWHID);
} catch (IllegalArgumentException e) {
System.err
.println(String.format("skipping invalid or unknown SWHID %s on line %d", rawSWHID, lineCount));
continue;
}
if (timing)
System.err.println("starting traversal for: " + srcSWHID.toString());
Stack<Long> stack = new Stack<>();
HashSet<Long> visited = new HashSet<>();
stack.push(srcNodeId);
visited.add(srcNodeId);
long minRevId = -1;
long minTimestamp = Long.MAX_VALUE;
while (!stack.isEmpty()) {
long currentNodeId = stack.pop();
if (graph.getNodeType(currentNodeId) == Node.Type.REV) {
- long committerTs = BigArrays.get(committerTimestamps, currentNodeId);
+ long committerTs = graph.getCommitterTimestamp(currentNodeId);
if (committerTs < minTimestamp) {
minRevId = currentNodeId;
minTimestamp = committerTs;
}
}
LazyLongIterator it = Traversal.filterSuccessors(graph, currentNodeId, edges);
for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
if (!visited.contains(neighborNodeId)) {
stack.push(neighborNodeId);
visited.add(neighborNodeId);
}
}
}
if (minRevId == -1) {
System.err.println("no revision found containing: " + srcSWHID.toString());
} else {
System.out.println(srcSWHID.toString() + "\t" + graph.getSWHID(minRevId).toString());
}
if (timing) {
elapsedNanos = System.nanoTime() - ts; // processing time for current SWHID
elapsed = elapsed.plus(Duration.ofNanos(elapsedNanos)); // cumulative processing time for all SWHIDs
- System.err.println(String.format("visit time (s):\t%.6f", (double) elapsedNanos / 1_000_000_000));
+ System.err.printf("visit time (s):\t%.6f\n", (double) elapsedNanos / 1_000_000_000);
}
}
if (timing)
- System.err.println(String.format("processed %d SWHIDs in %s (%s avg)", lineCount, elapsed,
- elapsed.dividedBy(lineCount)));
+ System.err.printf("processed %d SWHIDs in %s (%s avg)\n", lineCount, elapsed, elapsed.dividedBy(lineCount));
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java
index ace8c94..998cbce 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java
@@ -1,27 +1,25 @@
package org.softwareheritage.graph.utils;
-import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.NodeIterator;
-import org.softwareheritage.graph.maps.NodeIdMap;
+import org.softwareheritage.graph.SwhUnidirectionalGraph;
import java.io.IOException;
public class ReadGraph {
public static void main(String[] args) throws IOException {
String graphPath = args[0];
- ImmutableGraph graph = ImmutableGraph.load(graphPath);
- NodeIdMap nodeMap = new NodeIdMap(graphPath);
+ SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.load(graphPath);
NodeIterator it = graph.nodeIterator();
while (it.hasNext()) {
long srcNode = it.nextLong();
var s = it.successors();
long dstNode;
while ((dstNode = s.nextLong()) >= 0) {
- System.out.format("%s %s\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode));
+ System.out.format("%s %s\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode));
}
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
index 179edf1..a1fcfe4 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
@@ -1,40 +1,35 @@
package org.softwareheritage.graph.utils;
-import it.unimi.dsi.big.util.FrontCodedStringBigList;
-import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
-import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
-import it.unimi.dsi.fastutil.io.BinIO;
+import org.softwareheritage.graph.SwhUnidirectionalGraph;
import org.softwareheritage.graph.labels.DirEntry;
-import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.IOException;
public class ReadLabelledGraph {
public static void main(String[] args) throws IOException, ClassNotFoundException {
String graphPath = args[0];
- ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.loadOffline(graphPath + "-labelled");
- NodeIdMap nodeMap = new NodeIdMap(graphPath);
- FrontCodedStringBigList filenameMap = (FrontCodedStringBigList) BinIO.loadObject(graphPath + "-labels.fcl");
+ SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.loadLabelled(graphPath);
+ graph.properties.loadLabelNames();
- ArcLabelledNodeIterator it = graph.nodeIterator();
+ ArcLabelledNodeIterator it = graph.labelledNodeIterator();
while (it.hasNext()) {
long srcNode = it.nextLong();
ArcLabelledNodeIterator.LabelledArcIterator s = it.successors();
long dstNode;
while ((dstNode = s.nextLong()) >= 0) {
DirEntry[] labels = (DirEntry[]) s.label().get();
if (labels.length > 0) {
for (DirEntry label : labels) {
- System.out.format("%s %s %s %d\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode),
- filenameMap.get(label.filenameId), label.permission);
+ System.out.format("%s %s %s %d\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode),
+ new String(graph.properties.getLabelName(label.filenameId)), label.permission);
}
} else {
- System.out.format("%s %s\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode));
+ System.out.format("%s %s\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode));
}
}
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/Sort.java b/java/src/main/java/org/softwareheritage/graph/utils/Sort.java
new file mode 100644
index 0000000..4ece391
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/utils/Sort.java
@@ -0,0 +1,25 @@
+package org.softwareheritage.graph.utils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class Sort {
+ public static Process spawnSort(String sortBufferSize, String sortTmpDir) throws IOException {
+ ProcessBuilder sortProcessBuilder = new ProcessBuilder();
+ sortProcessBuilder.redirectError(ProcessBuilder.Redirect.INHERIT);
+ ArrayList command = new ArrayList<>(List.of("sort", "-u", "--buffer-size", sortBufferSize));
+ if (sortTmpDir != null) {
+ command.add("--temporary-directory");
+ command.add(sortTmpDir);
+ }
+ sortProcessBuilder.command(command);
+ Map env = sortProcessBuilder.environment();
+ env.put("LC_ALL", "C");
+ env.put("LC_COLLATE", "C");
+ env.put("LANG", "C");
+
+ return sortProcessBuilder.start();
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
deleted file mode 100644
index 7d35574..0000000
--- a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
+++ /dev/null
@@ -1,53 +0,0 @@
-package org.softwareheritage.graph.utils;
-
-import it.unimi.dsi.fastutil.BigArrays;
-import it.unimi.dsi.fastutil.Size64;
-import it.unimi.dsi.fastutil.io.BinIO;
-import it.unimi.dsi.fastutil.longs.LongBigArrays;
-import it.unimi.dsi.fastutil.objects.Object2LongFunction;
-import it.unimi.dsi.io.FastBufferedReader;
-import it.unimi.dsi.io.LineIterator;
-import org.softwareheritage.graph.maps.NodeIdMap;
-
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-
-public class WriteRevisionTimestamps {
- public static void main(String[] args) throws IOException, ClassNotFoundException {
- System.err.print("Loading everything...");
- String graphPath = args[0];
- String outputFile = args[1];
- Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph");
- long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
- long[][] nodePerm = BinIO.loadLongsBig(graphPath + ".order");
- // NodeIdMap nodeIdMap = new NodeIdMap(graphPath, nbIds);
- long[][] timestampArray = LongBigArrays.newBigArray(nbIds);
- BigArrays.fill(timestampArray, Long.MIN_VALUE);
- System.err.println(" done.");
-
- // TODO: wasteful to convert to/from bytes
- FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
- LineIterator lineIterator = new LineIterator(buffer);
-
- while (lineIterator.hasNext()) {
- String line = lineIterator.next().toString();
- String[] line_elements = line.split("[ \\t]");
-
- // SWHID currentRev = new SWHID(line_elements[0].strip());
- long revId = -1;
- long timestamp = -1;
- try {
- // revId = nodeIdMap.getNodeId(currentRev);
- long revHash = mphMap.getLong(line_elements[0].strip().getBytes(StandardCharsets.US_ASCII));
- revId = BigArrays.get(nodePerm, revHash);
- timestamp = Long.parseLong(line_elements[1].strip());
- } catch (IllegalArgumentException e) {
- continue;
- }
- BigArrays.set(timestampArray, revId, timestamp);
- // System.err.println(revId + " " + timestamp);
- }
- BinIO.storeLongs(timestampArray, outputFile);
- }
-}
diff --git a/java/src/test/java/org/softwareheritage/graph/GraphTest.java b/java/src/test/java/org/softwareheritage/graph/GraphTest.java
index 42c84a2..9b7a64b 100644
--- a/java/src/test/java/org/softwareheritage/graph/GraphTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/GraphTest.java
@@ -1,44 +1,53 @@
package org.softwareheritage.graph;
+import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
+import com.github.luben.zstd.ZstdInputStream;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.LazyLongIterators;
import org.hamcrest.MatcherAssert;
import org.junit.jupiter.api.BeforeAll;
import static org.hamcrest.collection.IsIterableContainingInAnyOrder.containsInAnyOrder;
public class GraphTest {
static SwhBidirectionalGraph graph;
+ final protected String TEST_ORIGIN_ID = "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054";
+
@BeforeAll
public static void setUp() throws IOException {
- Path graphPath = Paths.get("..", "swh", "graph", "tests", "dataset", "output", "example");
+ Path graphPath = Paths.get("..", "swh", "graph", "tests", "dataset", "compressed", "example");
graph = SwhBidirectionalGraph.loadMapped(graphPath.toString());
}
public SwhBidirectionalGraph getGraph() {
return graph;
}
public static SWHID fakeSWHID(String type, int num) {
return new SWHID(String.format("swh:1:%s:%040d", type, num));
}
public static void assertEqualsAnyOrder(Collection expecteds, Collection actuals) {
MatcherAssert.assertThat(expecteds, containsInAnyOrder(actuals.toArray()));
}
public static ArrayList lazyLongIteratorToList(LazyLongIterator input) {
ArrayList inputList = new ArrayList<>();
Iterator inputIt = LazyLongIterators.eager(input);
inputIt.forEachRemaining(inputList::add);
return inputList;
}
+
+ public static String[] readZstFile(Path zstFile) throws IOException {
+ ZstdInputStream zis = new ZstdInputStream(new FileInputStream(zstFile.toFile()));
+ return (new String(zis.readAllBytes())).split("\n");
+ }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/LeavesTest.java b/java/src/test/java/org/softwareheritage/graph/LeavesTest.java
index ae8765b..1da30ad 100644
--- a/java/src/test/java/org/softwareheritage/graph/LeavesTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/LeavesTest.java
@@ -1,107 +1,107 @@
package org.softwareheritage.graph;
import java.util.ArrayList;
import org.junit.jupiter.api.Test;
import org.softwareheritage.graph.server.Endpoint;
// Avoid warnings concerning Endpoint.Output.result manual cast
@SuppressWarnings("unchecked")
public class LeavesTest extends GraphTest {
@Test
public void forwardFromSnp() {
SwhBidirectionalGraph graph = getGraph();
SWHID src = new SWHID("swh:1:snp:0000000000000000000000000000000000000020");
Endpoint endpoint = new Endpoint(graph, "forward", "*");
ArrayList expectedLeaves = new ArrayList<>();
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
ArrayList actualLeaves = (ArrayList) endpoint.leaves(new Endpoint.Input(src)).result;
GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves);
}
@Test
public void forwardFromRel() {
SwhBidirectionalGraph graph = getGraph();
SWHID src = new SWHID("swh:1:rel:0000000000000000000000000000000000000019");
Endpoint endpoint = new Endpoint(graph, "forward", "*");
ArrayList expectedLeaves = new ArrayList<>();
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000015"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000014"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000011"));
ArrayList actualLeaves = (ArrayList) endpoint.leaves(new Endpoint.Input(src)).result;
GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves);
}
@Test
public void backwardFromLeaf() {
SwhBidirectionalGraph graph = getGraph();
Endpoint endpoint1 = new Endpoint(graph, "backward", "*");
SWHID src1 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000015");
ArrayList expectedLeaves1 = new ArrayList<>();
expectedLeaves1.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000019"));
ArrayList actualLeaves1 = (ArrayList) endpoint1.leaves(new Endpoint.Input(src1)).result;
GraphTest.assertEqualsAnyOrder(expectedLeaves1, actualLeaves1);
Endpoint endpoint2 = new Endpoint(graph, "backward", "*");
SWHID src2 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000004");
ArrayList expectedLeaves2 = new ArrayList<>();
- expectedLeaves2.add(new SWHID("swh:1:ori:0000000000000000000000000000000000000021"));
+ expectedLeaves2.add(new SWHID(TEST_ORIGIN_ID));
expectedLeaves2.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000019"));
ArrayList actualLeaves2 = (ArrayList) endpoint2.leaves(new Endpoint.Input(src2)).result;
GraphTest.assertEqualsAnyOrder(expectedLeaves2, actualLeaves2);
}
@Test
public void forwardRevToRevOnly() {
SwhBidirectionalGraph graph = getGraph();
SWHID src = new SWHID("swh:1:rev:0000000000000000000000000000000000000018");
Endpoint endpoint = new Endpoint(graph, "forward", "rev:rev");
ArrayList expectedLeaves = new ArrayList<>();
expectedLeaves.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000003"));
ArrayList actualLeaves = (ArrayList) endpoint.leaves(new Endpoint.Input(src)).result;
GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves);
}
@Test
public void forwardDirToAll() {
SwhBidirectionalGraph graph = getGraph();
SWHID src = new SWHID("swh:1:dir:0000000000000000000000000000000000000008");
Endpoint endpoint = new Endpoint(graph, "forward", "dir:*");
ArrayList expectedLeaves = new ArrayList<>();
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
ArrayList actualLeaves = (ArrayList) endpoint.leaves(new Endpoint.Input(src)).result;
GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves);
}
@Test
public void backwardCntToDirDirToDir() {
SwhBidirectionalGraph graph = getGraph();
SWHID src = new SWHID("swh:1:cnt:0000000000000000000000000000000000000005");
Endpoint endpoint = new Endpoint(graph, "backward", "cnt:dir,dir:dir");
ArrayList expectedLeaves = new ArrayList<>();
expectedLeaves.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000012"));
ArrayList actualLeaves = (ArrayList) endpoint.leaves(new Endpoint.Input(src)).result;
GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves);
}
}
diff --git a/java/src/test/java/org/softwareheritage/graph/NeighborsTest.java b/java/src/test/java/org/softwareheritage/graph/NeighborsTest.java
index 86bbda9..9c6225b 100644
--- a/java/src/test/java/org/softwareheritage/graph/NeighborsTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/NeighborsTest.java
@@ -1,141 +1,141 @@
package org.softwareheritage.graph;
import java.util.ArrayList;
import org.junit.jupiter.api.Test;
import org.softwareheritage.graph.server.Endpoint;
// Avoid warnings concerning Endpoint.Output.result manual cast
@SuppressWarnings("unchecked")
public class NeighborsTest extends GraphTest {
@Test
public void zeroNeighbor() {
SwhBidirectionalGraph graph = getGraph();
ArrayList expectedNodes = new ArrayList<>();
- SWHID src1 = new SWHID("swh:1:ori:0000000000000000000000000000000000000021");
+ SWHID src1 = new SWHID(TEST_ORIGIN_ID);
Endpoint endpoint1 = new Endpoint(graph, "backward", "*");
ArrayList actuals1 = (ArrayList) endpoint1.neighbors(new Endpoint.Input(src1)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes, actuals1);
SWHID src2 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000004");
Endpoint endpoint2 = new Endpoint(graph, "forward", "*");
ArrayList actuals2 = (ArrayList) endpoint2.neighbors(new Endpoint.Input(src2)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes, actuals2);
SWHID src3 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000015");
Endpoint endpoint3 = new Endpoint(graph, "forward", "*");
ArrayList actuals3 = (ArrayList) endpoint3.neighbors(new Endpoint.Input(src3)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes, actuals3);
SWHID src4 = new SWHID("swh:1:rel:0000000000000000000000000000000000000019");
Endpoint endpoint4 = new Endpoint(graph, "backward", "*");
ArrayList actuals4 = (ArrayList) endpoint4.neighbors(new Endpoint.Input(src4)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes, actuals4);
SWHID src5 = new SWHID("swh:1:dir:0000000000000000000000000000000000000008");
Endpoint endpoint5 = new Endpoint(graph, "forward", "snp:*,rev:*,rel:*");
ArrayList actuals5 = (ArrayList) endpoint5.neighbors(new Endpoint.Input(src5)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes, actuals5);
}
@Test
public void oneNeighbor() {
SwhBidirectionalGraph graph = getGraph();
SWHID src1 = new SWHID("swh:1:rev:0000000000000000000000000000000000000003");
Endpoint endpoint1 = new Endpoint(graph, "forward", "*");
ArrayList expectedNodes1 = new ArrayList<>();
expectedNodes1.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002"));
ArrayList actuals1 = (ArrayList) endpoint1.neighbors(new Endpoint.Input(src1)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1);
SWHID src2 = new SWHID("swh:1:dir:0000000000000000000000000000000000000017");
Endpoint endpoint2 = new Endpoint(graph, "forward", "dir:cnt");
ArrayList expectedNodes2 = new ArrayList<>();
expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000014"));
ArrayList actuals2 = (ArrayList) endpoint2.neighbors(new Endpoint.Input(src2)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2);
SWHID src3 = new SWHID("swh:1:dir:0000000000000000000000000000000000000012");
Endpoint endpoint3 = new Endpoint(graph, "backward", "*");
ArrayList expectedNodes3 = new ArrayList<>();
expectedNodes3.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013"));
ArrayList actuals3 = (ArrayList) endpoint3.neighbors(new Endpoint.Input(src3)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes3, actuals3);
SWHID src4 = new SWHID("swh:1:rev:0000000000000000000000000000000000000009");
Endpoint endpoint4 = new Endpoint(graph, "backward", "rev:rev");
ArrayList expectedNodes4 = new ArrayList<>();
expectedNodes4.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013"));
ArrayList actuals4 = (ArrayList) endpoint4.neighbors(new Endpoint.Input(src4)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes4, actuals4);
SWHID src5 = new SWHID("swh:1:snp:0000000000000000000000000000000000000020");
Endpoint endpoint5 = new Endpoint(graph, "backward", "*");
ArrayList expectedNodes5 = new ArrayList<>();
- expectedNodes5.add(new SWHID("swh:1:ori:0000000000000000000000000000000000000021"));
+ expectedNodes5.add(new SWHID(TEST_ORIGIN_ID));
ArrayList actuals5 = (ArrayList) endpoint5.neighbors(new Endpoint.Input(src5)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes5, actuals5);
}
@Test
public void twoNeighbors() {
SwhBidirectionalGraph graph = getGraph();
SWHID src1 = new SWHID("swh:1:snp:0000000000000000000000000000000000000020");
Endpoint endpoint1 = new Endpoint(graph, "forward", "*");
ArrayList expectedNodes1 = new ArrayList<>();
expectedNodes1.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010"));
expectedNodes1.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000009"));
ArrayList actuals1 = (ArrayList) endpoint1.neighbors(new Endpoint.Input(src1)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1);
SWHID src2 = new SWHID("swh:1:dir:0000000000000000000000000000000000000008");
Endpoint endpoint2 = new Endpoint(graph, "forward", "dir:cnt");
ArrayList expectedNodes2 = new ArrayList<>();
expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
ArrayList actuals2 = (ArrayList) endpoint2.neighbors(new Endpoint.Input(src2)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2);
SWHID src3 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000001");
Endpoint endpoint3 = new Endpoint(graph, "backward", "*");
ArrayList expectedNodes3 = new ArrayList<>();
expectedNodes3.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000008"));
expectedNodes3.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002"));
ArrayList actuals3 = (ArrayList) endpoint3.neighbors(new Endpoint.Input(src3)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes3, actuals3);
SWHID src4 = new SWHID("swh:1:rev:0000000000000000000000000000000000000009");
Endpoint endpoint4 = new Endpoint(graph, "backward", "rev:snp,rev:rel");
ArrayList expectedNodes4 = new ArrayList<>();
expectedNodes4.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020"));
expectedNodes4.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010"));
ArrayList actuals4 = (ArrayList) endpoint4.neighbors(new Endpoint.Input(src4)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes4, actuals4);
}
@Test
public void threeNeighbors() {
SwhBidirectionalGraph graph = getGraph();
SWHID src1 = new SWHID("swh:1:dir:0000000000000000000000000000000000000008");
Endpoint endpoint1 = new Endpoint(graph, "forward", "*");
ArrayList expectedNodes1 = new ArrayList<>();
expectedNodes1.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000006"));
expectedNodes1.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
expectedNodes1.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
ArrayList actuals1 = (ArrayList) endpoint1.neighbors(new Endpoint.Input(src1)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1);
SWHID src2 = new SWHID("swh:1:rev:0000000000000000000000000000000000000009");
Endpoint endpoint2 = new Endpoint(graph, "backward", "*");
ArrayList expectedNodes2 = new ArrayList<>();
expectedNodes2.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020"));
expectedNodes2.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010"));
expectedNodes2.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013"));
ArrayList actuals2 = (ArrayList) endpoint2.neighbors(new Endpoint.Input(src2)).result;
GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2);
}
}
diff --git a/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java b/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java
index b561de9..e471799 100644
--- a/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java
@@ -1,85 +1,85 @@
package org.softwareheritage.graph;
import java.util.*;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
public class SubgraphTest extends GraphTest {
@Test
public void noFilter() {
SwhBidirectionalGraph g = getGraph();
Subgraph sg = new Subgraph(g, new AllowedNodes("*"));
for (long i = 0; i < g.numNodes(); ++i) {
Assertions.assertEquals(g.outdegree(i), sg.outdegree(i));
}
}
@Test
public void missingNode() {
SwhBidirectionalGraph g = getGraph();
Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori"));
SWHID rev1 = fakeSWHID("rev", 18);
Assertions.assertThrows(IllegalArgumentException.class, () -> {
sg.outdegree(sg.getNodeId(rev1));
});
Assertions.assertThrows(IllegalArgumentException.class, () -> {
sg.successors(sg.getNodeId(rev1));
});
}
@Test
public void outdegreeOnlyDirOri() {
SwhBidirectionalGraph g = getGraph();
Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori"));
SWHID dir1 = fakeSWHID("dir", 17);
Assertions.assertEquals(2, g.outdegree(g.getNodeId(dir1)));
Assertions.assertEquals(1, sg.outdegree(sg.getNodeId(dir1)));
SWHID dir2 = fakeSWHID("dir", 6);
Assertions.assertEquals(2, g.outdegree(g.getNodeId(dir2)));
Assertions.assertEquals(0, sg.outdegree(sg.getNodeId(dir2)));
- SWHID ori1 = fakeSWHID("ori", 21);
+ SWHID ori1 = new SWHID(TEST_ORIGIN_ID);
Assertions.assertEquals(1, g.outdegree(g.getNodeId(ori1)));
Assertions.assertEquals(0, sg.outdegree(sg.getNodeId(ori1)));
}
@Test
public void successorsOnlyDirOri() {
SwhBidirectionalGraph g = getGraph();
Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori"));
SWHID dir1 = fakeSWHID("dir", 17);
assertEqualsAnyOrder(Collections.singletonList(sg.getNodeId(fakeSWHID("dir", 16))),
lazyLongIteratorToList(sg.successors(sg.getNodeId(dir1))));
SWHID dir2 = fakeSWHID("dir", 6);
assertEqualsAnyOrder(Collections.emptyList(), lazyLongIteratorToList(sg.successors(sg.getNodeId(dir2))));
- SWHID ori1 = fakeSWHID("ori", 21);
+ SWHID ori1 = new SWHID(TEST_ORIGIN_ID);
assertEqualsAnyOrder(Collections.emptyList(), lazyLongIteratorToList(sg.successors(sg.getNodeId(ori1))));
}
@Test
public void nodeIteratorOnlyOriDir() {
SwhBidirectionalGraph g = getGraph();
Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori"));
ArrayList nodeList = new ArrayList<>();
Iterator nodeIt = sg.nodeIterator();
nodeIt.forEachRemaining(nodeList::add);
- assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(fakeSWHID("ori", 21)), sg.getNodeId(fakeSWHID("dir", 2)),
+ assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(new SWHID(TEST_ORIGIN_ID)), sg.getNodeId(fakeSWHID("dir", 2)),
sg.getNodeId(fakeSWHID("dir", 6)), sg.getNodeId(fakeSWHID("dir", 8)),
sg.getNodeId(fakeSWHID("dir", 12)), sg.getNodeId(fakeSWHID("dir", 16)),
sg.getNodeId(fakeSWHID("dir", 17))), nodeList);
sg = new Subgraph(g, new AllowedNodes("snp,rel"));
nodeList = new ArrayList<>();
nodeIt = sg.nodeIterator();
nodeIt.forEachRemaining(nodeList::add);
assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(fakeSWHID("snp", 20)), sg.getNodeId(fakeSWHID("rel", 10)),
sg.getNodeId(fakeSWHID("rel", 19))), nodeList);
}
}
diff --git a/java/src/test/java/org/softwareheritage/graph/VisitTest.java b/java/src/test/java/org/softwareheritage/graph/VisitTest.java
index 6144890..aef4b8d 100644
--- a/java/src/test/java/org/softwareheritage/graph/VisitTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/VisitTest.java
@@ -1,420 +1,408 @@
package org.softwareheritage.graph;
import java.util.ArrayList;
import java.util.Set;
import java.util.HashSet;
import org.junit.jupiter.api.Test;
import org.softwareheritage.graph.server.Endpoint;
// Avoid warnings concerning Endpoint.Output.result manual cast
@SuppressWarnings("unchecked")
public class VisitTest extends GraphTest {
private void assertSameNodesFromPaths(ArrayList paths, ArrayList nodes) {
Set expectedNodes = new HashSet();
for (SwhPath path : paths) {
expectedNodes.addAll(path.getPath());
}
GraphTest.assertEqualsAnyOrder(expectedNodes, nodes);
}
@Test
public void forwardFromRoot() {
SwhBidirectionalGraph graph = getGraph();
- SWHID swhid = new SWHID("swh:1:ori:0000000000000000000000000000000000000021");
+ SWHID swhid = new SWHID(TEST_ORIGIN_ID);
Endpoint endpoint1 = new Endpoint(graph, "forward", "*");
ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result;
Endpoint endpoint2 = new Endpoint(graph, "forward", "*");
ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result;
ArrayList expectedPaths = new ArrayList();
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000001"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000005"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000001"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000005"));
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021",
- "swh:1:snp:0000000000000000000000000000000000000020",
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID, "swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001"));
GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void forwardFromMiddle() {
SwhBidirectionalGraph graph = getGraph();
SWHID swhid = new SWHID("swh:1:dir:0000000000000000000000000000000000000012");
Endpoint endpoint1 = new Endpoint(graph, "forward", "*");
ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result;
Endpoint endpoint2 = new Endpoint(graph, "forward", "*");
ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result;
ArrayList expectedPaths = new ArrayList();
expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007"));
expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000001"));
expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004"));
expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000005"));
expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:cnt:0000000000000000000000000000000000000011"));
GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void forwardFromLeaf() {
SwhBidirectionalGraph graph = getGraph();
SWHID swhid = new SWHID("swh:1:cnt:0000000000000000000000000000000000000004");
Endpoint endpoint1 = new Endpoint(graph, "forward", "*");
ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result;
Endpoint endpoint2 = new Endpoint(graph, "forward", "*");
ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result;
ArrayList expectedPaths = new ArrayList();
expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004"));
GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void backwardFromRoot() {
SwhBidirectionalGraph graph = getGraph();
- SWHID swhid = new SWHID("swh:1:ori:0000000000000000000000000000000000000021");
+ SWHID swhid = new SWHID(TEST_ORIGIN_ID);
Endpoint endpoint1 = new Endpoint(graph, "backward", "*");
ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result;
Endpoint endpoint2 = new Endpoint(graph, "backward", "*");
ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result;
ArrayList expectedPaths = new ArrayList();
- expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021"));
+ expectedPaths.add(new SwhPath(TEST_ORIGIN_ID));
GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void backwardFromMiddle() {
SwhBidirectionalGraph graph = getGraph();
SWHID swhid = new SWHID("swh:1:dir:0000000000000000000000000000000000000012");
Endpoint endpoint1 = new Endpoint(graph, "backward", "*");
ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result;
Endpoint endpoint2 = new Endpoint(graph, "backward", "*");
ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result;
ArrayList expectedPaths = new ArrayList();
expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:rev:0000000000000000000000000000000000000018",
"swh:1:rel:0000000000000000000000000000000000000019"));
GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void backwardFromLeaf() {
// Backward visit from a leaf content node: every chain of ancestors up to a
// terminal node (release or origin) must be reported as a distinct path.
SwhBidirectionalGraph graph = getGraph();
SWHID swhid = new SWHID("swh:1:cnt:0000000000000000000000000000000000000004");
Endpoint endpoint1 = new Endpoint(graph, "backward", "*");
ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result;
Endpoint endpoint2 = new Endpoint(graph, "backward", "*");
ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result;
ArrayList expectedPaths = new ArrayList();
// Chain through rev 13's history up to release 19.
expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:rev:0000000000000000000000000000000000000018",
"swh:1:rel:0000000000000000000000000000000000000019"));
// Chain through rev 9 up to release 19.
expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:rev:0000000000000000000000000000000000000018",
"swh:1:rel:0000000000000000000000000000000000000019"));
// Chains through rev 9 up to the origin, directly via the snapshot and via
// release 10.
expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:rev:0000000000000000000000000000000000000009",
- "swh:1:snp:0000000000000000000000000000000000000020",
- "swh:1:ori:0000000000000000000000000000000000000021"));
+ "swh:1:snp:0000000000000000000000000000000000000020", TEST_ORIGIN_ID));
expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rel:0000000000000000000000000000000000000010",
- "swh:1:snp:0000000000000000000000000000000000000020",
- "swh:1:ori:0000000000000000000000000000000000000021"));
+ "swh:1:snp:0000000000000000000000000000000000000020", TEST_ORIGIN_ID));
GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void forwardSnpToRev() {
    // Forward visit restricted to snp:rev edges: from the snapshot, the only
    // reachable node is the revision it points to.
    SwhBidirectionalGraph graph = getGraph();
    SWHID swhid = new SWHID("swh:1:snp:0000000000000000000000000000000000000020");
    Endpoint endpoint1 = new Endpoint(graph, "forward", "snp:rev");
    // Parameterized collections instead of raw types; the unchecked casts
    // mirror the untyped Endpoint result field.
    ArrayList<SwhPath> paths = (ArrayList<SwhPath>) endpoint1.visitPaths(new Endpoint.Input(swhid)).result;
    Endpoint endpoint2 = new Endpoint(graph, "forward", "snp:rev");
    ArrayList<SWHID> nodes = (ArrayList<SWHID>) endpoint2.visitNodes(new Endpoint.Input(swhid)).result;
    ArrayList<SwhPath> expectedPaths = new ArrayList<>();
    expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020",
            "swh:1:rev:0000000000000000000000000000000000000009"));
    GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
    assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void forwardRelToRevRevToRev() {
    // Forward visit allowing only rel:rev and rev:rev edges: from the release
    // we can walk the revision history but never enter directories.
    SwhBidirectionalGraph graph = getGraph();
    SWHID start = new SWHID("swh:1:rel:0000000000000000000000000000000000000010");

    Endpoint pathsEndpoint = new Endpoint(graph, "forward", "rel:rev,rev:rev");
    ArrayList paths = (ArrayList) pathsEndpoint.visitPaths(new Endpoint.Input(start)).result;
    Endpoint nodesEndpoint = new Endpoint(graph, "forward", "rel:rev,rev:rev");
    ArrayList nodes = (ArrayList) nodesEndpoint.visitNodes(new Endpoint.Input(start)).result;

    // A single linear path: release -> rev 9 -> rev 3.
    ArrayList expectedPaths = new ArrayList();
    expectedPaths.add(new SwhPath("swh:1:rel:0000000000000000000000000000000000000010",
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rev:0000000000000000000000000000000000000003"));
    GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
    assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void forwardRevToAllDirToAll() {
// Forward visit from rev 13 with all rev:* and dir:* edges allowed: paths fan
// out both through the parent revision (rev 9) and through the direct tree
// (dir 12), each branch reaching the same leaf contents.
SwhBidirectionalGraph graph = getGraph();
SWHID swhid = new SWHID("swh:1:rev:0000000000000000000000000000000000000013");
Endpoint endpoint1 = new Endpoint(graph, "forward", "rev:*,dir:*");
ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result;
Endpoint endpoint2 = new Endpoint(graph, "forward", "rev:*,dir:*");
ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result;
ArrayList expectedPaths = new ArrayList();
// Each leaf content is reached twice: once via rev 9 and once via dir 12.
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000005"));
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000005"));
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004"));
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004"));
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007"));
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007"));
// cnt 11 is only present under dir 12.
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:cnt:0000000000000000000000000000000000000011"));
// cnt 1 is reachable via three routes, including rev 9's history (rev 3).
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001"));
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000001"));
expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013",
"swh:1:dir:0000000000000000000000000000000000000012",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000001"));
GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void forwardSnpToAllRevToAll() {
    // Forward visit allowing all snp:* and rev:* edges: revisions can be
    // expanded but directories and releases are terminal.
    SwhBidirectionalGraph graph = getGraph();
    SWHID start = new SWHID("swh:1:snp:0000000000000000000000000000000000000020");

    Endpoint pathsEndpoint = new Endpoint(graph, "forward", "snp:*,rev:*");
    ArrayList paths = (ArrayList) pathsEndpoint.visitPaths(new Endpoint.Input(start)).result;
    Endpoint nodesEndpoint = new Endpoint(graph, "forward", "snp:*,rev:*");
    ArrayList nodes = (ArrayList) nodesEndpoint.visitNodes(new Endpoint.Input(start)).result;

    ArrayList expectedPaths = new ArrayList();
    // Through rev 9's parent rev 3 down to its root directory.
    expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020",
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rev:0000000000000000000000000000000000000003",
            "swh:1:dir:0000000000000000000000000000000000000002"));
    // rev 9's own root directory.
    expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020",
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:dir:0000000000000000000000000000000000000008"));
    // The release branch of the snapshot, terminal under this edge filter.
    expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020",
            "swh:1:rel:0000000000000000000000000000000000000010"));
    GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
    assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void forwardNoEdges() {
    // With an empty edge restriction, no edge may be followed: the visit
    // yields only the starting node itself.
    SwhBidirectionalGraph graph = getGraph();
    SWHID start = new SWHID("swh:1:snp:0000000000000000000000000000000000000020");

    Endpoint pathsEndpoint = new Endpoint(graph, "forward", "");
    ArrayList paths = (ArrayList) pathsEndpoint.visitPaths(new Endpoint.Input(start)).result;
    Endpoint nodesEndpoint = new Endpoint(graph, "forward", "");
    ArrayList nodes = (ArrayList) nodesEndpoint.visitNodes(new Endpoint.Input(start)).result;

    ArrayList expectedPaths = new ArrayList();
    expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020"));
    GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
    assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void backwardRevToRevRevToRel() {
    // Backward visit allowing only rev:rev and rev:rel edges: from rev 3 we
    // climb the revision history until we hit a release.
    SwhBidirectionalGraph graph = getGraph();
    SWHID start = new SWHID("swh:1:rev:0000000000000000000000000000000000000003");

    Endpoint pathsEndpoint = new Endpoint(graph, "backward", "rev:rev,rev:rel");
    ArrayList paths = (ArrayList) pathsEndpoint.visitPaths(new Endpoint.Input(start)).result;
    Endpoint nodesEndpoint = new Endpoint(graph, "backward", "rev:rev,rev:rel");
    ArrayList nodes = (ArrayList) nodesEndpoint.visitNodes(new Endpoint.Input(start)).result;

    ArrayList expectedPaths = new ArrayList();
    // Long descendant chain terminating at release 19.
    expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000003",
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rev:0000000000000000000000000000000000000013",
            "swh:1:rev:0000000000000000000000000000000000000018",
            "swh:1:rel:0000000000000000000000000000000000000019"));
    // Short chain terminating at release 10.
    expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000003",
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rel:0000000000000000000000000000000000000010"));
    GraphTest.assertEqualsAnyOrder(expectedPaths, paths);
    assertSameNodesFromPaths(expectedPaths, nodes);
}
@Test
public void forwardFromRootNodesOnly() {
// Node-only forward visit from the origin: checks visitNodes in isolation,
// expecting every node reachable from the root (the whole test graph
// component hanging off the origin).
SwhBidirectionalGraph graph = getGraph();
- SWHID swhid = new SWHID("swh:1:ori:0000000000000000000000000000000000000021");
+ SWHID swhid = new SWHID(TEST_ORIGIN_ID);
Endpoint endpoint = new Endpoint(graph, "forward", "*");
ArrayList nodes = (ArrayList) endpoint.visitNodes(new Endpoint.Input(swhid)).result;
ArrayList expectedNodes = new ArrayList();
- expectedNodes.add(new SWHID("swh:1:ori:0000000000000000000000000000000000000021"));
+ expectedNodes.add(new SWHID(TEST_ORIGIN_ID));
expectedNodes.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020"));
expectedNodes.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010"));
expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000009"));
expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000003"));
expectedNodes.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002"));
expectedNodes.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
expectedNodes.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000008"));
expectedNodes.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000006"));
expectedNodes.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"));
expectedNodes.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005"));
expectedNodes.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
// Order is irrelevant; only set equality matters.
GraphTest.assertEqualsAnyOrder(expectedNodes, nodes);
}
@Test
public void backwardRevToAllNodesOnly() {
    // Node-only backward visit from rev 3, following every rev:* edge: all
    // descendant revisions plus the releases/snapshot that point at them.
    SwhBidirectionalGraph graph = getGraph();
    SWHID start = new SWHID("swh:1:rev:0000000000000000000000000000000000000003");
    Endpoint endpoint = new Endpoint(graph, "backward", "rev:*");
    ArrayList nodes = (ArrayList) endpoint.visitNodes(new Endpoint.Input(start)).result;

    ArrayList expectedNodes = new ArrayList();
    expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000003"));
    expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000009"));
    expectedNodes.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020"));
    expectedNodes.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010"));
    expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013"));
    expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000018"));
    expectedNodes.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000019"));
    GraphTest.assertEqualsAnyOrder(expectedNodes, nodes);
}
}
diff --git a/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java b/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java
new file mode 100644
index 0000000..3f13b9d
--- /dev/null
+++ b/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java
@@ -0,0 +1,106 @@
+package org.softwareheritage.graph.compress;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.softwareheritage.graph.GraphTest;
+import org.softwareheritage.graph.Node;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.TreeSet;
+
+public class ExtractNodesTest extends GraphTest {
+ /** Generate a fake SWHID for a given node type and numeric ID */
+ private static byte[] f(String type, int id) {
+ String hash = new String(DigestUtils.sha1Hex(type + id).getBytes());
+ return String.format("swh:1:%s:%s", type, hash).getBytes();
+ }
+
+ static class FakeDataset implements GraphDataset {
+ @Override
+ public void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException {
+ // For each node type, write nodes {1..4} as present in the graph
+ for (Node.Type type : Node.Type.values()) {
+ for (int i = 1; i <= 4; i++) {
+ byte[] node = f(type.toString().toLowerCase(), i);
+ nodeCb.onNode(node);
+ }
+ }
+
+ edgeCb.onEdge(f("ori", 1), f("snp", 1), null, -1);
+ edgeCb.onEdge(f("ori", 2), f("snp", 2), null, -1);
+ edgeCb.onEdge(f("ori", 3), f("snp", 3), null, -1);
+ edgeCb.onEdge(f("ori", 4), f("snp", 404), null, -1);
+
+ edgeCb.onEdge(f("snp", 1), f("rev", 1), "dup1".getBytes(), -1);
+ edgeCb.onEdge(f("snp", 1), f("rev", 1), "dup2".getBytes(), -1);
+ edgeCb.onEdge(f("snp", 3), f("cnt", 1), "c1".getBytes(), -1);
+ edgeCb.onEdge(f("snp", 4), f("rel", 1), "r1".getBytes(), -1);
+
+ edgeCb.onEdge(f("rel", 1), f("rel", 2), null, -1);
+ edgeCb.onEdge(f("rel", 2), f("rev", 1), null, -1);
+ edgeCb.onEdge(f("rel", 3), f("rev", 2), null, -1);
+ edgeCb.onEdge(f("rel", 4), f("dir", 1), null, -1);
+
+ edgeCb.onEdge(f("rev", 1), f("rev", 1), null, -1);
+ edgeCb.onEdge(f("rev", 1), f("rev", 1), null, -1);
+ edgeCb.onEdge(f("rev", 1), f("rev", 2), null, -1);
+ edgeCb.onEdge(f("rev", 2), f("rev", 404), null, -1);
+ edgeCb.onEdge(f("rev", 3), f("rev", 2), null, -1);
+ edgeCb.onEdge(f("rev", 4), f("dir", 1), null, -1);
+
+ edgeCb.onEdge(f("dir", 1), f("cnt", 1), "c1".getBytes(), 42);
+ edgeCb.onEdge(f("dir", 1), f("dir", 1), "d1".getBytes(), 1337);
+ edgeCb.onEdge(f("dir", 1), f("rev", 1), "r1".getBytes(), 0);
+ }
+ }
+
+ @Test
+ public void testExtractNodes(@TempDir Path outputDir, @TempDir Path sortTmpDir)
+ throws IOException, InterruptedException {
+ FakeDataset dataset = new FakeDataset();
+ ExtractNodes.extractNodes(dataset, outputDir.toString() + "/graph", "2M", sortTmpDir.toString());
+
+ // Check count files
+ Long nodeCount = Long.parseLong(Files.readString(outputDir.resolve("graph.nodes.count.txt")).strip());
+ Long edgeCount = Long.parseLong(Files.readString(outputDir.resolve("graph.edges.count.txt")).strip());
+ Long labelCount = Long.parseLong(Files.readString(outputDir.resolve("graph.labels.count.txt")).strip());
+ Assertions.assertEquals(26L, nodeCount);
+ Assertions.assertEquals(21L, edgeCount);
+ Assertions.assertEquals(5L, labelCount);
+
+ // Check stat files
+ List nodeStats = Files.readAllLines(outputDir.resolve("graph.nodes.stats.txt"));
+ List edgeStats = Files.readAllLines(outputDir.resolve("graph.edges.stats.txt"));
+ Assertions.assertEquals(nodeStats, List.of("cnt 4", "dir 4", "ori 4", "rel 4", "rev 5", "snp 5"));
+ Assertions.assertEquals(edgeStats, List.of("dir:cnt 1", "dir:dir 1", "dir:rev 1", "ori:snp 4", "rel:dir 1",
+ "rel:rel 1", "rel:rev 2", "rev:dir 1", "rev:rev 5", "snp:cnt 1", "snp:rel 1", "snp:rev 2"));
+
+ // Build ordered set of expected node IDs
+ TreeSet expectedNodes = new TreeSet<>();
+ for (Node.Type type : Node.Type.values()) {
+ for (int i = 1; i <= 4; i++) {
+ byte[] node = f(type.toString().toLowerCase(), i);
+ expectedNodes.add(new String(node));
+ }
+ }
+ expectedNodes.add(new String(f("snp", 404)));
+ expectedNodes.add(new String(f("rev", 404)));
+ String[] nodeLines = readZstFile(outputDir.resolve("graph.nodes.csv.zst"));
+ Assertions.assertArrayEquals(expectedNodes.toArray(new String[0]), nodeLines);
+
+ // Build ordered set of expected label IDs
+ TreeSet expectedLabels = new TreeSet<>();
+ expectedLabels.add("dup1");
+ expectedLabels.add("dup2");
+ expectedLabels.add("c1");
+ expectedLabels.add("r1");
+ expectedLabels.add("d1");
+ String[] labelLines = readZstFile(outputDir.resolve("graph.labels.csv.zst"));
+ Assertions.assertArrayEquals(expectedLabels.toArray(new String[0]), labelLines);
+ }
+}
diff --git a/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java b/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java
new file mode 100644
index 0000000..9089d0d
--- /dev/null
+++ b/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java
@@ -0,0 +1,76 @@
+package org.softwareheritage.graph.compress;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.softwareheritage.graph.GraphTest;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+public class ExtractPersonsTest extends GraphTest {
+ private static class FakeORCDataset extends ORCGraphDataset {
+ private static class FakeSwhOrcTable extends ORCGraphDataset.SwhOrcTable {
+ private final String tableName;
+
+ public FakeSwhOrcTable(String tableName) {
+ this.tableName = tableName;
+ }
+
+ @Override
+ public void readBytes64Column(String longColumn, BytesCallback cb) throws IOException {
+ if (tableName.equals("revision") && longColumn.equals("author")) {
+ cb.onBytes(fakeSWHID("rev", 1).toBytes(), "rev_author_1".getBytes());
+ cb.onBytes(fakeSWHID("rev", 2).toBytes(), "rev_author_1".getBytes());
+ cb.onBytes(fakeSWHID("rev", 3).toBytes(), "rev_author_2".getBytes());
+ cb.onBytes(fakeSWHID("rev", 4).toBytes(), "rev_author_1".getBytes());
+ cb.onBytes(fakeSWHID("rev", 5).toBytes(), "rev_author_3".getBytes());
+ } else if (tableName.equals("revision") && longColumn.equals("committer")) {
+ cb.onBytes(fakeSWHID("rev", 1).toBytes(), "rev_committer_1".getBytes());
+ cb.onBytes(fakeSWHID("rev", 2).toBytes(), "rev_committer_1".getBytes());
+ cb.onBytes(fakeSWHID("rev", 3).toBytes(), "rev_committer_2".getBytes());
+ cb.onBytes(fakeSWHID("rev", 4).toBytes(), "rev_author_2".getBytes());
+ cb.onBytes(fakeSWHID("rev", 5).toBytes(), "rev_author_1".getBytes());
+ cb.onBytes(fakeSWHID("rev", 6).toBytes(), "rev_committer_1".getBytes());
+ } else if (tableName.equals("release") && longColumn.equals("author")) {
+ cb.onBytes(fakeSWHID("rel", 1).toBytes(), "rel_committer_1".getBytes());
+ cb.onBytes(fakeSWHID("rel", 2).toBytes(), "rel_committer_1".getBytes());
+ cb.onBytes(fakeSWHID("rel", 3).toBytes(), "rel_committer_2".getBytes());
+ cb.onBytes(fakeSWHID("rel", 4).toBytes(), "rev_author_2".getBytes());
+ cb.onBytes(fakeSWHID("rel", 5).toBytes(), "rev_author_1".getBytes());
+ cb.onBytes(fakeSWHID("rel", 6).toBytes(), "rev_committer_1".getBytes());
+ cb.onBytes(fakeSWHID("rel", 7).toBytes(), "rel_committer_1".getBytes());
+ } else {
+ throw new RuntimeException("Unknown table/column: " + tableName + "/" + longColumn);
+ }
+ }
+ }
+
+ public SwhOrcTable getTable(String tableName) {
+ return new FakeSwhOrcTable(tableName);
+ }
+ }
+
+ @Test
+ public void testExtractPersons(@TempDir Path outputDir, @TempDir Path sortTmpDir)
+ throws IOException, InterruptedException {
+
+ FakeORCDataset fakeORCDataset = new FakeORCDataset();
+ ExtractPersons.extractPersons(fakeORCDataset, outputDir.toString() + "/graph", "2M", sortTmpDir.toString());
+
+ ArrayList expectedPersons = new ArrayList<>(Arrays.asList("rev_author_1", "rev_author_2",
+ "rev_author_3", "rev_committer_1", "rev_committer_2", "rel_committer_1", "rel_committer_2"));
+
+ // Check count files
+ Long personsCount = Long.parseLong(Files.readString(outputDir.resolve("graph.persons.count.txt")).strip());
+ Assertions.assertEquals(expectedPersons.size(), personsCount);
+
+ // Check persons
+ expectedPersons.sort(String::compareTo);
+ String[] personLines = readZstFile(outputDir.resolve("graph.persons.csv.zst"));
+ Assertions.assertArrayEquals(expectedPersons.toArray(new String[0]), personLines);
+ }
+}
diff --git a/swh/graph/backend.py b/swh/graph/backend.py
index 5fb82f5..16fa14e 100644
--- a/swh/graph/backend.py
+++ b/swh/graph/backend.py
@@ -1,176 +1,176 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import asyncio
import contextlib
import io
import os
import re
import subprocess
import sys
import tempfile
from py4j.java_gateway import JavaGateway
from py4j.protocol import Py4JJavaError
from swh.graph.config import check_config
BUF_LINES = 1024
def _get_pipe_stderr():
# Get stderr if possible, or pipe to stdout if running with Jupyter.
try:
sys.stderr.fileno()
except io.UnsupportedOperation:
return subprocess.STDOUT
else:
return sys.stderr
class Backend:
    """Handle on the Java graph server, driven over a py4j gateway.

    Usable as a context manager: entering starts the JVM and loads the
    compressed graph, exiting shuts the gateway down.
    """

    def __init__(self, graph_path, config=None):
        self.gateway = None
        self.entry = None
        self.graph_path = graph_path
        self.config = check_config(config or {})

    def start_gateway(self):
        """Launch the JVM gateway and load the compressed graph."""
        self.gateway = JavaGateway.launch_gateway(
            java_path=None,
            javaopts=self.config["java_tool_options"].split(),
            classpath=self.config["classpath"],
            die_on_exit=True,
            redirect_stdout=sys.stdout,
            redirect_stderr=_get_pipe_stderr(),
        )
        self.entry = self.gateway.jvm.org.softwareheritage.graph.Entry()
        self.entry.load_graph(self.graph_path)
        self.stream_proxy = JavaStreamProxy(self.entry)

    def stop_gateway(self):
        self.gateway.shutdown()

    def __enter__(self):
        self.start_gateway()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.stop_gateway()

    def stats(self):
        return self.entry.stats()

    def check_swhid(self, swhid):
        """Validate a SWHID on the Java side, translating Java exceptions
        into ValueError (malformed) or NameError (unknown)."""
        try:
            self.entry.check_swhid(swhid)
        except Py4JJavaError as e:
            m = re.search(r"malformed SWHID: (\w+)", str(e))
            if m:
                raise ValueError(f"malformed SWHID: {m[1]}")
            # [:\w]+ so that the full colon-separated SWHID is captured.
            m = re.search(r"Unknown SWHID: ([:\w]+)", str(e))
            if m:
                raise NameError(f"Unknown SWHID: {m[1]}")
            raise

    def count(self, ttype, *args):
        # Dispatch to the matching count_* method on the Java Entry object.
        return getattr(self.entry, "count_" + ttype)(*args)

    async def traversal(self, ttype, *args):
        # Stream traversal results line by line from the Java side.
        method = getattr(self.stream_proxy, ttype)
        async for line in method(*args):
            yield line.decode().rstrip("\n")
class JavaStreamProxy:
    """A proxy class for the org.softwareheritage.graph.Entry Java class that
    takes care of the setup and teardown of the named-pipe FIFO communication
    between Python and Java.

    Initialize JavaStreamProxy using:

        proxy = JavaStreamProxy(swh_entry_class_instance)

    Then you can call an Entry method and iterate on the FIFO results like
    this:

        async for value in proxy.java_method(arg1, arg2):
            print(value)
    """

    def __init__(self, entry):
        self.entry = entry

    async def read_node_ids(self, fname):
        """Async generator yielding raw lines read from the FIFO at fname."""
        loop = asyncio.get_event_loop()
        open_thread = loop.run_in_executor(None, open, fname, "rb")
        # Since the open() call on the FIFO is blocking until it is also opened
        # on the Java side, we await it with a timeout in case there is an
        # exception that prevents the write-side open().
        with (await asyncio.wait_for(open_thread, timeout=2)) as f:

            def read_n_lines(f, n):
                # Read up to n lines; fewer means the writer closed the FIFO.
                buf = []
                for _ in range(n):
                    try:
                        buf.append(next(f))
                    except StopIteration:
                        break
                return buf

            while True:
                lines = await loop.run_in_executor(None, read_n_lines, f, BUF_LINES)
                if not lines:
                    break
                for line in lines:
                    yield line

    class _HandlerWrapper:
        """Wrap a Java handler so each method call becomes an asyncio task
        running the (blocking) py4j call in an executor thread."""

        def __init__(self, handler):
            self._handler = handler

        def __getattr__(self, name):
            func = getattr(self._handler, name)

            async def java_call(*args, **kwargs):
                loop = asyncio.get_event_loop()
                await loop.run_in_executor(None, lambda: func(*args, **kwargs))

            def java_task(*args, **kwargs):
                return asyncio.create_task(java_call(*args, **kwargs))

            return java_task

    @contextlib.contextmanager
    def get_handler(self):
        # Create a throwaway FIFO shared by the Java writer and our reader.
        with tempfile.TemporaryDirectory(prefix="swh-graph-") as tmpdirname:
            cli_fifo = os.path.join(tmpdirname, "swh-graph.fifo")
            os.mkfifo(cli_fifo)
            reader = self.read_node_ids(cli_fifo)
            query_handler = self.entry.get_handler(cli_fifo)
            handler = self._HandlerWrapper(query_handler)
            yield (handler, reader)

    def __getattr__(self, name):
        async def java_call_iterator(*args, **kwargs):
            with self.get_handler() as (handler, reader):
                java_task = getattr(handler, name)(*args, **kwargs)
                try:
                    async for value in reader:
                        yield value
                except asyncio.TimeoutError:
                    # If the read-side open() timeouts, an exception on the
                    # Java side probably happened that prevented the
                    # write-side open(). We propagate this exception here if
                    # that is the case.
                    task_exc = java_task.exception()
                    if task_exc:
                        raise task_exc
                    raise
                await java_task

        return java_call_iterator
diff --git a/swh/graph/cli.py b/swh/graph/cli.py
index 7d399ac..82cf1fd 100644
--- a/swh/graph/cli.py
+++ b/swh/graph/cli.py
@@ -1,447 +1,449 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from pathlib import Path
import sys
from typing import TYPE_CHECKING, Any, Dict, Set, Tuple
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
import click
from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup
from swh.core.cli import swh as swh_cli_group
if TYPE_CHECKING:
from swh.graph.webgraph import CompressionStep # noqa
class StepOption(click.ParamType):
    """click type for specifying a compression step on the CLI

    parse either individual steps, specified as step names or integers, or step
    ranges
    """

    name = "compression step"

    def convert(self, value, param, ctx):  # type: (...) -> Set[CompressionStep]
        from swh.graph.webgraph import COMP_SEQ, CompressionStep  # noqa

        steps: Set[CompressionStep] = set()
        for spec in value.split(","):
            if "-" in spec:  # step range
                (raw_l, raw_r) = spec.split("-", maxsplit=1)
                if raw_l == "":  # no left endpoint
                    raw_l = COMP_SEQ[0].name
                if raw_r == "":  # no right endpoint
                    raw_r = COMP_SEQ[-1].name
                # Recursively parse both endpoints; each must be a single step.
                l_step = self.convert(raw_l, param, ctx)
                r_step = self.convert(raw_r, param, ctx)
                if len(l_step) != 1 or len(r_step) != 1:
                    self.fail(f"invalid step specification: {value}, see --help")
                l_idx = l_step.pop()
                r_idx = r_step.pop()
                steps |= {
                    CompressionStep(i) for i in range(l_idx.value, r_idx.value + 1)
                }
            else:  # singleton step
                try:
                    steps.add(CompressionStep(int(spec)))  # integer step
                except ValueError:
                    try:
                        steps.add(CompressionStep[spec.upper()])  # step name
                    except KeyError:
                        self.fail(f"invalid step specification: {value}, see --help")
        return steps
class PathlibPath(click.Path):
    """A Click path argument that returns a pathlib Path, not a string"""

    def convert(self, value, param, ctx):
        # Let click.Path do the validation, then wrap the result in a Path.
        return Path(super().convert(value, param, ctx))
DEFAULT_CONFIG: Dict[str, Tuple[str, Any]] = {"graph": ("dict", {})}
@swh_cli_group.group(name="graph", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup)
@click.option(
    "--config-file",
    "-C",
    default=None,
    type=click.Path(exists=True, dir_okay=False,),
    help="YAML configuration file",
)
@click.pass_context
def graph_cli_group(ctx, config_file):
    """Software Heritage graph tools."""
    from swh.core import config

    ctx.ensure_object(dict)
    conf = config.read(config_file, DEFAULT_CONFIG)
    # A "graph" stanza is mandatory; fail early with a precise message.
    if "graph" not in conf:
        raise ValueError(
            'no "graph" stanza found in configuration file %s' % config_file
        )
    ctx.obj["config"] = conf
@graph_cli_group.command("api-client")
@click.option("--host", default="localhost", help="Graph server host")
@click.option("--port", default="5009", help="Graph server port")
@click.pass_context
def api_client(ctx, host, port):
    """client for the graph RPC service"""
    from swh.graph import client

    app = client.RemoteGraphClient("http://{}:{}".format(host, port))
    # TODO: run web app
    print(app.stats())
@graph_cli_group.group("map")
@click.pass_context
def map(ctx):
    """Manage swh-graph on-disk maps"""
    # Pure command group: subcommands carry the actual behavior.
    pass
def dump_swhid2node(filename):
    """Dump a binary SWHID->node map to stdout as tab-separated text.

    Args:
        filename: path of the on-disk SwhidToNodeMap file
    """
    from swh.graph.swhid import SwhidToNodeMap

    # Loop variable renamed from `int`, which shadowed the builtin.
    for (swhid, node_id) in SwhidToNodeMap(filename):
        print("{}\t{}".format(swhid, node_id))
def dump_node2swhid(filename):
    """Dump a binary node->SWHID map to stdout as tab-separated text.

    Args:
        filename: path of the on-disk NodeToSwhidMap file
    """
    from swh.graph.swhid import NodeToSwhidMap

    # Loop variable renamed from `int`, which shadowed the builtin.
    for (node_id, swhid) in NodeToSwhidMap(filename):
        print("{}\t{}".format(node_id, swhid))
def restore_swhid2node(filename):
    """read a textual SWHID->int map from stdin and write its binary version to
    filename
    """
    from swh.graph.swhid import SwhidToNodeMap

    with open(filename, "wb") as dst:
        # One "<swhid> <int>" record per input line.
        for line in sys.stdin:
            (str_swhid, str_int) = line.split()
            SwhidToNodeMap.write_record(dst, str_swhid, int(str_int))
def restore_node2swhid(filename, length):
    """read a textual int->SWHID map from stdin and write its binary version to
    filename
    """
    from swh.graph.swhid import NodeToSwhidMap

    # The map is written by direct assignment at each node index, hence the
    # required up-front length.
    node2swhid = NodeToSwhidMap(filename, mode="wb", length=length)
    for line in sys.stdin:
        (str_int, str_swhid) = line.split()
        node2swhid[int(str_int)] = str_swhid
    node2swhid.close()
@map.command("dump")
@click.option(
    "--type",
    "-t",
    "map_type",
    required=True,
    type=click.Choice(["swhid2node", "node2swhid"]),
    help="type of map to dump",
)
@click.argument("filename", required=True, type=click.Path(exists=True))
@click.pass_context
def dump_map(ctx, map_type, filename):
    """Dump a binary SWHID<->node map to textual format."""
    if map_type == "swhid2node":
        dump_swhid2node(filename)
    elif map_type == "node2swhid":
        dump_node2swhid(filename)
    else:
        # Defensive only: click.Choice already rejects other values.
        raise ValueError("invalid map type: " + map_type)
@map.command("restore")
@click.option(
    "--type",
    "-t",
    "map_type",
    required=True,
    type=click.Choice(["swhid2node", "node2swhid"]),
    help="type of map to dump",
)
@click.option(
    "--length",
    "-l",
    type=int,
    help="""map size in number of logical records
    (required for node2swhid maps)""",
)
@click.argument("filename", required=True, type=click.Path())
@click.pass_context
def restore_map(ctx, map_type, length, filename):
    """Restore a binary SWHID<->node map from textual format."""
    if map_type == "swhid2node":
        restore_swhid2node(filename)
    elif map_type == "node2swhid":
        # node2swhid maps are fixed-size records; the length is mandatory.
        if length is None:
            raise click.UsageError(
                "map length is required when restoring {} maps".format(map_type), ctx
            )
        restore_node2swhid(filename, length)
    else:
        raise ValueError("invalid map type: " + map_type)
@map.command("write")
@click.option(
    "--type",
    "-t",
    "map_type",
    required=True,
    type=click.Choice(["swhid2node", "node2swhid"]),
    help="type of map to write",
)
@click.argument("filename", required=True, type=click.Path())
@click.pass_context
def write(ctx, map_type, filename):
    """Write a map to disk sequentially.

    read from stdin a textual SWHID->node mapping (for swhid2node, or a simple
    sequence of SWHIDs for node2swhid) and write it to disk in the requested binary
    map format

    note that no sorting is applied, so the input should already be sorted as
    required by the chosen map type (by SWHID for swhid2node, by int for node2swhid)
    """
    from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap

    with open(filename, "wb") as f:
        if map_type == "swhid2node":
            for line in sys.stdin:
                # Each line is "<swhid> <int>".
                (swhid, int_str) = line.rstrip().split(maxsplit=1)
                SwhidToNodeMap.write_record(f, swhid, int(int_str))
        elif map_type == "node2swhid":
            for line in sys.stdin:
                # Each line is a bare SWHID; the node id is implicit (line order).
                NodeToSwhidMap.write_record(f, line.rstrip())
        else:
            raise ValueError("invalid map type: " + map_type)
@map.command("lookup")
@click.option(
"--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename"
)
@click.argument("identifiers", nargs=-1)
def map_lookup(graph, identifiers):
"""Lookup identifiers using on-disk maps.
Depending on the identifier type lookup either a SWHID into a SWHID->node (and
return the node integer identifier) or, vice-versa, lookup a node integer
identifier into a node->SWHID (and return the SWHID). The desired behavior is
chosen depending on the syntax of each given identifier.
Identifiers can be passed either directly on the command line or on
standard input, separate by blanks. Logical lines (as returned by
readline()) in stdin will be preserved in stdout.
"""
from swh.graph.backend import NODE2SWHID_EXT, SWHID2NODE_EXT
from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap
import swh.model.exceptions
from swh.model.swhids import ExtendedSWHID
success = True # no identifiers failed to be looked up
swhid2node = SwhidToNodeMap(f"{graph}.{SWHID2NODE_EXT}")
node2swhid = NodeToSwhidMap(f"{graph}.{NODE2SWHID_EXT}")
def lookup(identifier):
nonlocal success, swhid2node, node2swhid
is_swhid = None
try:
int(identifier)
is_swhid = False
except ValueError:
try:
ExtendedSWHID.from_string(identifier)
is_swhid = True
except swh.model.exceptions.ValidationError:
success = False
logging.error(f'invalid identifier: "{identifier}", skipping')
try:
if is_swhid:
return str(swhid2node[identifier])
else:
return node2swhid[int(identifier)]
except KeyError:
success = False
logging.error(f'identifier not found: "{identifier}", skipping')
if identifiers: # lookup identifiers passed via CLI
for identifier in identifiers:
print(lookup(identifier))
else: # lookup identifiers passed via stdin, preserving logical lines
for line in sys.stdin:
results = [lookup(id) for id in line.rstrip().split()]
if results: # might be empty if all IDs on the same line failed
print(" ".join(results))
sys.exit(0 if success else 1)
@graph_cli_group.command(name="rpc-serve")
@click.option(
"--host",
"-h",
default="0.0.0.0",
metavar="IP",
show_default=True,
help="host IP address to bind the server on",
)
@click.option(
"--port",
"-p",
default=5009,
type=click.INT,
metavar="PORT",
show_default=True,
help="port to bind the server on",
)
@click.option(
"--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename"
)
@click.pass_context
def serve(ctx, host, port, graph):
"""run the graph RPC service"""
import aiohttp
from swh.graph.server.app import make_app
config = ctx.obj["config"]
config.setdefault("graph", {})
config["graph"]["path"] = graph
app = make_app(config=config)
aiohttp.web.run_app(app, host=host, port=port)
@graph_cli_group.command()
@click.option(
- "--graph",
- "-g",
+ "--input-dataset",
+ "-i",
required=True,
- metavar="GRAPH",
type=PathlibPath(),
- help="input graph basename",
+ help="graph dataset directory, in ORC format",
)
@click.option(
- "--outdir",
+ "--output-directory",
"-o",
- "out_dir",
required=True,
- metavar="DIR",
+ type=PathlibPath(),
+ help="directory where to store compressed graph",
+)
+@click.option(
+ "--graph-name",
+ "-g",
+ default="graph",
type=PathlibPath(),
help="directory where to store compressed graph",
)
@click.option(
"--steps",
"-s",
metavar="STEPS",
type=StepOption(),
help="run only these compression steps (default: all steps)",
)
@click.pass_context
-def compress(ctx, graph, out_dir, steps):
+def compress(ctx, input_dataset, output_directory, graph_name, steps):
"""Compress a graph using WebGraph
Input: a pair of files g.nodes.csv.gz, g.edges.csv.gz
Output: a directory containing a WebGraph compressed graph
Compression steps are: (1) mph, (2) bv, (3) bfs, (4) permute_bfs,
(5) transpose_bfs, (6) simplify, (7) llp, (8) permute_llp, (9) obl, (10)
compose_orders, (11) stats, (12) transpose, (13) transpose_obl, (14) maps,
(15) clean_tmp. Compression steps can be selected by name or number using
--steps, separating them with commas; step ranges (e.g., 3-9, 6-, etc.) are
also supported.
"""
from swh.graph import webgraph
- graph_name = graph.name
- in_dir = graph.parent
try:
conf = ctx.obj["config"]["graph"]["compress"]
except KeyError:
conf = {} # use defaults
- webgraph.compress(graph_name, in_dir, out_dir, steps, conf)
+ webgraph.compress(graph_name, input_dataset, output_directory, steps, conf)
@graph_cli_group.command(name="cachemount")
@click.option(
"--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename"
)
@click.option(
"--cache",
"-c",
default="/dev/shm/swh-graph/default",
metavar="CACHE",
type=PathlibPath(),
help="Memory cache path (defaults to /dev/shm/swh-graph/default)",
)
@click.pass_context
def cachemount(ctx, graph, cache):
"""
Cache the mmapped files of the compressed graph in a tmpfs.
This command creates a new directory at the path given by CACHE that has
the same structure as the compressed graph basename, except it copies the
files that require mmap access (:file:`{*}.graph`) but uses symlinks from the source
for all the other files (:file:`{*}.map`, :file:`{*}.bin`, ...).
The command outputs the path to the memory cache directory (particularly
useful when relying on the default value).
"""
import shutil
cache.mkdir(parents=True)
for src in Path(graph).parent.glob("*"):
dst = cache / src.name
if src.suffix == ".graph":
shutil.copy2(src, dst)
else:
dst.symlink_to(src.resolve())
print(cache)
def main():
return graph_cli_group(auto_envvar_prefix="SWH_GRAPH")
if __name__ == "__main__":
main()
diff --git a/swh/graph/config.py b/swh/graph/config.py
index f144f26..b47e48b 100644
--- a/swh/graph/config.py
+++ b/swh/graph/config.py
@@ -1,115 +1,116 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from pathlib import Path
import sys
import psutil
def find_graph_jar():
"""find swh-graph.jar, containing the Java part of swh-graph
look both in development directories and installed data (for in-production
deployments who fecthed the JAR from pypi)
"""
swh_graph_root = Path(__file__).parents[2]
try_paths = [
swh_graph_root / "java/target/",
Path(sys.prefix) / "share/swh-graph/",
Path(sys.prefix) / "local/share/swh-graph/",
]
for path in try_paths:
glob = list(path.glob("swh-graph-*.jar"))
if glob:
if len(glob) > 1:
logging.warn(
"found multiple swh-graph JARs, " "arbitrarily picking one"
)
logging.info("using swh-graph JAR: {0}".format(glob[0]))
return str(glob[0])
raise RuntimeError("swh-graph JAR not found. Have you run `make java`?")
def check_config(conf):
"""check configuration and propagate defaults"""
conf = conf.copy()
if "batch_size" not in conf:
# Use 0.1% of the RAM as a batch size:
# ~1 billion for big servers, ~10 million for small desktop machines
conf["batch_size"] = int(psutil.virtual_memory().total / 1000)
if "llp_gammas" not in conf:
conf["llp_gammas"] = "-0,-1,-2,-3,-4"
if "max_ram" not in conf:
conf["max_ram"] = str(psutil.virtual_memory().total)
if "java_tool_options" not in conf:
conf["java_tool_options"] = " ".join(
[
"-Xmx{max_ram}",
"-XX:PretenureSizeThreshold=512M",
"-XX:MaxNewSize=4G",
"-XX:+UseLargePages",
"-XX:+UseTransparentHugePages",
"-XX:+UseNUMA",
"-XX:+UseTLAB",
"-XX:+ResizeTLAB",
]
)
conf["java_tool_options"] = conf["java_tool_options"].format(
max_ram=conf["max_ram"]
)
if "java" not in conf:
conf["java"] = "java"
if "classpath" not in conf:
conf["classpath"] = find_graph_jar()
return conf
def check_config_compress(config, graph_name, in_dir, out_dir):
"""check compression-specific configuration and initialize its execution
environment.
"""
conf = check_config(config)
conf["graph_name"] = graph_name
conf["in_dir"] = str(in_dir)
conf["out_dir"] = str(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
if "tmp_dir" not in conf:
tmp_dir = out_dir / "tmp"
conf["tmp_dir"] = str(tmp_dir)
else:
tmp_dir = Path(conf["tmp_dir"])
tmp_dir.mkdir(parents=True, exist_ok=True)
if "logback" not in conf:
logback_confpath = tmp_dir / "logback.xml"
with open(logback_confpath, "w") as conffile:
conffile.write(
"""
-
+
%d %r %p [%t] %logger{1} - %m%n
+ System.err
-
+
"""
)
conf["logback"] = str(logback_confpath)
conf["java_tool_options"] += " -Dlogback.configurationFile={logback}"
conf["java_tool_options"] += " -Djava.io.tmpdir={tmp_dir}"
conf["java_tool_options"] = conf["java_tool_options"].format(
logback=conf["logback"], tmp_dir=conf["tmp_dir"],
)
return conf
diff --git a/swh/graph/tests/conftest.py b/swh/graph/tests/conftest.py
index fed877b..09038e7 100644
--- a/swh/graph/tests/conftest.py
+++ b/swh/graph/tests/conftest.py
@@ -1,59 +1,71 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import csv
import multiprocessing
from pathlib import Path
+import subprocess
from aiohttp.test_utils import TestClient, TestServer, loop_context
import pytest
from swh.graph.client import RemoteGraphClient
from swh.graph.naive_client import NaiveClient
SWH_GRAPH_TESTS_ROOT = Path(__file__).parents[0]
-TEST_GRAPH_PATH = SWH_GRAPH_TESTS_ROOT / "dataset/output/example"
+TEST_GRAPH_PATH = SWH_GRAPH_TESTS_ROOT / "dataset/compressed/example"
class GraphServerProcess(multiprocessing.Process):
def __init__(self, q, *args, **kwargs):
self.q = q
super().__init__(*args, **kwargs)
def run(self):
# Lazy import to allow debian packaging
from swh.graph.backend import Backend
from swh.graph.server.app import make_app
try:
backend = Backend(graph_path=str(TEST_GRAPH_PATH))
with loop_context() as loop:
app = make_app(backend=backend, debug=True)
client = TestClient(TestServer(app), loop=loop)
loop.run_until_complete(client.start_server())
url = client.make_url("/graph/")
self.q.put(url)
loop.run_forever()
except Exception as e:
self.q.put(e)
@pytest.fixture(scope="module", params=["remote", "naive"])
def graph_client(request):
if request.param == "remote":
queue = multiprocessing.Queue()
server = GraphServerProcess(queue)
server.start()
res = queue.get()
if isinstance(res, Exception):
raise res
yield RemoteGraphClient(str(res))
server.terminate()
else:
- with open(SWH_GRAPH_TESTS_ROOT / "dataset/example.nodes.csv") as fd:
- nodes = [node for (node,) in csv.reader(fd, delimiter=" ")]
- with open(SWH_GRAPH_TESTS_ROOT / "dataset/example.edges.csv") as fd:
- edges = list(csv.reader(fd, delimiter=" "))
- yield NaiveClient(nodes=nodes, edges=edges)
+
+ def zstdcat(*files):
+ p = subprocess.run(["zstdcat", *files], stdout=subprocess.PIPE)
+ return p.stdout.decode()
+
+ edges_dataset = SWH_GRAPH_TESTS_ROOT / "dataset/edges"
+ edge_files = edges_dataset.glob("*/*.edges.csv.zst")
+ node_files = edges_dataset.glob("*/*.nodes.csv.zst")
+
+ nodes = set(zstdcat(*node_files).strip().split("\n"))
+ edge_lines = [line.split() for line in zstdcat(*edge_files).strip().split("\n")]
+ edges = [(src, dst) for src, dst, *_ in edge_lines]
+ for src, dst in edges:
+ nodes.add(src)
+ nodes.add(dst)
+
+ yield NaiveClient(nodes=list(nodes), edges=edges)
diff --git a/swh/graph/tests/dataset/.gitignore b/swh/graph/tests/dataset/.gitignore
deleted file mode 100644
index 531c841..0000000
--- a/swh/graph/tests/dataset/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-docker/
-output/*-bv.*
-output/stderr
-output/stdout
-output/compression.log
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets b/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets
new file mode 100644
index 0000000..bd285c7
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets
@@ -0,0 +1,2 @@
+鰝
+Á=0
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labels b/swh/graph/tests/dataset/compressed/example-labelled.labels
new file mode 100644
index 0000000..48d3dcf
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example-labelled.labels
@@ -0,0 +1 @@
+D$@&(pPu
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-labelled.properties b/swh/graph/tests/dataset/compressed/example-labelled.properties
new file mode 100644
index 0000000..4f4c55a
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example-labelled.properties
@@ -0,0 +1,3 @@
+graphclass = it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph
+labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,6)
+underlyinggraph = example
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.graph b/swh/graph/tests/dataset/compressed/example-transposed.graph
new file mode 100644
index 0000000..cfb586b
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example-transposed.graph
@@ -0,0 +1 @@
+^utZk"5rk bt
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/output/example-transposed.obl b/swh/graph/tests/dataset/compressed/example-transposed.obl
similarity index 77%
rename from swh/graph/tests/dataset/output/example-transposed.obl
rename to swh/graph/tests/dataset/compressed/example-transposed.obl
index 54f0ac8..aca6075 100644
Binary files a/swh/graph/tests/dataset/output/example-transposed.obl and b/swh/graph/tests/dataset/compressed/example-transposed.obl differ
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.offsets b/swh/graph/tests/dataset/compressed/example-transposed.offsets
new file mode 100644
index 0000000..593648b
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example-transposed.offsets
@@ -0,0 +1,2 @@
+(J(P
+ t(
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example-transposed.properties b/swh/graph/tests/dataset/compressed/example-transposed.properties
new file mode 100644
index 0000000..1d93cdb
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example-transposed.properties
@@ -0,0 +1,35 @@
+#BVGraph properties
+#Tue Mar 29 15:31:01 CEST 2022
+bitsforreferences=29
+avgbitsforintervals=0.762
+graphclass=it.unimi.dsi.big.webgraph.BVGraph
+avgdist=0.476
+successoravggap=4.978
+residualexpstats=10,3,2,2,2
+arcs=23
+minintervallength=4
+bitsforoutdegrees=61
+residualavgloggap=1.9652671721176243
+avgbitsforoutdegrees=2.905
+bitsforresiduals=89
+successoravgloggap=1.9474472774811822
+maxrefcount=3
+successorexpstats=12,4,2,3,2
+residualarcs=19
+avgbitsforresiduals=4.238
+avgbitsforblocks=0.19
+windowsize=7
+residualavggap=5.184
+copiedarcs=4
+avgbitsforreferences=1.381
+version=0
+compratio=1.57
+bitsperlink=8.652
+compressionflags=
+nodes=21
+avgref=0.19
+zetak=3
+bitsforintervals=16
+intervalisedarcs=0
+bitspernode=9.476
+bitsforblocks=4
diff --git a/swh/graph/tests/dataset/compressed/example.edges.count.txt b/swh/graph/tests/dataset/compressed/example.edges.count.txt
new file mode 100644
index 0000000..4099407
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.edges.count.txt
@@ -0,0 +1 @@
+23
diff --git a/swh/graph/tests/dataset/compressed/example.edges.stats.txt b/swh/graph/tests/dataset/compressed/example.edges.stats.txt
new file mode 100644
index 0000000..c9b8ac7
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.edges.stats.txt
@@ -0,0 +1,8 @@
+dir:cnt 8
+dir:dir 3
+ori:snp 1
+rel:rev 2
+rev:dir 4
+rev:rev 3
+snp:rel 1
+snp:rev 1
diff --git a/swh/graph/tests/dataset/compressed/example.graph b/swh/graph/tests/dataset/compressed/example.graph
new file mode 100644
index 0000000..0874b6f
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.graph
@@ -0,0 +1 @@
+]ߗ]'s^KT
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/output/example.indegree b/swh/graph/tests/dataset/compressed/example.indegree
similarity index 100%
rename from swh/graph/tests/dataset/output/example.indegree
rename to swh/graph/tests/dataset/compressed/example.indegree
diff --git a/swh/graph/tests/dataset/compressed/example.labels.count.txt b/swh/graph/tests/dataset/compressed/example.labels.count.txt
new file mode 100644
index 0000000..45a4fb7
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.labels.count.txt
@@ -0,0 +1 @@
+8
diff --git a/swh/graph/tests/dataset/compressed/example.labels.csv.zst b/swh/graph/tests/dataset/compressed/example.labels.csv.zst
new file mode 100644
index 0000000..1cc8931
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.labels.csv.zst differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray b/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray
new file mode 100644
index 0000000..01451e0
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers b/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers
new file mode 100644
index 0000000..755c4c7
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers differ
diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.properties b/swh/graph/tests/dataset/compressed/example.labels.fcl.properties
new file mode 100644
index 0000000..deeac3a
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.labels.fcl.properties
@@ -0,0 +1,2 @@
+n=8
+ratio=4
diff --git a/swh/graph/tests/dataset/compressed/example.labels.mph b/swh/graph/tests/dataset/compressed/example.labels.mph
new file mode 100644
index 0000000..58f7d19
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.labels.mph differ
diff --git a/swh/graph/tests/dataset/output/example.mph b/swh/graph/tests/dataset/compressed/example.mph
similarity index 80%
copy from swh/graph/tests/dataset/output/example.mph
copy to swh/graph/tests/dataset/compressed/example.mph
index c6f9e19..c1dbe5c 100644
Binary files a/swh/graph/tests/dataset/output/example.mph and b/swh/graph/tests/dataset/compressed/example.mph differ
diff --git a/swh/graph/tests/dataset/compressed/example.node2swhid.bin b/swh/graph/tests/dataset/compressed/example.node2swhid.bin
new file mode 100644
index 0000000..3775a83
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.node2swhid.bin differ
diff --git a/swh/graph/tests/dataset/output/example.node2type.map b/swh/graph/tests/dataset/compressed/example.node2type.map
similarity index 90%
rename from swh/graph/tests/dataset/output/example.node2type.map
rename to swh/graph/tests/dataset/compressed/example.node2type.map
index 6b91c37..a77c1b3 100644
Binary files a/swh/graph/tests/dataset/output/example.node2type.map and b/swh/graph/tests/dataset/compressed/example.node2type.map differ
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.count.txt b/swh/graph/tests/dataset/compressed/example.nodes.count.txt
new file mode 100644
index 0000000..aabe6ec
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.nodes.count.txt
@@ -0,0 +1 @@
+21
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.csv.zst b/swh/graph/tests/dataset/compressed/example.nodes.csv.zst
new file mode 100644
index 0000000..0559f37
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/compressed/example.nodes.stats.txt b/swh/graph/tests/dataset/compressed/example.nodes.stats.txt
new file mode 100644
index 0000000..097e698
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.nodes.stats.txt
@@ -0,0 +1,6 @@
+cnt 7
+dir 6
+ori 1
+rel 2
+rev 4
+snp 1
diff --git a/swh/graph/tests/dataset/output/example.obl b/swh/graph/tests/dataset/compressed/example.obl
similarity index 61%
rename from swh/graph/tests/dataset/output/example.obl
rename to swh/graph/tests/dataset/compressed/example.obl
index 1b4fd2e..f6128c6 100644
Binary files a/swh/graph/tests/dataset/output/example.obl and b/swh/graph/tests/dataset/compressed/example.obl differ
diff --git a/swh/graph/tests/dataset/compressed/example.offsets b/swh/graph/tests/dataset/compressed/example.offsets
new file mode 100644
index 0000000..b0c1eb4
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.offsets
@@ -0,0 +1 @@
+CPcR(
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/compressed/example.order b/swh/graph/tests/dataset/compressed/example.order
new file mode 100644
index 0000000..4f9078b
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.order differ
diff --git a/swh/graph/tests/dataset/output/example.outdegree b/swh/graph/tests/dataset/compressed/example.outdegree
similarity index 100%
rename from swh/graph/tests/dataset/output/example.outdegree
rename to swh/graph/tests/dataset/compressed/example.outdegree
diff --git a/swh/graph/tests/dataset/compressed/example.persons.count.txt b/swh/graph/tests/dataset/compressed/example.persons.count.txt
new file mode 100644
index 0000000..00750ed
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.persons.count.txt
@@ -0,0 +1 @@
+3
diff --git a/swh/graph/tests/dataset/compressed/example.persons.csv.zst b/swh/graph/tests/dataset/compressed/example.persons.csv.zst
new file mode 100644
index 0000000..0da9b20
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.persons.csv.zst differ
diff --git a/swh/graph/tests/dataset/output/example.mph b/swh/graph/tests/dataset/compressed/example.persons.mph
similarity index 73%
rename from swh/graph/tests/dataset/output/example.mph
rename to swh/graph/tests/dataset/compressed/example.persons.mph
index c6f9e19..89de923 100644
Binary files a/swh/graph/tests/dataset/output/example.mph and b/swh/graph/tests/dataset/compressed/example.persons.mph differ
diff --git a/swh/graph/tests/dataset/output/example.properties b/swh/graph/tests/dataset/compressed/example.properties
similarity index 57%
rename from swh/graph/tests/dataset/output/example.properties
rename to swh/graph/tests/dataset/compressed/example.properties
index cb6975a..dbdffd2 100644
--- a/swh/graph/tests/dataset/output/example.properties
+++ b/swh/graph/tests/dataset/compressed/example.properties
@@ -1,35 +1,35 @@
#BVGraph properties
-#Sat Dec 04 01:37:26 CET 2021
+#Tue Mar 29 15:31:00 CEST 2022
bitsforreferences=14
avgbitsforintervals=0.667
graphclass=it.unimi.dsi.big.webgraph.BVGraph
avgdist=0
-successoravggap=7.391
-residualexpstats=7,7,3,3,2,1
+successoravggap=4.261
+residualexpstats=3,13,5,1,1
arcs=23
minintervallength=4
bitsforoutdegrees=51
-residualavgloggap=2.32668281341601
+residualavgloggap=2.0981034178845324
avgbitsforoutdegrees=2.429
-bitsforresiduals=111
-successoravgloggap=2.32668281341601
+bitsforresiduals=97
+successoravgloggap=2.0981034178845324
maxrefcount=3
-successorexpstats=7,7,3,3,2,1
+successorexpstats=3,13,5,1,1
residualarcs=23
-avgbitsforresiduals=5.286
+avgbitsforresiduals=4.619
avgbitsforblocks=0
windowsize=7
-residualavggap=7.391
+residualavggap=4.261
copiedarcs=0
avgbitsforreferences=0.667
version=0
-compratio=1.499
-bitsperlink=8.261
+compratio=1.388
+bitsperlink=7.652
compressionflags=
nodes=21
avgref=0
zetak=3
bitsforintervals=14
intervalisedarcs=0
-bitspernode=9.048
+bitspernode=8.381
bitsforblocks=0
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_id.bin b/swh/graph/tests/dataset/compressed/example.property.author_id.bin
new file mode 100644
index 0000000..a955a2e
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.author_id.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin b/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin
new file mode 100644
index 0000000..3f73379
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin b/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin
new file mode 100644
index 0000000..6411ad4
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_id.bin b/swh/graph/tests/dataset/compressed/example.property.committer_id.bin
new file mode 100644
index 0000000..5fb1646
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.committer_id.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin
new file mode 100644
index 0000000..889c7a6
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin
new file mode 100644
index 0000000..2bfbfcf
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin b/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin
new file mode 100644
index 0000000..32494ff
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.content.length.bin b/swh/graph/tests/dataset/compressed/example.property.content.length.bin
new file mode 100644
index 0000000..4309bf6
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.content.length.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.message.bin b/swh/graph/tests/dataset/compressed/example.property.message.bin
new file mode 100644
index 0000000..5d50ccf
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.property.message.bin
@@ -0,0 +1,7 @@
+VmVyc2lvbiAxLjA=
+VmVyc2lvbiAyLjA=
+SW5pdGlhbCBjb21taXQ=
+QWRkIHBhcnNlcg==
+QWRkIHRlc3Rz
+UmVmYWN0b3IgY29kZWJhc2U=
+aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg=
diff --git a/swh/graph/tests/dataset/compressed/example.property.message.offset.bin b/swh/graph/tests/dataset/compressed/example.property.message.offset.bin
new file mode 100644
index 0000000..50ab491
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.message.offset.bin differ
diff --git a/swh/graph/tests/dataset/compressed/example.property.tag_name.bin b/swh/graph/tests/dataset/compressed/example.property.tag_name.bin
new file mode 100644
index 0000000..ba37d43
--- /dev/null
+++ b/swh/graph/tests/dataset/compressed/example.property.tag_name.bin
@@ -0,0 +1,2 @@
+djEuMA==
+djIuMA==
diff --git a/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin b/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin
new file mode 100644
index 0000000..03fe768
Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin differ
diff --git a/swh/graph/tests/dataset/output/example.stats b/swh/graph/tests/dataset/compressed/example.stats
similarity index 68%
rename from swh/graph/tests/dataset/output/example.stats
rename to swh/graph/tests/dataset/compressed/example.stats
index a58d3e2..b5bddda 100644
--- a/swh/graph/tests/dataset/output/example.stats
+++ b/swh/graph/tests/dataset/compressed/example.stats
@@ -1,20 +1,20 @@
nodes=21
arcs=23
loops=0
-successoravggap=7.765
-avglocality=3.783
+successoravggap=4.000
+avglocality=2.522
minoutdegree=0
maxoutdegree=3
minoutdegreenode=1
-maxoutdegreenode=0
+maxoutdegreenode=13
dangling=7
terminal=7
percdangling=33.333333333333336
avgoutdegree=1.0952380952380953
-successorlogdeltastats=11,7,1,3,1
-successoravglogdelta=0.911
+successorlogdeltastats=13,6,2,2
+successoravglogdelta=0.794
minindegree=0
maxindegree=3
minindegreenode=17
maxindegreenode=18
avgindegree=1.0952380952380953
diff --git a/swh/graph/tests/dataset/edges/content/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/content/graph-all.edges.csv.zst
new file mode 100644
index 0000000..e58c09d
Binary files /dev/null and b/swh/graph/tests/dataset/edges/content/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/content/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/content/graph-all.nodes.csv.zst
new file mode 100644
index 0000000..779ad79
Binary files /dev/null and b/swh/graph/tests/dataset/edges/content/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/directory/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/directory/graph-all.edges.csv.zst
new file mode 100644
index 0000000..7b0e7e8
Binary files /dev/null and b/swh/graph/tests/dataset/edges/directory/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/directory/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/directory/graph-all.nodes.csv.zst
new file mode 100644
index 0000000..57ad7ac
Binary files /dev/null and b/swh/graph/tests/dataset/edges/directory/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst
new file mode 100644
index 0000000..11bf2e2
Binary files /dev/null and b/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst
new file mode 100644
index 0000000..850e058
Binary files /dev/null and b/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst
new file mode 100644
index 0000000..59b5b0e
Binary files /dev/null and b/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst
new file mode 100644
index 0000000..11bfce7
Binary files /dev/null and b/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/revision/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/revision/graph-all.edges.csv.zst
new file mode 100644
index 0000000..72f8cbb
Binary files /dev/null and b/swh/graph/tests/dataset/edges/revision/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/revision/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/revision/graph-all.nodes.csv.zst
new file mode 100644
index 0000000..56acc8b
Binary files /dev/null and b/swh/graph/tests/dataset/edges/revision/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst
new file mode 100644
index 0000000..97db59f
Binary files /dev/null and b/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst differ
diff --git a/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst
new file mode 100644
index 0000000..5cd8295
Binary files /dev/null and b/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst differ
diff --git a/swh/graph/tests/dataset/example.edges.csv b/swh/graph/tests/dataset/example.edges.csv
deleted file mode 100644
index a91c083..0000000
--- a/swh/graph/tests/dataset/example.edges.csv
+++ /dev/null
@@ -1,23 +0,0 @@
-swh:1:dir:0000000000000000000000000000000000000002 swh:1:cnt:0000000000000000000000000000000000000001
-swh:1:rev:0000000000000000000000000000000000000003 swh:1:dir:0000000000000000000000000000000000000002
-swh:1:dir:0000000000000000000000000000000000000008 swh:1:cnt:0000000000000000000000000000000000000001
-swh:1:dir:0000000000000000000000000000000000000008 swh:1:dir:0000000000000000000000000000000000000006
-swh:1:dir:0000000000000000000000000000000000000006 swh:1:cnt:0000000000000000000000000000000000000004
-swh:1:dir:0000000000000000000000000000000000000006 swh:1:cnt:0000000000000000000000000000000000000005
-swh:1:dir:0000000000000000000000000000000000000008 swh:1:cnt:0000000000000000000000000000000000000007
-swh:1:rev:0000000000000000000000000000000000000009 swh:1:dir:0000000000000000000000000000000000000008
-swh:1:rel:0000000000000000000000000000000000000010 swh:1:rev:0000000000000000000000000000000000000009
-swh:1:rev:0000000000000000000000000000000000000009 swh:1:rev:0000000000000000000000000000000000000003
-swh:1:dir:0000000000000000000000000000000000000012 swh:1:cnt:0000000000000000000000000000000000000011
-swh:1:dir:0000000000000000000000000000000000000012 swh:1:dir:0000000000000000000000000000000000000008
-swh:1:rev:0000000000000000000000000000000000000013 swh:1:dir:0000000000000000000000000000000000000012
-swh:1:rev:0000000000000000000000000000000000000013 swh:1:rev:0000000000000000000000000000000000000009
-swh:1:dir:0000000000000000000000000000000000000017 swh:1:cnt:0000000000000000000000000000000000000014
-swh:1:dir:0000000000000000000000000000000000000017 swh:1:dir:0000000000000000000000000000000000000016
-swh:1:dir:0000000000000000000000000000000000000016 swh:1:cnt:0000000000000000000000000000000000000015
-swh:1:rev:0000000000000000000000000000000000000018 swh:1:dir:0000000000000000000000000000000000000017
-swh:1:rev:0000000000000000000000000000000000000018 swh:1:rev:0000000000000000000000000000000000000013
-swh:1:rel:0000000000000000000000000000000000000019 swh:1:rev:0000000000000000000000000000000000000018
-swh:1:snp:0000000000000000000000000000000000000020 swh:1:rev:0000000000000000000000000000000000000009
-swh:1:snp:0000000000000000000000000000000000000020 swh:1:rel:0000000000000000000000000000000000000010
-swh:1:ori:0000000000000000000000000000000000000021 swh:1:snp:0000000000000000000000000000000000000020
diff --git a/swh/graph/tests/dataset/example.edges.csv.zst b/swh/graph/tests/dataset/example.edges.csv.zst
deleted file mode 100644
index 41b40c1..0000000
Binary files a/swh/graph/tests/dataset/example.edges.csv.zst and /dev/null differ
diff --git a/swh/graph/tests/dataset/example.nodes.csv b/swh/graph/tests/dataset/example.nodes.csv
deleted file mode 100644
index 4105555..0000000
--- a/swh/graph/tests/dataset/example.nodes.csv
+++ /dev/null
@@ -1,21 +0,0 @@
-swh:1:cnt:0000000000000000000000000000000000000001
-swh:1:cnt:0000000000000000000000000000000000000004
-swh:1:cnt:0000000000000000000000000000000000000005
-swh:1:cnt:0000000000000000000000000000000000000007
-swh:1:cnt:0000000000000000000000000000000000000011
-swh:1:cnt:0000000000000000000000000000000000000014
-swh:1:cnt:0000000000000000000000000000000000000015
-swh:1:dir:0000000000000000000000000000000000000002
-swh:1:dir:0000000000000000000000000000000000000006
-swh:1:dir:0000000000000000000000000000000000000008
-swh:1:dir:0000000000000000000000000000000000000012
-swh:1:dir:0000000000000000000000000000000000000016
-swh:1:dir:0000000000000000000000000000000000000017
-swh:1:ori:0000000000000000000000000000000000000021
-swh:1:rel:0000000000000000000000000000000000000010
-swh:1:rel:0000000000000000000000000000000000000019
-swh:1:rev:0000000000000000000000000000000000000003
-swh:1:rev:0000000000000000000000000000000000000009
-swh:1:rev:0000000000000000000000000000000000000013
-swh:1:rev:0000000000000000000000000000000000000018
-swh:1:snp:0000000000000000000000000000000000000020
diff --git a/swh/graph/tests/dataset/example.nodes.csv.zst b/swh/graph/tests/dataset/example.nodes.csv.zst
deleted file mode 100644
index 00cb5f4..0000000
Binary files a/swh/graph/tests/dataset/example.nodes.csv.zst and /dev/null differ
diff --git a/swh/graph/tests/dataset/generate_dataset.py b/swh/graph/tests/dataset/generate_dataset.py
new file mode 100755
index 0000000..5178397
--- /dev/null
+++ b/swh/graph/tests/dataset/generate_dataset.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import argparse
+import datetime
+import logging
+from pathlib import Path
+
+from swh.dataset.exporters.edges import GraphEdgesExporter
+from swh.dataset.exporters.orc import ORCExporter
+from swh.graph.webgraph import compress
+from swh.model.model import (
+ Content,
+ Directory,
+ DirectoryEntry,
+ ObjectType,
+ Origin,
+ OriginVisit,
+ OriginVisitStatus,
+ Person,
+ Release,
+ Revision,
+ RevisionType,
+ SkippedContent,
+ Snapshot,
+ SnapshotBranch,
+ TargetType,
+ Timestamp,
+ TimestampWithTimezone,
+)
+
+
+def h(id: int, width=40) -> bytes:
+ return bytes.fromhex(f"{id:0{width}}")
+
+
+PERSONS = [
+ Person(fullname=b"foo", name=b"foo", email=b""),
+ Person(fullname=b"bar", name=b"bar", email=b""),
+ Person(fullname=b"baz", name=b"baz", email=b""),
+]
+
+TEST_DATASET = [
+ Content(sha1_git=h(1), sha1=h(1), sha256=h(1, 64), blake2s256=h(1, 64), length=42),
+ Content(sha1_git=h(4), sha1=h(4), sha256=h(4, 64), blake2s256=h(4, 64), length=404),
+ Content(
+ sha1_git=h(5), sha1=h(5), sha256=h(5, 64), blake2s256=h(5, 64), length=1337
+ ),
+ Content(sha1_git=h(7), sha1=h(7), sha256=h(7, 64), blake2s256=h(7, 64), length=666),
+ Content(
+ sha1_git=h(11), sha1=h(11), sha256=h(11, 64), blake2s256=h(11, 64), length=313
+ ),
+ Content(
+ sha1_git=h(14), sha1=h(14), sha256=h(14, 64), blake2s256=h(14, 64), length=14
+ ),
+ SkippedContent(
+ sha1_git=h(15),
+ sha1=h(15),
+ sha256=h(15, 64),
+ blake2s256=h(15, 64),
+ length=404,
+ status="absent",
+ reason="Not found",
+ ),
+ Directory(
+ id=h(2),
+ entries=(
+ DirectoryEntry(name=b"README.md", perms=0o644, type="file", target=h(1),),
+ ),
+ ),
+ Directory(
+ id=h(6),
+ entries=(
+ DirectoryEntry(name=b"README.md", perms=0o644, type="file", target=h(4),),
+ DirectoryEntry(name=b"parser.c", perms=0o644, type="file", target=h(5),),
+ ),
+ ),
+ Directory(
+ id=h(8),
+ entries=(
+ DirectoryEntry(name=b"README.md", perms=0o644, type="file", target=h(1),),
+ DirectoryEntry(name=b"parser.c", perms=0o644, type="file", target=h(7),),
+ DirectoryEntry(name=b"tests", perms=0o755, type="dir", target=h(6),),
+ ),
+ ),
+ Directory(
+ id=h(12),
+ entries=(
+ DirectoryEntry(name=b"README.md", perms=0o644, type="file", target=h(11),),
+ DirectoryEntry(name=b"oldproject", perms=0o755, type="dir", target=h(8),),
+ ),
+ ),
+ Directory(
+ id=h(16),
+ entries=(
+ DirectoryEntry(name=b"TODO.txt", perms=0o644, type="file", target=h(15),),
+ ),
+ ),
+ Directory(
+ id=h(17),
+ entries=(
+ DirectoryEntry(name=b"TODO.txt", perms=0o644, type="file", target=h(14),),
+ DirectoryEntry(name=b"old", perms=0o755, type="dir", target=h(16),),
+ ),
+ ),
+ Revision(
+ id=h(3),
+ message=b"Initial commit",
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1111122220, microseconds=0,),
+ offset_bytes=b"+0200",
+ ),
+ committer=PERSONS[0],
+ author=PERSONS[0],
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1111122220, microseconds=0,),
+ offset_bytes=b"+0200",
+ ),
+ type=RevisionType.GIT,
+ directory=h(2),
+ synthetic=False,
+ metadata=None,
+ parents=(),
+ ),
+ Revision(
+ id=h(9),
+ message=b"Add parser",
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1111144440, microseconds=0,),
+ offset_bytes=b"+0200",
+ ),
+ committer=PERSONS[1],
+ author=PERSONS[1],
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1111155550, microseconds=0,),
+ offset_bytes=b"+0200",
+ ),
+ type=RevisionType.GIT,
+ directory=h(8),
+ synthetic=False,
+ metadata=None,
+ parents=(h(3),),
+ ),
+ Revision(
+ id=h(13),
+ message=b"Add tests",
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1111166660, microseconds=0,),
+ offset_bytes=b"+0200",
+ ),
+ committer=PERSONS[1],
+ author=PERSONS[0],
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1111166660, microseconds=0,),
+ offset_bytes=b"+0200",
+ ),
+ type=RevisionType.GIT,
+ directory=h(12),
+ synthetic=False,
+ metadata=None,
+ parents=(h(9),),
+ ),
+ Revision(
+ id=h(18),
+ message=b"Refactor codebase",
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1111177770, microseconds=0,),
+ offset_bytes=b"+0000",
+ ),
+ committer=PERSONS[0],
+ author=PERSONS[2],
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1111177770, microseconds=0,),
+ offset_bytes=b"+0000",
+ ),
+ type=RevisionType.GIT,
+ directory=h(17),
+ synthetic=False,
+ metadata=None,
+ parents=(h(13),),
+ ),
+ Release(
+ id=h(10),
+ name=b"v1.0",
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1234567890, microseconds=0,),
+ offset_bytes=b"+0200",
+ ),
+ author=PERSONS[0],
+ target_type=ObjectType.REVISION,
+ target=h(9),
+ message=b"Version 1.0",
+ synthetic=False,
+ ),
+ Release(
+ id=h(19),
+ name=b"v2.0",
+ date=None,
+ author=PERSONS[1],
+ target_type=ObjectType.REVISION,
+ target=h(18),
+ message=b"Version 2.0",
+ synthetic=False,
+ ),
+ Snapshot(
+ id=h(20),
+ branches={
+ b"refs/heads/master": SnapshotBranch(
+ target=h(9), target_type=TargetType.REVISION
+ ),
+ b"refs/tags/v1.0": SnapshotBranch(
+ target=h(10), target_type=TargetType.RELEASE
+ ),
+ },
+ ),
+ OriginVisit(
+ origin="https://example.com/swh/graph",
+ date=datetime.datetime(
+ 2013, 5, 7, 4, 20, 39, 369271, tzinfo=datetime.timezone.utc
+ ),
+ visit=1,
+ type="git",
+ ),
+ OriginVisitStatus(
+ origin="https://example.com/swh/graph",
+ date=datetime.datetime(
+ 2013, 5, 7, 4, 20, 41, 369271, tzinfo=datetime.timezone.utc
+ ),
+ visit=1,
+ type="git",
+ status="full",
+ snapshot=h(20),
+ metadata=None,
+ ),
+ Origin(url="https://example.com/swh/graph"),
+]
+
+
+def main():
+ logging.basicConfig(level=logging.INFO)
+
+ parser = argparse.ArgumentParser(description="Generate a test dataset")
+ parser.add_argument(
+ "--compress",
+ action="store_true",
+ default=False,
+ help="Also compress the dataset",
+ )
+ parser.add_argument("output", help="output directory", nargs="?", default=".")
+ args = parser.parse_args()
+
+ exporters = {"edges": GraphEdgesExporter, "orc": ORCExporter}
+ config = {"test_unique_file_id": "all"}
+ output_path = Path(args.output)
+ for name, exporter in exporters.items():
+ with exporter(config, output_path / name) as e:
+ for obj in TEST_DATASET:
+ e.process_object(obj.object_type, obj.to_dict())
+
+ if args.compress:
+ compress("example", output_path / "orc", output_path / "compressed")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/swh/graph/tests/dataset/generate_graph.sh b/swh/graph/tests/dataset/generate_graph.sh
deleted file mode 100755
index 9621cad..0000000
--- a/swh/graph/tests/dataset/generate_graph.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Clean previous run
-rm -rf docker/ output
-mkdir output
-
-# Build Docker work environment
-toplevel_dir=`git rev-parse --show-toplevel`
-mkdir -p docker
-cp -r $toplevel_dir/docker/ .
-docker build --tag swh-graph-test docker
-
-# Setup input for compression script
-tr ' ' '\n' < example.edges.csv | sort -u > example.nodes.csv
-zstd < example.nodes.csv > example.nodes.csv.zst
-zstd < example.edges.csv > example.edges.csv.zst
-
-docker run \
- --user $(id -u):$(id -g) \
- --name swh-graph-test --rm --tty --interactive \
- --volume $(pwd):/input --volume $(pwd)/output:/output \
- swh-graph-test:latest \
- swh graph compress --graph /input/example --outdir /output
diff --git a/swh/graph/tests/dataset/orc/content/graph-all.orc b/swh/graph/tests/dataset/orc/content/graph-all.orc
new file mode 100644
index 0000000..ae4b9eb
Binary files /dev/null and b/swh/graph/tests/dataset/orc/content/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/directory/graph-all.orc b/swh/graph/tests/dataset/orc/directory/graph-all.orc
new file mode 100644
index 0000000..6112766
Binary files /dev/null and b/swh/graph/tests/dataset/orc/directory/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/directory_entry/graph-all.orc b/swh/graph/tests/dataset/orc/directory_entry/graph-all.orc
new file mode 100644
index 0000000..9a243ff
Binary files /dev/null and b/swh/graph/tests/dataset/orc/directory_entry/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin/graph-all.orc b/swh/graph/tests/dataset/orc/origin/graph-all.orc
new file mode 100644
index 0000000..b99c839
Binary files /dev/null and b/swh/graph/tests/dataset/orc/origin/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin_visit/graph-all.orc b/swh/graph/tests/dataset/orc/origin_visit/graph-all.orc
new file mode 100644
index 0000000..171b8fe
Binary files /dev/null and b/swh/graph/tests/dataset/orc/origin_visit/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/origin_visit_status/graph-all.orc b/swh/graph/tests/dataset/orc/origin_visit_status/graph-all.orc
new file mode 100644
index 0000000..31ae0bf
Binary files /dev/null and b/swh/graph/tests/dataset/orc/origin_visit_status/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/release/graph-all.orc b/swh/graph/tests/dataset/orc/release/graph-all.orc
new file mode 100644
index 0000000..406e6b0
Binary files /dev/null and b/swh/graph/tests/dataset/orc/release/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/revision/graph-all.orc b/swh/graph/tests/dataset/orc/revision/graph-all.orc
new file mode 100644
index 0000000..6cadc1f
Binary files /dev/null and b/swh/graph/tests/dataset/orc/revision/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/revision_history/graph-all.orc b/swh/graph/tests/dataset/orc/revision_history/graph-all.orc
new file mode 100644
index 0000000..6fe8cab
Binary files /dev/null and b/swh/graph/tests/dataset/orc/revision_history/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/skipped_content/graph-all.orc b/swh/graph/tests/dataset/orc/skipped_content/graph-all.orc
new file mode 100644
index 0000000..ec7b08c
Binary files /dev/null and b/swh/graph/tests/dataset/orc/skipped_content/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/snapshot/graph-all.orc b/swh/graph/tests/dataset/orc/snapshot/graph-all.orc
new file mode 100644
index 0000000..042c41c
Binary files /dev/null and b/swh/graph/tests/dataset/orc/snapshot/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/orc/snapshot_branch/graph-all.orc b/swh/graph/tests/dataset/orc/snapshot_branch/graph-all.orc
new file mode 100644
index 0000000..3a9082f
Binary files /dev/null and b/swh/graph/tests/dataset/orc/snapshot_branch/graph-all.orc differ
diff --git a/swh/graph/tests/dataset/output/example-transposed.graph b/swh/graph/tests/dataset/output/example-transposed.graph
deleted file mode 100644
index ad5756e..0000000
--- a/swh/graph/tests/dataset/output/example-transposed.graph
+++ /dev/null
@@ -1 +0,0 @@
-z.hѮIt{
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/output/example-transposed.offsets b/swh/graph/tests/dataset/output/example-transposed.offsets
deleted file mode 100644
index 92c2947..0000000
--- a/swh/graph/tests/dataset/output/example-transposed.offsets
+++ /dev/null
@@ -1,2 +0,0 @@
-
- RqG4PTP(
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/output/example-transposed.properties b/swh/graph/tests/dataset/output/example-transposed.properties
deleted file mode 100644
index 512ce9d..0000000
--- a/swh/graph/tests/dataset/output/example-transposed.properties
+++ /dev/null
@@ -1,35 +0,0 @@
-#BVGraph properties
-#Sat Dec 04 01:37:28 CET 2021
-bitsforreferences=31
-avgbitsforintervals=0.714
-graphclass=it.unimi.dsi.big.webgraph.BVGraph
-avgdist=0.571
-successoravggap=6.478
-residualexpstats=6,6,2,2,2
-arcs=23
-minintervallength=4
-bitsforoutdegrees=61
-residualavgloggap=2.1534522798004265
-avgbitsforoutdegrees=2.905
-bitsforresiduals=85
-successoravgloggap=2.3226776741991215
-maxrefcount=3
-successorexpstats=7,6,4,3,3
-residualarcs=18
-avgbitsforresiduals=4.048
-avgbitsforblocks=0.238
-windowsize=7
-residualavggap=5.667
-copiedarcs=5
-avgbitsforreferences=1.476
-version=0
-compratio=1.554
-bitsperlink=8.565
-compressionflags=
-nodes=21
-avgref=0.238
-zetak=3
-bitsforintervals=15
-intervalisedarcs=0
-bitspernode=9.381
-bitsforblocks=5
diff --git a/swh/graph/tests/dataset/output/example.graph b/swh/graph/tests/dataset/output/example.graph
deleted file mode 100644
index 621b9b7..0000000
--- a/swh/graph/tests/dataset/output/example.graph
+++ /dev/null
@@ -1 +0,0 @@
-'t}UOGϹ]ް].dP}R
\ No newline at end of file
diff --git a/swh/graph/tests/dataset/output/example.node2swhid.bin b/swh/graph/tests/dataset/output/example.node2swhid.bin
deleted file mode 100644
index 9cc50b2..0000000
Binary files a/swh/graph/tests/dataset/output/example.node2swhid.bin and /dev/null differ
diff --git a/swh/graph/tests/dataset/output/example.offsets b/swh/graph/tests/dataset/output/example.offsets
deleted file mode 100644
index 407e1a6..0000000
--- a/swh/graph/tests/dataset/output/example.offsets
+++ /dev/null
@@ -1 +0,0 @@
-BU!B
diff --git a/swh/graph/tests/dataset/output/example.order b/swh/graph/tests/dataset/output/example.order
deleted file mode 100644
index 2cb5540..0000000
Binary files a/swh/graph/tests/dataset/output/example.order and /dev/null differ
diff --git a/swh/graph/tests/test_api_client.py b/swh/graph/tests/test_api_client.py
index 79b4d86..4dc4036 100644
--- a/swh/graph/tests/test_api_client.py
+++ b/swh/graph/tests/test_api_client.py
@@ -1,379 +1,381 @@
+import hashlib
+
import pytest
from pytest import raises
from swh.core.api import RemoteException
from swh.graph.client import GraphArgumentException
+TEST_ORIGIN_ID = "swh:1:ori:{}".format(
+ hashlib.sha1(b"https://example.com/swh/graph").hexdigest()
+)
+
def test_stats(graph_client):
stats = graph_client.stats()
assert set(stats.keys()) == {"counts", "ratios", "indegree", "outdegree"}
assert set(stats["counts"].keys()) == {"nodes", "edges"}
assert set(stats["ratios"].keys()) == {
"compression",
"bits_per_node",
"bits_per_edge",
"avg_locality",
}
assert set(stats["indegree"].keys()) == {"min", "max", "avg"}
assert set(stats["outdegree"].keys()) == {"min", "max", "avg"}
assert stats["counts"]["nodes"] == 21
assert stats["counts"]["edges"] == 23
assert isinstance(stats["ratios"]["compression"], float)
assert isinstance(stats["ratios"]["bits_per_node"], float)
assert isinstance(stats["ratios"]["bits_per_edge"], float)
assert isinstance(stats["ratios"]["avg_locality"], float)
assert stats["indegree"]["min"] == 0
assert stats["indegree"]["max"] == 3
assert isinstance(stats["indegree"]["avg"], float)
assert stats["outdegree"]["min"] == 0
assert stats["outdegree"]["max"] == 3
assert isinstance(stats["outdegree"]["avg"], float)
def test_leaves(graph_client):
- actual = list(
- graph_client.leaves("swh:1:ori:0000000000000000000000000000000000000021")
- )
+ actual = list(graph_client.leaves(TEST_ORIGIN_ID))
expected = [
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
"swh:1:cnt:0000000000000000000000000000000000000007",
]
assert set(actual) == set(expected)
def test_neighbors(graph_client):
actual = list(
graph_client.neighbors(
"swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
)
)
expected = [
"swh:1:snp:0000000000000000000000000000000000000020",
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000013",
]
assert set(actual) == set(expected)
def test_visit_nodes(graph_client):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010",
edges="rel:rev,rev:rev",
)
)
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
]
assert set(actual) == set(expected)
def test_visit_nodes_filtered(graph_client):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010", return_types="dir",
)
)
expected = [
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
]
assert set(actual) == set(expected)
def test_visit_nodes_filtered_star(graph_client):
actual = list(
graph_client.visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010", return_types="*",
)
)
expected = [
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001",
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007",
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004",
"swh:1:cnt:0000000000000000000000000000000000000005",
]
assert set(actual) == set(expected)
def test_visit_edges(graph_client):
actual = list(
graph_client.visit_edges(
"swh:1:rel:0000000000000000000000000000000000000010",
edges="rel:rev,rev:rev,rev:dir",
)
)
expected = [
(
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
),
(
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
),
]
assert set(actual) == set(expected)
def test_visit_edges_limited(graph_client):
actual = list(
graph_client.visit_edges(
"swh:1:rel:0000000000000000000000000000000000000010",
max_edges=4,
edges="rel:rev,rev:rev,rev:dir",
)
)
expected = [
(
"swh:1:rel:0000000000000000000000000000000000000010",
"swh:1:rev:0000000000000000000000000000000000000009",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
),
(
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
),
]
# As there are four valid answers (up to reordering), we cannot check for
# equality. Instead, we check the client returned all edges but one.
assert set(actual).issubset(set(expected))
assert len(actual) == 3
def test_visit_edges_diamond_pattern(graph_client):
actual = list(
graph_client.visit_edges(
"swh:1:rev:0000000000000000000000000000000000000009", edges="*",
)
)
expected = [
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:rev:0000000000000000000000000000000000000003",
),
(
"swh:1:rev:0000000000000000000000000000000000000009",
"swh:1:dir:0000000000000000000000000000000000000008",
),
(
"swh:1:rev:0000000000000000000000000000000000000003",
"swh:1:dir:0000000000000000000000000000000000000002",
),
(
"swh:1:dir:0000000000000000000000000000000000000002",
"swh:1:cnt:0000000000000000000000000000000000000001",
),
(
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000001",
),
(
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:cnt:0000000000000000000000000000000000000007",
),
(
"swh:1:dir:0000000000000000000000000000000000000008",
"swh:1:dir:0000000000000000000000000000000000000006",
),
(
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000004",
),
(
"swh:1:dir:0000000000000000000000000000000000000006",
"swh:1:cnt:0000000000000000000000000000000000000005",
),
]
assert set(actual) == set(expected)
@pytest.mark.skip(reason="currently disabled due to T1969")
def test_walk(graph_client):
args = ("swh:1:dir:0000000000000000000000000000000000000016", "rel")
kwargs = {
"edges": "dir:dir,dir:rev,rev:*",
"direction": "backward",
"traversal": "bfs",
}
actual = list(graph_client.walk(*args, **kwargs))
expected = [
"swh:1:dir:0000000000000000000000000000000000000016",
"swh:1:dir:0000000000000000000000000000000000000017",
"swh:1:rev:0000000000000000000000000000000000000018",
"swh:1:rel:0000000000000000000000000000000000000019",
]
assert set(actual) == set(expected)
kwargs2 = kwargs.copy()
kwargs2["limit"] = -1
actual = list(graph_client.walk(*args, **kwargs2))
expected = ["swh:1:rel:0000000000000000000000000000000000000019"]
assert set(actual) == set(expected)
kwargs2 = kwargs.copy()
kwargs2["limit"] = 2
actual = list(graph_client.walk(*args, **kwargs2))
expected = [
"swh:1:dir:0000000000000000000000000000000000000016",
"swh:1:dir:0000000000000000000000000000000000000017",
]
assert set(actual) == set(expected)
def test_random_walk_dst_is_type(graph_client):
"""as the walk is random, we test a visit from a cnt node to a release
reachable from every single path in the backward graph, and only check the
final node of the path (i.e., the release)
"""
args = ("swh:1:cnt:0000000000000000000000000000000000000015", "rel")
kwargs = {"direction": "backward"}
expected_root = "swh:1:rel:0000000000000000000000000000000000000019"
actual = list(graph_client.random_walk(*args, **kwargs))
assert len(actual) > 1 # no release directly links to a content
assert actual[0] == args[0]
assert actual[-1] == expected_root
kwargs2 = kwargs.copy()
kwargs2["limit"] = -1
actual = list(graph_client.random_walk(*args, **kwargs2))
assert actual == [expected_root]
kwargs2["limit"] = -2
actual = list(graph_client.random_walk(*args, **kwargs2))
assert len(actual) == 2
assert actual[-1] == expected_root
kwargs2["limit"] = 3
actual = list(graph_client.random_walk(*args, **kwargs2))
assert len(actual) == 3
def test_random_walk_dst_is_node(graph_client):
"""Same as test_random_walk_dst_is_type, but we target the specific release
node instead of a type
"""
args = (
"swh:1:cnt:0000000000000000000000000000000000000015",
"swh:1:rel:0000000000000000000000000000000000000019",
)
kwargs = {"direction": "backward"}
expected_root = "swh:1:rel:0000000000000000000000000000000000000019"
actual = list(graph_client.random_walk(*args, **kwargs))
assert len(actual) > 1 # no origin directly links to a content
assert actual[0] == args[0]
assert actual[-1] == expected_root
kwargs2 = kwargs.copy()
kwargs2["limit"] = -1
actual = list(graph_client.random_walk(*args, **kwargs2))
assert actual == [expected_root]
kwargs2["limit"] = -2
actual = list(graph_client.random_walk(*args, **kwargs2))
assert len(actual) == 2
assert actual[-1] == expected_root
kwargs2["limit"] = 3
actual = list(graph_client.random_walk(*args, **kwargs2))
assert len(actual) == 3
def test_count(graph_client):
- actual = graph_client.count_leaves(
- "swh:1:ori:0000000000000000000000000000000000000021"
- )
+ actual = graph_client.count_leaves(TEST_ORIGIN_ID)
assert actual == 4
actual = graph_client.count_visit_nodes(
"swh:1:rel:0000000000000000000000000000000000000010", edges="rel:rev,rev:rev"
)
assert actual == 3
actual = graph_client.count_neighbors(
"swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
)
assert actual == 3
def test_param_validation(graph_client):
with raises(GraphArgumentException) as exc_info: # SWHID not found
- list(graph_client.leaves("swh:1:ori:fff0000000000000000000000000000000000021"))
+ list(graph_client.leaves("swh:1:rel:00ffffffff000000000000000000000000000010"))
if exc_info.value.response:
assert exc_info.value.response.status_code == 404
with raises(GraphArgumentException) as exc_info: # malformed SWHID
list(
- graph_client.neighbors("swh:1:ori:fff000000zzzzzz0000000000000000000000021")
+ graph_client.neighbors("swh:1:rel:00ffffffff00000000zzzzzzz000000000000010")
)
if exc_info.value.response:
assert exc_info.value.response.status_code == 400
with raises(GraphArgumentException) as exc_info: # malformed edge specificaiton
list(
graph_client.visit_nodes(
"swh:1:dir:0000000000000000000000000000000000000016",
edges="dir:notanodetype,dir:rev,rev:*",
direction="backward",
)
)
if exc_info.value.response:
assert exc_info.value.response.status_code == 400
with raises(GraphArgumentException) as exc_info: # malformed direction
list(
graph_client.visit_nodes(
"swh:1:dir:0000000000000000000000000000000000000016",
edges="dir:dir,dir:rev,rev:*",
direction="notadirection",
)
)
if exc_info.value.response:
assert exc_info.value.response.status_code == 400
@pytest.mark.skip(reason="currently disabled due to T1969")
def test_param_validation_walk(graph_client):
"""test validation of walk-specific parameters only"""
with raises(RemoteException) as exc_info: # malformed traversal order
list(
graph_client.walk(
"swh:1:dir:0000000000000000000000000000000000000016",
"rel",
edges="dir:dir,dir:rev,rev:*",
direction="backward",
traversal="notatraversalorder",
)
)
assert exc_info.value.response.status_code == 400
diff --git a/swh/graph/tests/test_cli.py b/swh/graph/tests/test_cli.py
index fc066b0..eceb164 100644
--- a/swh/graph/tests/test_cli.py
+++ b/swh/graph/tests/test_cli.py
@@ -1,56 +1,58 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Dict
from click.testing import CliRunner
import yaml
from swh.graph.cli import graph_cli_group
DATA_DIR = Path(__file__).parents[0] / "dataset"
def read_properties(properties_fname) -> Dict[str, str]:
"""read a Java .properties file"""
with open(properties_fname) as f:
keyvalues = (
line.split("=", maxsplit=1)
for line in f
if not line.strip().startswith("#")
)
return dict((k.strip(), v.strip()) for (k, v) in keyvalues)
def test_pipeline():
"""run full compression pipeline"""
# bare bone configuration, to allow testing the compression pipeline
# with minimum RAM requirements on trivial graphs
config = {"graph": {"compress": {"batch_size": 1000}}}
runner = CliRunner()
with TemporaryDirectory(suffix=".swh-graph-test") as tmpdir:
config_path = Path(tmpdir, "config.yml")
config_path.write_text(yaml.dump(config))
result = runner.invoke(
graph_cli_group,
[
"--config-file",
config_path,
"compress",
- "--graph",
- DATA_DIR / "example",
- "--outdir",
+ "--input-dataset",
+ DATA_DIR / "orc",
+ "--output-directory",
tmpdir,
+ "--graph-name",
+ "example",
],
)
assert result.exit_code == 0, result
properties = read_properties(Path(tmpdir) / "example.properties")
assert int(properties["nodes"]) == 21
assert int(properties["arcs"]) == 23
diff --git a/swh/graph/webgraph.py b/swh/graph/webgraph.py
index 26b8e34..80bcbe6 100644
--- a/swh/graph/webgraph.py
+++ b/swh/graph/webgraph.py
@@ -1,280 +1,350 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""WebGraph driver
"""
from datetime import datetime
from enum import Enum
import logging
import os
from pathlib import Path
import subprocess
from typing import Dict, List, Set
from swh.graph.config import check_config_compress
class CompressionStep(Enum):
- MPH = 1
- BV = 2
- BFS = 3
- PERMUTE_BFS = 4
- TRANSPOSE_BFS = 5
- SIMPLIFY = 6
- LLP = 7
- PERMUTE_LLP = 8
- OBL = 9
- COMPOSE_ORDERS = 10
- STATS = 11
- TRANSPOSE = 12
- TRANSPOSE_OBL = 13
- MAPS = 14
- CLEAN_TMP = 15
+ EXTRACT_NODES = 1
+ MPH = 2
+ BV = 3
+ BFS = 4
+ PERMUTE_BFS = 5
+ TRANSPOSE_BFS = 6
+ SIMPLIFY = 7
+ LLP = 8
+ PERMUTE_LLP = 9
+ OBL = 10
+ COMPOSE_ORDERS = 11
+ STATS = 12
+ TRANSPOSE = 13
+ TRANSPOSE_OBL = 14
+ MAPS = 15
+ EXTRACT_PERSONS = 16
+ MPH_PERSONS = 17
+ NODE_PROPERTIES = 18
+ MPH_LABELS = 19
+ FCL_LABELS = 20
+ EDGE_LABELS = 21
+ CLEAN_TMP = 22
def __str__(self):
return self.name
# full compression pipeline
COMP_SEQ = list(CompressionStep)
# Mapping from compression steps to shell commands implementing them. Commands
# will be executed by the shell, so be careful with meta characters. They are
# specified here as lists of tokens that will be joined together only for ease
# of line splitting. In commands, {tokens} will be interpolated with
# configuration values, see :func:`compress`.
STEP_ARGV: Dict[CompressionStep, List[str]] = {
+ CompressionStep.EXTRACT_NODES: [
+ "{java}",
+ "org.softwareheritage.graph.compress.ExtractNodes",
+ "--format",
+ "orc",
+ "--temp-dir",
+ "{tmp_dir}",
+ "{in_dir}",
+ "{out_dir}/{graph_name}",
+ ],
CompressionStep.MPH: [
"{java}",
"it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction",
"--byte-array",
"--temp-dir",
"{tmp_dir}",
+ "--decompressor",
+ "com.github.luben.zstd.ZstdInputStream",
"{out_dir}/{graph_name}.mph",
- "<( zstdcat {in_dir}/{graph_name}.nodes.csv.zst )",
+ "{out_dir}/{graph_name}.nodes.csv.zst",
],
- # use process substitution (and hence FIFO) above as MPH class load the
- # entire file in memory when reading from stdin
+ # Note: the MPH step above no longer uses process substitution; the
+ # compressed node list is passed as a regular file and decompressed by
+ # the MPH class itself via the --decompressor option.
CompressionStep.BV: [
- "zstdcat",
- "{in_dir}/{graph_name}.edges.csv.zst",
+ "{java}",
+ "org.softwareheritage.graph.compress.ORCGraphDataset",
+ "{in_dir}",
"|",
"cut -d' ' -f1,2",
"|",
"{java}",
"it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph",
"--byte-array",
"--temp-dir",
"{tmp_dir}",
"--function",
"{out_dir}/{graph_name}.mph",
"{out_dir}/{graph_name}-base",
],
CompressionStep.BFS: [
"{java}",
"it.unimi.dsi.law.big.graph.BFS",
"{out_dir}/{graph_name}-base",
"{out_dir}/{graph_name}-bfs.order",
],
CompressionStep.PERMUTE_BFS: [
"{java}",
"it.unimi.dsi.big.webgraph.Transform",
"mapOffline",
"{out_dir}/{graph_name}-base",
"{out_dir}/{graph_name}-bfs",
"{out_dir}/{graph_name}-bfs.order",
"{batch_size}",
"{tmp_dir}",
],
CompressionStep.TRANSPOSE_BFS: [
"{java}",
"it.unimi.dsi.big.webgraph.Transform",
"transposeOffline",
"{out_dir}/{graph_name}-bfs",
"{out_dir}/{graph_name}-bfs-transposed",
"{batch_size}",
"{tmp_dir}",
],
CompressionStep.SIMPLIFY: [
"{java}",
"it.unimi.dsi.big.webgraph.Transform",
"simplify",
"{out_dir}/{graph_name}-bfs",
"{out_dir}/{graph_name}-bfs-transposed",
"{out_dir}/{graph_name}-bfs-simplified",
],
CompressionStep.LLP: [
"{java}",
"it.unimi.dsi.law.big.graph.LayeredLabelPropagation",
"-g",
"{llp_gammas}",
"{out_dir}/{graph_name}-bfs-simplified",
"{out_dir}/{graph_name}-llp.order",
],
CompressionStep.PERMUTE_LLP: [
"{java}",
"it.unimi.dsi.big.webgraph.Transform",
"mapOffline",
"{out_dir}/{graph_name}-bfs",
"{out_dir}/{graph_name}",
"{out_dir}/{graph_name}-llp.order",
"{batch_size}",
"{tmp_dir}",
],
CompressionStep.OBL: [
"{java}",
"it.unimi.dsi.big.webgraph.BVGraph",
"--list",
"{out_dir}/{graph_name}",
],
CompressionStep.COMPOSE_ORDERS: [
"{java}",
"org.softwareheritage.graph.compress.ComposePermutations",
"{out_dir}/{graph_name}-bfs.order",
"{out_dir}/{graph_name}-llp.order",
"{out_dir}/{graph_name}.order",
],
CompressionStep.STATS: [
"{java}",
"it.unimi.dsi.big.webgraph.Stats",
"{out_dir}/{graph_name}",
],
CompressionStep.TRANSPOSE: [
"{java}",
"it.unimi.dsi.big.webgraph.Transform",
"transposeOffline",
"{out_dir}/{graph_name}",
"{out_dir}/{graph_name}-transposed",
"{batch_size}",
"{tmp_dir}",
],
CompressionStep.TRANSPOSE_OBL: [
"{java}",
"it.unimi.dsi.big.webgraph.BVGraph",
"--list",
"{out_dir}/{graph_name}-transposed",
],
CompressionStep.MAPS: [
- "zstdcat",
- "{in_dir}/{graph_name}.nodes.csv.zst",
- "|",
"{java}",
"org.softwareheritage.graph.compress.NodeMapBuilder",
"{out_dir}/{graph_name}",
"{tmp_dir}",
+ "< {out_dir}/{graph_name}.nodes.csv.zst",
+ ],
+ CompressionStep.MPH_PERSONS: [
+ "{java}",
+ "it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction",
+ "--byte-array",
+ "--decompressor",
+ "com.github.luben.zstd.ZstdInputStream",
+ "--temp-dir",
+ "{tmp_dir}",
+ "{out_dir}/{graph_name}.persons.mph",
+ "{out_dir}/{graph_name}.persons.csv.zst",
+ ],
+ CompressionStep.EXTRACT_PERSONS: [
+ "{java}",
+ "org.softwareheritage.graph.compress.ExtractPersons",
+ "--temp-dir",
+ "{tmp_dir}",
+ "{in_dir}",
+ "{out_dir}/{graph_name}",
+ ],
+ CompressionStep.NODE_PROPERTIES: [
+ "{java}",
+ "org.softwareheritage.graph.compress.WriteNodeProperties",
+ "{in_dir}",
+ "{out_dir}/{graph_name}",
+ ],
+ CompressionStep.MPH_LABELS: [
+ "{java}",
+ "it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction",
+ "--byte-array",
+ "--temp-dir",
+ "{tmp_dir}",
+ "--decompressor",
+ "com.github.luben.zstd.ZstdInputStream",
+ "{out_dir}/{graph_name}.labels.mph",
+ "{out_dir}/{graph_name}.labels.csv.zst",
+ ],
+ CompressionStep.FCL_LABELS: [
+ "{java}",
+ "it.unimi.dsi.big.util.MappedFrontCodedStringBigList",
+ "--decompressor",
+ "com.github.luben.zstd.ZstdInputStream",
+ "{out_dir}/{graph_name}.labels.fcl",
+ "< {out_dir}/{graph_name}.labels.csv.zst",
+ ],
+ CompressionStep.EDGE_LABELS: [
+ "{java}",
+ "org.softwareheritage.graph.compress.LabelMapBuilder",
+ "--temp-dir",
+ "{tmp_dir}",
+ "{in_dir}",
+ "{out_dir}/{graph_name}",
],
CompressionStep.CLEAN_TMP: [
"rm",
"-rf",
"{out_dir}/{graph_name}-base.graph",
"{out_dir}/{graph_name}-base.offsets",
"{out_dir}/{graph_name}-base.properties",
"{out_dir}/{graph_name}-bfs-simplified.graph",
"{out_dir}/{graph_name}-bfs-simplified.offsets",
"{out_dir}/{graph_name}-bfs-simplified.properties",
"{out_dir}/{graph_name}-bfs-transposed.graph",
"{out_dir}/{graph_name}-bfs-transposed.offsets",
"{out_dir}/{graph_name}-bfs-transposed.properties",
"{out_dir}/{graph_name}-bfs.graph",
"{out_dir}/{graph_name}-bfs.offsets",
"{out_dir}/{graph_name}-bfs.order",
"{out_dir}/{graph_name}-bfs.properties",
"{out_dir}/{graph_name}-llp.order",
"{tmp_dir}",
],
}
def do_step(step, conf):
    """Run a single compression step as a shell command.

    The command template for ``step`` is looked up in ``STEP_ARGV``,
    interpolated with the values of ``conf``, and executed through
    ``/bin/bash`` so that shell features used by the templates (pipes,
    redirections) work. The command's combined stdout/stderr is forwarded
    line by line to the logging subsystem.

    Args:
        step: the CompressionStep to execute
        conf: configuration dict used to interpolate the command template;
            must also provide the "java_tool_options" and "classpath" keys,
            which are exported to the subprocess environment

    Returns:
        the command's exit code (necessarily 0)

    Raises:
        RuntimeError: if the command exits with a non-zero status
    """
    command = " ".join(STEP_ARGV[step]).format(**conf)

    # Run under the caller's environment, augmented with the Java settings.
    environment = os.environ.copy()
    environment["JAVA_TOOL_OPTIONS"] = conf["java_tool_options"]
    environment["CLASSPATH"] = conf["classpath"]

    logging.info(f"running: {command}")
    process = subprocess.Popen(
        ["/bin/bash", "-c", command],
        env=environment,
        encoding="utf8",
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    # Stream the subprocess output into our logs as it is produced.
    with process.stdout as output:
        for output_line in output:
            logging.info(output_line.rstrip())

    returncode = process.wait()
    if returncode != 0:
        raise RuntimeError(
            f"compression step {step} returned non-zero " f"exit code {returncode}"
        )
    return returncode
def compress(
    graph_name: str,
    in_dir: Path,
    out_dir: Path,
    steps: Set[CompressionStep] = set(COMP_SEQ),
    conf: Dict[str, str] = {},
):
    """Drive the graph compression pipeline, from the exported dataset to
    the compressed on-disk representation.

    Args:
        graph_name: graph base name, relative to in_dir
        in_dir: input directory, where the uncompressed graph can be found
        out_dir: output directory, where the compressed graph will be stored
        steps: compression steps to run; an empty/unset value means all steps
        conf: compression configuration; every key is optional, so an empty
            dict is valid (and the default). Supported keys:

            - batch_size: batch size for WebGraph transformations;
              defaults to 1 billion
            - classpath: java classpath, defaults to swh-graph JAR only
            - java: command to run java VM, defaults to "java"
            - java_tool_options: value for the JAVA_TOOL_OPTIONS environment
              variable; defaults to settings suited to high memory machines
            - logback: path to a logback.xml configuration file; if not
              provided a temporary one will be created and used
            - max_ram: maximum RAM to use for compression; defaults to
              available virtual memory
            - tmp_dir: temporary directory, defaults to the "tmp" subdir of
              out_dir
    """
    if not steps:
        steps = set(COMP_SEQ)

    # Normalize/validate the configuration before running anything.
    conf = check_config_compress(conf, graph_name, in_dir, out_dir)

    started_at = datetime.now()
    logging.info(f"starting compression at {started_at}")
    completed = 0
    # Iterate in pipeline order; `steps` only filters which ones run.
    for step in COMP_SEQ:
        if step not in steps:
            logging.debug(f"skipping compression step {step}")
            continue
        completed += 1
        step_started_at = datetime.now()
        logging.info(
            f"starting compression step {step} "
            f"({completed}/{len(steps)}) at {step_started_at}"
        )
        do_step(step, conf)
        step_ended_at = datetime.now()
        logging.info(
            f"completed compression step {step} "
            f"({completed}/{len(steps)}) "
            f"at {step_ended_at} in {step_ended_at - step_started_at}"
        )
    ended_at = datetime.now()
    logging.info(f"completed compression in {ended_at - started_at}")