diff --git a/java/server/pom.xml b/java/server/pom.xml
index 8d70888..2fed083 100644
--- a/java/server/pom.xml
+++ b/java/server/pom.xml
@@ -1,209 +1,214 @@
4.0.0
org.softwareheritage.graph
swh-graph
${git.closest.tag.name}
swh-graph
https://forge.softwareheritage.org/source/swh-graph/
UTF-8
11
ch.qos.logback
logback-classic
1.2.3
junit
junit
4.11
test
org.hamcrest
hamcrest
2.1
test
io.javalin
javalin
3.0.0
org.slf4j
slf4j-simple
1.7.26
com.fasterxml.jackson.core
jackson-databind
2.9.8
it.unimi.dsi
webgraph-big
3.5.1
it.unimi.dsi
fastutil
8.3.0
it.unimi.dsi
law-library
2.6.0
org.apache.hadoop
hadoop-common
org.umlgraph
umlgraph
org.eclipse.jetty.aggregate
jetty-all
com.martiansoftware
jsap
2.1
net.sf.py4j
py4j
0.10.8.1
+
+ commons-codec
+ commons-codec
+ 1.11
+
maven-clean-plugin
3.1.0
maven-resources-plugin
3.0.2
maven-compiler-plugin
3.8.0
-verbose
-Xlint:all
maven-surefire-plugin
2.22.1
maven-jar-plugin
3.0.2
maven-install-plugin
2.5.2
maven-deploy-plugin
2.8.2
maven-site-plugin
3.7.1
maven-project-info-reports-plugin
3.0.0
maven-assembly-plugin
org.softwareheritage.graph.App
jar-with-dependencies
false
make-assembly
package
single
pl.project13.maven
git-commit-id-plugin
3.0.1
get-the-git-infos
revision
initialize
true
true
true
true
v*
git.closest.tag.name
^v
true
org.apache.maven.plugins
maven-javadoc-plugin
3.1.1
diff --git a/java/server/src/main/java/org/softwareheritage/graph/Graph.java b/java/server/src/main/java/org/softwareheritage/graph/Graph.java
index 1221984..2c86c48 100644
--- a/java/server/src/main/java/org/softwareheritage/graph/Graph.java
+++ b/java/server/src/main/java/org/softwareheritage/graph/Graph.java
@@ -1,227 +1,227 @@
package org.softwareheritage.graph;
import java.io.IOException;
import it.unimi.dsi.big.webgraph.BVGraph;
import it.unimi.dsi.lang.FlyweightPrototype;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.backend.NodeIdMap;
import org.softwareheritage.graph.backend.NodeTypesMap;
/**
* Main class storing the compressed graph and node id mappings.
*
* The compressed graph is stored using the WebGraph
* ecosystem. Additional mappings are necessary because Software Heritage uses string based persistent
* identifiers (PID) while WebGraph uses integers internally. These two mappings (long id ↔
* PID) are used for the input (users refer to the graph using PID) and the output (convert back to
* PID for users results). However, since graph traversal can be restricted depending on the node
* type (see {@link AllowedEdges}), a long id → node type map is stored as well to avoid a full
* PID lookup.
*
* @author The Software Heritage developers
* @see org.softwareheritage.graph.AllowedEdges
* @see org.softwareheritage.graph.backend.NodeIdMap
* @see org.softwareheritage.graph.backend.NodeTypesMap
*/
public class Graph implements FlyweightPrototype {
/** File extension for the SWH PID to long node id map */
- public static final String PID_TO_NODE = ".pid2node.csv";
+ public static final String PID_TO_NODE = ".pid2node.bin";
/** File extension for the long node id to SWH PID map */
- public static final String NODE_TO_PID = ".node2pid.csv";
- /** File extension for the long node id to node typ map */
+ public static final String NODE_TO_PID = ".node2pid.bin";
+ /** File extension for the long node id to node type map */
public static final String NODE_TO_TYPE = ".node2type.map";
/** Compressed graph stored as a {@link it.unimi.dsi.big.webgraph.BVGraph} */
BVGraph graph;
/** Transposed compressed graph (used for backward traversals) */
BVGraph graphTransposed;
/** Path and basename of the compressed graph */
String path;
/** Mapping long id ↔ SWH PIDs */
NodeIdMap nodeIdMap;
/** Mapping long id → node types */
NodeTypesMap nodeTypesMap;
/**
* Constructor.
*
* @param path path and basename of the compressed graph to load
*/
public Graph(String path) throws IOException {
this.graph = BVGraph.load(path);
this.graphTransposed = BVGraph.load(path + "-transposed");
this.path = path;
this.nodeTypesMap = new NodeTypesMap(path);
// TODO: we don't need to load the nodeIdMap now that all the
// translation between ints and PIDs is happening on the Python side.
// However, some parts of the code still depend on this, so it's
// commented out while we decide on what to do with it.
this.nodeIdMap = null; // new NodeIdMap(path, getNbNodes());
}
// Protected empty constructor to implement copy()
protected Graph() { }
/**
* Return a flyweight copy of the graph.
*/
@Override
public Graph copy() {
final Graph ng = new Graph();
ng.graph = this.graph.copy();
ng.graphTransposed = this.graphTransposed.copy();
ng.path = path;
ng.nodeIdMap = this.nodeIdMap;
ng.nodeTypesMap = this.nodeTypesMap;
return ng;
}
/**
* Cleans up graph resources after use.
*/
public void cleanUp() throws IOException {
nodeIdMap.close();
}
/**
* Returns the graph full path.
*
* @return graph full path
*/
public String getPath() {
return path;
}
/**
* Converts {@link SwhPID} node to long.
*
* @param swhPID node specified as a {@link SwhPID}
* @return internal long node id
* @see org.softwareheritage.graph.SwhPID
*/
public long getNodeId(SwhPID swhPID) {
return nodeIdMap.getNodeId(swhPID);
}
/**
* Converts long id node to {@link SwhPID}.
*
* @param nodeId node specified as a long id
* @return external SWH PID
* @see org.softwareheritage.graph.SwhPID
*/
public SwhPID getSwhPID(long nodeId) {
return nodeIdMap.getSwhPID(nodeId);
}
/**
* Returns node type.
*
* @param nodeId node specified as a long id
* @return corresponding node type
* @see org.softwareheritage.graph.Node.Type
*/
public Node.Type getNodeType(long nodeId) {
return nodeTypesMap.getType(nodeId);
}
/**
* Returns number of nodes in the graph.
*
* @return number of nodes in the graph
*/
public long getNbNodes() {
return graph.numNodes();
}
/**
* Returns number of edges in the graph.
*
* @return number of edges in the graph
*/
public long getNbEdges() {
return graph.numArcs();
}
/**
* Returns lazy iterator of successors of a node.
*
* @param nodeId node specified as a long id
* @return lazy iterator of successors of the node, specified as a WebGraph LazyLongIterator
*/
public LazyLongIterator successors(long nodeId) {
return graph.successors(nodeId);
}
/**
* Returns the outdegree of a node.
*
* @param nodeId node specified as a long id
* @return outdegree of a node
*/
public long outdegree(long nodeId) {
return graph.outdegree(nodeId);
}
/**
* Returns lazy iterator of predecessors of a node.
*
* @param nodeId node specified as a long id
* @return lazy iterator of predecessors of the node, specified as a WebGraph LazyLongIterator
*/
public LazyLongIterator predecessors(long nodeId) {
return graphTransposed.successors(nodeId);
}
/**
* Returns the indegree of a node.
*
* @param nodeId node specified as a long id
* @return indegree of a node
*/
public long indegree(long nodeId) {
return graphTransposed.outdegree(nodeId);
}
/**
* Returns the degree of a node, depending on graph orientation.
*
* @param nodeId node specified as a long id
* @param useTransposed boolean value to use transposed graph
* @return degree of a node
*/
public long degree(long nodeId, boolean useTransposed) {
return (useTransposed) ? indegree(nodeId) : outdegree(nodeId);
}
/**
* Returns the neighbors of a node (as a lazy iterator), depending on graph orientation.
*
* @param nodeId node specified as a long id
* @param useTransposed boolean value to use transposed graph
* @return lazy iterator of neighbors of the node, specified as a WebGraph LazyLongIterator
*/
public LazyLongIterator neighbors(long nodeId, boolean useTransposed) {
return (useTransposed) ? predecessors(nodeId) : successors(nodeId);
}
/**
* Returns the underlying BVGraph.
*
* @param useTransposed boolean value to use transposed graph
* @return WebGraph BVGraph
*/
public BVGraph getBVGraph(boolean useTransposed) {
return (useTransposed) ? this.graphTransposed : this.graph;
}
}
diff --git a/java/server/src/main/java/org/softwareheritage/graph/Node.java b/java/server/src/main/java/org/softwareheritage/graph/Node.java
index 04fd073..5c04ac1 100644
--- a/java/server/src/main/java/org/softwareheritage/graph/Node.java
+++ b/java/server/src/main/java/org/softwareheritage/graph/Node.java
@@ -1,93 +1,105 @@
package org.softwareheritage.graph;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* A node in the Software Heritage graph.
*
* @author The Software Heritage developers
*/
public class Node {
/**
* Software Heritage graph node types, as described in the
* data model.
*/
public enum Type {
/** Content node */
CNT,
/** Directory node */
DIR,
/** Origin node */
ORI,
/** Release node */
REL,
/** Revision node */
REV,
/** Snapshot node */
SNP;
/**
* Converts integer to corresponding SWH node type.
*
* @param intType node type represented as an integer
* @return the corresponding {@link Node.Type} value
* @see org.softwareheritage.graph.Node.Type
*/
public static Node.Type fromInt(int intType) {
switch (intType) {
- case 0:
- return CNT;
- case 1:
- return DIR;
- case 2:
- return ORI;
- case 3:
- return REL;
- case 4:
- return REV;
- case 5:
- return SNP;
+ case 0: return CNT;
+ case 1: return DIR;
+ case 2: return ORI;
+ case 3: return REL;
+ case 4: return REV;
+ case 5: return SNP;
}
return null;
}
+ /**
+ * Converts node types to the corresponding int value
+ *
+ * @param type node type as an enum
+ * @return the corresponding int value
+ */
+ public static int toInt(Node.Type type) {
+ switch (type) {
+ case CNT: return 0;
+ case DIR: return 1;
+ case ORI: return 2;
+ case REL: return 3;
+ case REV: return 4;
+ case SNP: return 5;
+ }
+ throw new IllegalArgumentException("Unknown node type: " + type);
+ }
+
/**
* Converts string to corresponding SWH node type.
*
* @param strType node type represented as a string
* @return the corresponding {@link Node.Type} value
* @see org.softwareheritage.graph.Node.Type
*/
public static Node.Type fromStr(String strType) {
if (!strType.matches("cnt|dir|ori|rel|rev|snp")) {
throw new IllegalArgumentException("Unknown node type: " + strType);
}
return Node.Type.valueOf(strType.toUpperCase());
}
/**
* Parses SWH node type possible values from formatted string (see the API
* syntax).
*
* @param strFmtType node types represented as a formatted string
* @return a list containing the {@link Node.Type} values
* @see org.softwareheritage.graph.Node.Type
*/
public static ArrayList parse(String strFmtType) {
ArrayList types = new ArrayList<>();
if (strFmtType.equals("*")) {
List nodeTypes = Arrays.asList(Node.Type.values());
types.addAll(nodeTypes);
} else {
types.add(Node.Type.fromStr(strFmtType));
}
return types;
}
}
}
diff --git a/java/server/src/main/java/org/softwareheritage/graph/SwhPID.java b/java/server/src/main/java/org/softwareheritage/graph/SwhPID.java
index 34f44e1..75c7aa8 100644
--- a/java/server/src/main/java/org/softwareheritage/graph/SwhPID.java
+++ b/java/server/src/main/java/org/softwareheritage/graph/SwhPID.java
@@ -1,98 +1,108 @@
package org.softwareheritage.graph;
+import java.lang.System;
+
import com.fasterxml.jackson.annotation.JsonValue;
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.codec.DecoderException;
import org.softwareheritage.graph.Node;
/**
* A Software Heritage PID, see persistent
* identifier documentation.
*
* @author The Software Heritage developers
*/
public class SwhPID {
/** Fixed hash length of the PID */
public static final int HASH_LENGTH = 40;
/** Full PID as a string */
String swhPID;
/** PID node type */
Node.Type type;
- /** PID hex-encoded SHA1 hash */
- String hash;
/**
* Constructor.
*
* @param swhPID full PID as a string
*/
public SwhPID(String swhPID) {
this.swhPID = swhPID;
// PID format: 'swh:1:type:hash'
String[] parts = swhPID.split(":");
if (parts.length != 4 || !parts[0].equals("swh") || !parts[1].equals("1")) {
- throw new IllegalArgumentException(
- "Expected SWH PID format to be 'swh:1:type:hash', got: " + swhPID);
+ throw new IllegalArgumentException("malformed SWH PID: " + swhPID);
}
-
this.type = Node.Type.fromStr(parts[2]);
-
- this.hash = parts[3];
- if (!hash.matches("[0-9a-f]{" + HASH_LENGTH + "}")) {
- throw new IllegalArgumentException("Wrong SWH PID hash format in: " + swhPID);
+ if (!parts[3].matches("[0-9a-f]{" + HASH_LENGTH + "}")) {
+ throw new IllegalArgumentException("malformed SWH PID: " + swhPID);
}
}
@Override
public boolean equals(Object otherObj) {
if (otherObj == this)
return true;
if (!(otherObj instanceof SwhPID))
return false;
SwhPID other = (SwhPID) otherObj;
return swhPID.equals(other.getSwhPID());
}
@Override
public int hashCode() {
return swhPID.hashCode();
}
@Override
public String toString() {
return swhPID;
}
+ /** Converts PID to a compact binary representation.
+ *
+ * The binary format is specified in the Python module
+ * swh.graph.pid:str_to_bytes .
+ */
+ public byte[] toBytes() {
+ byte[] bytes = new byte[22];
+ byte[] digest;
+
+ bytes[0] = (byte) 1; // namespace version
+ bytes[1] = (byte) Node.Type.toInt(this.type); // PID type
+ try {
+ digest = Hex.decodeHex(this.swhPID.substring(10)); // SHA1 hash
+ System.arraycopy(digest, 0, bytes, 2, digest.length);
+ } catch (DecoderException e) {
+ throw new IllegalArgumentException("invalid hex sequence in PID: " + this.swhPID);
+ }
+
+ return bytes;
+ }
+
/**
* Returns full PID as a string.
*
* @return full PID string
*/
@JsonValue
public String getSwhPID() {
return swhPID;
}
/**
* Returns PID node type.
*
* @return PID corresponding {@link Node.Type}
* @see org.softwareheritage.graph.Node.Type
*/
public Node.Type getType() {
return type;
}
-
- /**
- * Returns PID hex-encoded SHA1 hash.
- *
- * @return PID string hash
- */
- public String getHash() {
- return hash;
- }
}
diff --git a/java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java b/java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
index 3a8d19a..35ca81f 100644
--- a/java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
+++ b/java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
@@ -1,123 +1,124 @@
package org.softwareheritage.graph.backend;
-import java.io.BufferedWriter;
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
import java.io.FileInputStream;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.Writer;
import java.util.zip.GZIPInputStream;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.ObjectBigArrays;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.backend.NodeTypesMap;
/**
* Pre-processing steps (such as dumping mapping files on disk) before running the graph service.
*
* @author The Software Heritage developers
*/
public class Setup {
/**
* Main entrypoint.
*
* @param args command line arguments
*/
public static void main(String[] args) throws IOException {
if (args.length != 2) {
System.err.println("Usage: NODES_CSV_GZ COMPRESSED_GRAPH_BASE_NAME");
System.exit(1);
}
String nodesPath = args[0];
String graphPath = args[1];
System.out.println("Pre-computing node id maps...");
long startTime = System.nanoTime();
precomputeNodeIdMap(nodesPath, graphPath);
long endTime = System.nanoTime();
double duration = (endTime - startTime) / 1_000_000_000;
System.out.println("Done in: " + duration + " seconds");
}
/**
* Computes and dumps on disk mapping files.
*
* @param nodesPath path of the compressed csv nodes file
* @param graphPath path of the compressed graph
*/
// Suppress warning for Object2LongFunction cast
@SuppressWarnings("unchecked")
static void precomputeNodeIdMap(String nodesPath, String graphPath) throws IOException {
// First internal mapping: SWH PID (string) -> WebGraph MPH (long)
Object2LongFunction mphMap = null;
try {
mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph");
} catch (ClassNotFoundException e) {
throw new IllegalArgumentException("The .mph file contains unknown class object: " + e);
}
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
// Second internal mapping: WebGraph MPH (long) -> BFS ordering (long)
long[][] bfsMap = LongBigArrays.newBigArray(nbIds);
long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap);
if (loaded != nbIds) {
throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded);
}
// Dump complete mapping for all nodes: SWH PID (string) <=> WebGraph node id (long)
InputStream nodesStream = new GZIPInputStream(new FileInputStream(nodesPath));
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(nodesStream, "UTF-8"));
LineIterator swhPIDIterator = new LineIterator(buffer);
- try (Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + Graph.PID_TO_NODE));
- Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + Graph.NODE_TO_PID))) {
- // nodeToSwhMap needs to write SWH PID in order of node id, so use a temporary array
+ // for the binary format of pidToNodeMap, see Python module swh.graph.pid:PidToIntMap
+ // for the binary format of nodeToPidMap, see Python module swh.graph.pid:IntToPidMap
+ try (DataOutputStream pidToNodeMap = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(graphPath + Graph.PID_TO_NODE)));
+ BufferedOutputStream nodeToPidMap = new BufferedOutputStream(new FileOutputStream(graphPath + Graph.NODE_TO_PID))) {
+ // nodeToPidMap needs to write SWH PID in order of node id, so use a temporary array
Object[][] nodeToSwhPID = ObjectBigArrays.newBigArray(nbIds);
// To effectively run edge restriction during graph traversals, we store node id (long) -> SWH
// type map. This is represented as a bitmap using minimum number of bits per Node.Type.
final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2));
final int nbBitsPerNodeType = log2NbTypes;
LongArrayBitVector nodeTypesBitVector =
LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds);
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType);
for (long iNode = 0; iNode < nbIds && swhPIDIterator.hasNext(); iNode++) {
String strSwhPID = swhPIDIterator.next().toString();
+ SwhPID swhPID = new SwhPID(strSwhPID);
+ byte[] swhPIDBin = swhPID.toBytes();
+
long mphId = mphMap.getLong(strSwhPID);
long nodeId = LongBigArrays.get(bfsMap, mphId);
- String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId);
- String line = strSwhPID + " " + paddedNodeId + "\n";
- swhToNodeMap.write(line);
+ pidToNodeMap.write(swhPIDBin, 0, swhPIDBin.length);
+ pidToNodeMap.writeLong(nodeId);
- ObjectBigArrays.set(nodeToSwhPID, nodeId, strSwhPID);
-
- SwhPID swhPID = new SwhPID(strSwhPID);
+ ObjectBigArrays.set(nodeToSwhPID, nodeId, swhPIDBin);
nodeTypesMap.set(nodeId, swhPID.getType().ordinal());
}
BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE);
for (long iNode = 0; iNode < nbIds; iNode++) {
- String line = ObjectBigArrays.get(nodeToSwhPID, iNode).toString() + "\n";
- nodeToSwhMap.write(line);
+ nodeToPidMap.write((byte[]) ObjectBigArrays.get(nodeToSwhPID, iNode));
}
}
}
}
diff --git a/swh/graph/tests/dataset/output/example.node2pid.bin b/swh/graph/tests/dataset/output/example.node2pid.bin
index 7755043..c0e13f5 100644
Binary files a/swh/graph/tests/dataset/output/example.node2pid.bin and b/swh/graph/tests/dataset/output/example.node2pid.bin differ
diff --git a/swh/graph/tests/dataset/output/example.pid2node.bin b/swh/graph/tests/dataset/output/example.pid2node.bin
index 753264c..70fa0aa 100644
Binary files a/swh/graph/tests/dataset/output/example.pid2node.bin and b/swh/graph/tests/dataset/output/example.pid2node.bin differ