diff --git a/java/src/main/java/org/softwareheritage/graph/Graph.java b/java/src/main/java/org/softwareheritage/graph/Graph.java
index f8fe5d4..db9a26f 100644
--- a/java/src/main/java/org/softwareheritage/graph/Graph.java
+++ b/java/src/main/java/org/softwareheritage/graph/Graph.java
@@ -1,227 +1,218 @@
package org.softwareheritage.graph;
import java.io.IOException;
import it.unimi.dsi.big.webgraph.BVGraph;
+import it.unimi.dsi.big.webgraph.ImmutableGraph;
+import it.unimi.dsi.big.webgraph.Transform;
import it.unimi.dsi.lang.FlyweightPrototype;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.backend.NodeIdMap;
import org.softwareheritage.graph.backend.NodeTypesMap;
/**
* Main class storing the compressed graph and node id mappings.
*
* The compressed graph is stored using the WebGraph
* ecosystem. Additional mappings are necessary because Software Heritage uses string based persistent
* identifiers (PID) while WebGraph uses integers internally. These two mappings (long id ↔
* PID) are used for the input (users refer to the graph using PID) and the output (convert back to
* PID for users results). However, since graph traversal can be restricted depending on the node
* type (see {@link AllowedEdges}), a long id → node type map is stored as well to avoid a full
* PID lookup.
*
* @author The Software Heritage developers
* @see org.softwareheritage.graph.AllowedEdges
* @see org.softwareheritage.graph.backend.NodeIdMap
* @see org.softwareheritage.graph.backend.NodeTypesMap
*/
-public class Graph implements FlyweightPrototype {
+public class Graph extends ImmutableGraph {
/** File extension for the SWH PID to long node id map */
public static final String PID_TO_NODE = ".pid2node.bin";
/** File extension for the long node id to SWH PID map */
public static final String NODE_TO_PID = ".node2pid.bin";
/** File extension for the long node id to node type map */
public static final String NODE_TO_TYPE = ".node2type.map";
/** Compressed graph stored as a {@link it.unimi.dsi.big.webgraph.BVGraph} */
- BVGraph graph;
+ ImmutableGraph graph;
/** Transposed compressed graph (used for backward traversals) */
- BVGraph graphTransposed;
+ ImmutableGraph graphTransposed;
/** Path and basename of the compressed graph */
String path;
/** Mapping long id ↔ SWH PIDs */
NodeIdMap nodeIdMap;
/** Mapping long id → node types */
NodeTypesMap nodeTypesMap;
/**
* Constructor.
*
* @param path path and basename of the compressed graph to load
*/
public Graph(String path) throws IOException {
this.graph = BVGraph.loadMapped(path);
this.graphTransposed = BVGraph.loadMapped(path + "-transposed");
this.path = path;
this.nodeTypesMap = new NodeTypesMap(path);
// TODO: we don't need to load the nodeIdMap now that all the
// translation between ints and PIDs is happening on the Python side.
// However, some parts of the code still depend on this, so it's
// commented out while we decide on what to do with it.
- this.nodeIdMap = null; // new NodeIdMap(path, getNbNodes());
+ this.nodeIdMap = null; // new NodeIdMap(path, numNodes());
}
// Protected empty constructor to implement copy()
protected Graph() { }
+ protected Graph(ImmutableGraph graph, ImmutableGraph graphTransposed, String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
+ this.graph = graph;
+ this.graphTransposed = graphTransposed;
+ this.path = path;
+ this.nodeIdMap = nodeIdMap;
+ this.nodeTypesMap = nodeTypesMap;
+ }
+
/**
* Return a flyweight copy of the graph.
*/
@Override
public Graph copy() {
- final Graph ng = new Graph();
- ng.graph = this.graph.copy();
- ng.graphTransposed = this.graphTransposed.copy();
- ng.path = path;
- ng.nodeIdMap = this.nodeIdMap;
- ng.nodeTypesMap = this.nodeTypesMap;
- return ng;
+ return new Graph(this.graph.copy(), this.graphTransposed.copy(), this.path, this.nodeIdMap, this.nodeTypesMap);
+ }
+
+ public Graph transpose() {
+ return new Graph(this.graphTransposed.copy(), this.graph.copy(), this.path, this.nodeIdMap, this.nodeTypesMap);
+ }
+
+ public Graph symmetrize() {
+ ImmutableGraph symmetric = Transform.union(graph, graphTransposed);
+ return new Graph(symmetric, symmetric, this.path, this.nodeIdMap, this.nodeTypesMap);
}
/**
* Cleans up graph resources after use.
*/
public void cleanUp() throws IOException {
nodeIdMap.close();
}
/**
* Returns the graph full path.
*
* @return graph full path
*/
public String getPath() {
return path;
}
/**
* Converts {@link SwhPID} node to long.
*
* @param swhPID node specified as a {@link SwhPID}
* @return internal long node id
* @see org.softwareheritage.graph.SwhPID
*/
public long getNodeId(SwhPID swhPID) {
return nodeIdMap.getNodeId(swhPID);
}
/**
* Converts long id node to {@link SwhPID}.
*
* @param nodeId node specified as a long id
* @return external SWH PID
* @see org.softwareheritage.graph.SwhPID
*/
public SwhPID getSwhPID(long nodeId) {
return nodeIdMap.getSwhPID(nodeId);
}
/**
* Returns node type.
*
* @param nodeId node specified as a long id
* @return corresponding node type
* @see org.softwareheritage.graph.Node.Type
*/
public Node.Type getNodeType(long nodeId) {
return nodeTypesMap.getType(nodeId);
}
+ public boolean randomAccess() { return graph.randomAccess() && graphTransposed.randomAccess(); }
+
/**
* Returns number of nodes in the graph.
*
* @return number of nodes in the graph
*/
- public long getNbNodes() {
+ public long numNodes() {
return graph.numNodes();
}
/**
* Returns number of edges in the graph.
*
* @return number of edges in the graph
*/
- public long getNbEdges() {
+ public long numArcs() {
return graph.numArcs();
}
/**
* Returns lazy iterator of successors of a node.
*
* @param nodeId node specified as a long id
* @return lazy iterator of successors of the node, specified as a WebGraph LazyLongIterator
*/
public LazyLongIterator successors(long nodeId) {
return graph.successors(nodeId);
}
/**
* Returns the outdegree of a node.
*
* @param nodeId node specified as a long id
* @return outdegree of a node
*/
public long outdegree(long nodeId) {
return graph.outdegree(nodeId);
}
/**
* Returns lazy iterator of predecessors of a node.
*
* @param nodeId node specified as a long id
* @return lazy iterator of predecessors of the node, specified as a WebGraph LazyLongIterator
*/
public LazyLongIterator predecessors(long nodeId) {
return graphTransposed.successors(nodeId);
}
/**
* Returns the indegree of a node.
*
* @param nodeId node specified as a long id
* @return indegree of a node
*/
public long indegree(long nodeId) {
return graphTransposed.outdegree(nodeId);
}
- /**
- * Returns the degree of a node, depending on graph orientation.
- *
- * @param nodeId node specified as a long id
- * @param useTransposed boolean value to use transposed graph
- * @return degree of a node
- */
- public long degree(long nodeId, boolean useTransposed) {
- return (useTransposed) ? indegree(nodeId) : outdegree(nodeId);
- }
-
- /**
- * Returns the neighbors of a node (as a lazy iterator), depending on graph orientation.
- *
- * @param nodeId node specified as a long id
- * @param useTransposed boolean value to use transposed graph
- * @return lazy iterator of neighbors of the node, specified as a WebGraph LazyLongIterator
- */
- public LazyLongIterator neighbors(long nodeId, boolean useTransposed) {
- return (useTransposed) ? predecessors(nodeId) : successors(nodeId);
- }
-
/**
* Returns the underlying BVGraph.
*
- * @param useTransposed boolean value to use transposed graph
* @return WebGraph BVGraph
*/
- public BVGraph getBVGraph(boolean useTransposed) {
- return (useTransposed) ? this.graphTransposed : this.graph;
+ public ImmutableGraph getGraph() {
+ return this.graph;
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/Neighbors.java b/java/src/main/java/org/softwareheritage/graph/Neighbors.java
index 1b13c88..d7514a1 100644
--- a/java/src/main/java/org/softwareheritage/graph/Neighbors.java
+++ b/java/src/main/java/org/softwareheritage/graph/Neighbors.java
@@ -1,84 +1,80 @@
package org.softwareheritage.graph;
import java.util.Iterator;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.AllowedEdges;
import org.softwareheritage.graph.Graph;
/**
* Iterator class to go over a node neighbors in the graph.
*
* Wrapper iterator class to easily deal with {@link AllowedEdges} during traversals.
*
* @author The Software Heritage developers
* @see org.softwareheritage.graph.AllowedEdges
*/
public class Neighbors implements Iterable {
/** Graph used to explore neighbors */
Graph graph;
- /** Boolean to specify the use of the transposed graph */
- boolean useTransposed;
/** Graph edge restriction */
AllowedEdges edges;
/** Source node from which neighbors will be listed */
long srcNodeId;
/**
* Constructor.
*
* @param graph graph used to explore neighbors
- * @param useTransposed boolean value to use transposed graph
* @param edges edges allowed to be used in the graph
* @param srcNodeId source node from where to list neighbors
*/
- public Neighbors(Graph graph, boolean useTransposed, AllowedEdges edges, long srcNodeId) {
+ public Neighbors(Graph graph, AllowedEdges edges, long srcNodeId) {
this.graph = graph;
- this.useTransposed = useTransposed;
this.edges = edges;
this.srcNodeId = srcNodeId;
}
@Override
public Iterator iterator() {
return new NeighborsIterator();
}
/**
* Inner class for {@link Neighbors} iterator.
*
* @author The Software Heritage developers
*/
public class NeighborsIterator implements Iterator {
LazyLongIterator neighbors;
long nextNeighborId;
public NeighborsIterator() {
- this.neighbors = graph.neighbors(srcNodeId, useTransposed);
+ this.neighbors = graph.successors(srcNodeId);
this.nextNeighborId = -1;
}
public boolean hasNext() {
// Case 1: no edge restriction, bypass type checks and skip to next neighbor
if (edges.restrictedTo == null) {
nextNeighborId = neighbors.nextLong();
return (nextNeighborId != -1);
}
// Case 2: edge restriction, look ahead for next neighbor
while ((nextNeighborId = neighbors.nextLong()) != -1) {
if (edges.isAllowed(srcNodeId, nextNeighborId)) {
return true;
}
}
return false;
}
public Long next() {
return nextNeighborId;
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java b/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java
deleted file mode 100644
index cc4fd85..0000000
--- a/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java
+++ /dev/null
@@ -1,14 +0,0 @@
-package org.softwareheritage.graph.algo;
-
-import java.util.function.BiConsumer;
-
-public interface NodeIdsConsumer extends BiConsumer {
-
- /** Callback for returning a (potentially large) list of node identifiers.
- * The callback will be invoked repeatedly without reallocating the array.
- * At each invocation the array might contain more than size node
- * identifiers, but only identifiers located up to position size-1 are to
- * be considered during that specific invocation.
- */
- void accept(long nodeIds[], int size);
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java b/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java
index 0420a41..0c8a400 100644
--- a/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java
+++ b/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java
@@ -1,494 +1,495 @@
package org.softwareheritage.graph.algo;
import java.util.*;
-import it.unimi.dsi.bits.LongArrayBitVector;
-
import org.softwareheritage.graph.AllowedEdges;
import org.softwareheritage.graph.Endpoint;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Neighbors;
import org.softwareheritage.graph.Node;
/**
* Traversal algorithms on the compressed graph.
*
* Internal implementation of the traversal API endpoints. These methods only input/output internal
* long ids, which are converted in the {@link Endpoint} higher-level class to Software Heritage
* PID.
*
* @author The Software Heritage developers
* @see org.softwareheritage.graph.Endpoint
*/
public class Traversal {
/** Graph used in the traversal */
Graph graph;
- /** Boolean to specify the use of the transposed graph */
- boolean useTransposed;
/** Graph edge restriction */
AllowedEdges edges;
/** Hash set storing if we have visited a node */
HashSet visited;
/** Hash map storing parent node id for each nodes during a traversal */
Map parentNode;
/** Number of edges accessed during traversal */
long nbEdgesAccessed;
/** random number generator, for random walks */
Random rng;
/**
* Constructor.
*
* @param graph graph used in the traversal
* @param direction a string (either "forward" or "backward") specifying edge orientation
* @param edgesFmt a formatted string describing allowed edges
*/
public Traversal(Graph graph, String direction, String edgesFmt) {
if (!direction.matches("forward|backward")) {
throw new IllegalArgumentException("Unknown traversal direction: " + direction);
}
- this.graph = graph;
- this.useTransposed = (direction.equals("backward"));
+ if (direction.equals("backward")) {
+ this.graph = graph.transpose();
+ } else {
+ this.graph = graph;
+ }
this.edges = new AllowedEdges(graph, edgesFmt);
- long nbNodes = graph.getNbNodes();
this.visited = new HashSet<>();
this.parentNode = new HashMap<>();
this.nbEdgesAccessed = 0;
this.rng = new Random();
}
/**
* Returns number of accessed edges during traversal.
*
* @return number of edges accessed in last traversal
*/
public long getNbEdgesAccessed() {
return nbEdgesAccessed;
}
/**
* Returns number of accessed nodes during traversal.
*
* @return number of nodes accessed in last traversal
*/
public long getNbNodesAccessed() {
return this.visited.size();
}
/**
- * Push version of {@link leaves}: will fire passed callback for each leaf.
+ * Push version of {@link #leaves} will fire passed callback for each leaf.
*/
public void leavesVisitor(long srcNodeId, NodeIdConsumer cb) {
- Stack stack = new Stack();
+ Stack stack = new Stack<>();
this.nbEdgesAccessed = 0;
stack.push(srcNodeId);
visited.add(srcNodeId);
while (!stack.isEmpty()) {
long currentNodeId = stack.pop();
long neighborsCnt = 0;
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
neighborsCnt++;
if (!visited.contains(neighborNodeId)) {
stack.push(neighborNodeId);
visited.add(neighborNodeId);
}
}
if (neighborsCnt == 0) {
cb.accept(currentNodeId);
}
}
}
/**
* Returns the leaves of a subgraph rooted at the specified source node.
*
* @param srcNodeId source node
* @return list of node ids corresponding to the leaves
*/
public ArrayList leaves(long srcNodeId) {
- ArrayList nodeIds = new ArrayList();
- leavesVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId));
+ ArrayList nodeIds = new ArrayList<>();
+ leavesVisitor(srcNodeId, nodeIds::add);
return nodeIds;
}
/**
- * Push version of {@link neighbors}: will fire passed callback on each
+ * Push version of {@link #neighbors}: will fire passed callback on each
* neighbor.
*/
public void neighborsVisitor(long srcNodeId, NodeIdConsumer cb) {
- this.nbEdgesAccessed = graph.degree(srcNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, srcNodeId)) {
+ this.nbEdgesAccessed = graph.outdegree(srcNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, srcNodeId)) {
cb.accept(neighborNodeId);
}
}
/**
* Returns node direct neighbors (linked with exactly one edge).
*
* @param srcNodeId source node
* @return list of node ids corresponding to the neighbors
*/
public ArrayList neighbors(long srcNodeId) {
- ArrayList nodeIds = new ArrayList();
- neighborsVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId));
+ ArrayList nodeIds = new ArrayList<>();
+ neighborsVisitor(srcNodeId, nodeIds::add);
return nodeIds;
}
/**
- * Push version of {@link visitNodes}: will fire passed callback on each
+ * Push version of {@link #visitNodes}: will fire passed callback on each
* visited node.
*/
public void visitNodesVisitor(long srcNodeId, NodeIdConsumer nodeCb, EdgeIdConsumer edgeCb) {
- Stack stack = new Stack();
+ Stack stack = new Stack<>();
this.nbEdgesAccessed = 0;
stack.push(srcNodeId);
visited.add(srcNodeId);
while (!stack.isEmpty()) {
long currentNodeId = stack.pop();
if (nodeCb != null) {
nodeCb.accept(currentNodeId);
}
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
if (edgeCb != null) {
edgeCb.accept(currentNodeId, neighborNodeId);
}
if (!visited.contains(neighborNodeId)) {
stack.push(neighborNodeId);
visited.add(neighborNodeId);
}
}
}
}
/** One-argument version to handle callbacks properly */
public void visitNodesVisitor(long srcNodeId, NodeIdConsumer cb) {
visitNodesVisitor(srcNodeId, cb, null);
}
/**
* Performs a graph traversal and returns explored nodes.
*
* @param srcNodeId source node
* @return list of explored node ids
*/
public ArrayList visitNodes(long srcNodeId) {
- ArrayList nodeIds = new ArrayList();
- visitNodesVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId));
+ ArrayList nodeIds = new ArrayList<>();
+ visitNodesVisitor(srcNodeId, nodeIds::add);
return nodeIds;
}
/**
- * Push version of {@link visitPaths}: will fire passed callback on each
+ * Push version of {@link #visitPaths}: will fire passed callback on each
* discovered (complete) path.
*/
public void visitPathsVisitor(long srcNodeId, PathConsumer cb) {
- Stack currentPath = new Stack();
+ Stack currentPath = new Stack<>();
this.nbEdgesAccessed = 0;
visitPathsInternalVisitor(srcNodeId, currentPath, cb);
}
/**
* Performs a graph traversal and returns explored paths.
*
* @param srcNodeId source node
* @return list of explored paths (represented as a list of node ids)
*/
public ArrayList> visitPaths(long srcNodeId) {
ArrayList> paths = new ArrayList<>();
- visitPathsVisitor(srcNodeId, (path) -> paths.add(path));
+ visitPathsVisitor(srcNodeId, paths::add);
return paths;
}
private void visitPathsInternalVisitor(long currentNodeId,
Stack currentPath,
PathConsumer cb) {
currentPath.push(currentNodeId);
long visitedNeighbors = 0;
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
visitPathsInternalVisitor(neighborNodeId, currentPath, cb);
visitedNeighbors++;
}
if (visitedNeighbors == 0) {
- ArrayList path = new ArrayList();
- for (long nodeId : currentPath) {
- path.add(nodeId);
- }
+ ArrayList path = new ArrayList<>(currentPath);
cb.accept(path);
}
currentPath.pop();
}
/**
* Performs a graph traversal with backtracking, and returns the first
* found path from source to destination.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @return found path as a list of node ids
*/
public ArrayList walk(long srcNodeId, T dst, String visitOrder) {
- long dstNodeId = -1;
+ long dstNodeId;
if (visitOrder.equals("dfs")) {
dstNodeId = walkInternalDFS(srcNodeId, dst);
} else if (visitOrder.equals("bfs")) {
dstNodeId = walkInternalBFS(srcNodeId, dst);
} else {
throw new IllegalArgumentException("Unknown visit order: " + visitOrder);
}
if (dstNodeId == -1) {
throw new IllegalArgumentException("Cannot find destination: " + dst);
}
- ArrayList nodeIds = backtracking(srcNodeId, dstNodeId);
- return nodeIds;
+ return backtracking(srcNodeId, dstNodeId);
}
/**
* Performs a random walk (picking a random successor at each step) from
* source to destination.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @return found path as a list of node ids or an empty path to indicate
* that no suitable path have been found
*/
public ArrayList randomWalk(long srcNodeId, T dst) {
return randomWalk(srcNodeId, dst, 0);
}
/**
* Performs a stubborn random walk (picking a random successor at each
* step) from source to destination. The walk is "stubborn" in the sense
* that it will not give up the first time if a satisfying target node is
* found, but it will retry up to a limited amount of times.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @param retries number of times to retry; 0 means no retries (single walk)
* @return found path as a list of node ids or an empty path to indicate
* that no suitable path have been found
*/
public ArrayList randomWalk(long srcNodeId, T dst, int retries) {
long curNodeId = srcNodeId;
- ArrayList path = new ArrayList();
+ ArrayList path = new ArrayList<>();
this.nbEdgesAccessed = 0;
boolean found;
if (retries < 0) {
throw new IllegalArgumentException("Negative number of retries given: " + retries);
}
while (true) {
path.add(curNodeId);
- Neighbors neighbors = new Neighbors(graph, useTransposed, edges, curNodeId);
+ Neighbors neighbors = new Neighbors(graph, edges, curNodeId);
curNodeId = randomPick(neighbors.iterator());
if (curNodeId < 0) {
found = false;
break;
}
if (isDstNode(curNodeId, dst)) {
path.add(curNodeId);
found = true;
break;
}
}
if (found) {
return path;
} else if (retries > 0) { // try again
return randomWalk(srcNodeId, dst, retries - 1);
} else { // not found and no retries left
path.clear();
return path;
}
}
/**
* Randomly choose an element from an iterator over Longs using reservoir
* sampling
*
* @param elements iterator over selection domain
* @return randomly chosen element or -1 if no suitable element was found
*/
private long randomPick(Iterator elements) {
long curPick = -1;
long seenCandidates = 0;
while (elements.hasNext()) {
seenCandidates++;
if (Math.round(rng.nextFloat() * (seenCandidates - 1)) == 0) {
curPick = elements.next();
}
}
return curPick;
}
/**
* Internal DFS function of {@link #walk}.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @return final destination node or -1 if no path found
*/
private long walkInternalDFS(long srcNodeId, T dst) {
- Stack stack = new Stack();
+ Stack stack = new Stack<>();
this.nbEdgesAccessed = 0;
stack.push(srcNodeId);
visited.add(srcNodeId);
while (!stack.isEmpty()) {
long currentNodeId = stack.pop();
if (isDstNode(currentNodeId, dst)) {
return currentNodeId;
}
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
if (!visited.contains(neighborNodeId)) {
stack.push(neighborNodeId);
visited.add(neighborNodeId);
parentNode.put(neighborNodeId, currentNodeId);
}
}
}
return -1;
}
/**
* Internal BFS function of {@link #walk}.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @return final destination node or -1 if no path found
*/
private long walkInternalBFS(long srcNodeId, T dst) {
- Queue queue = new LinkedList();
+ Queue queue = new LinkedList<>();
this.nbEdgesAccessed = 0;
queue.add(srcNodeId);
visited.add(srcNodeId);
while (!queue.isEmpty()) {
long currentNodeId = queue.poll();
if (isDstNode(currentNodeId, dst)) {
return currentNodeId;
}
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
if (!visited.contains(neighborNodeId)) {
queue.add(neighborNodeId);
visited.add(neighborNodeId);
parentNode.put(neighborNodeId, currentNodeId);
}
}
}
return -1;
}
/**
* Internal function of {@link #walk} to check if a node corresponds to the destination.
*
* @param nodeId current node
* @param dst destination (either a node or a node type)
* @return true if the node is a destination, or false otherwise
*/
private boolean isDstNode(long nodeId, T dst) {
if (dst instanceof Long) {
long dstNodeId = (Long) dst;
return nodeId == dstNodeId;
} else if (dst instanceof Node.Type) {
Node.Type dstType = (Node.Type) dst;
return graph.getNodeType(nodeId) == dstType;
} else {
return false;
}
}
/**
* Internal backtracking function of {@link #walk}.
*
* @param srcNodeId source node
* @param dstNodeId destination node
* @return the found path, as a list of node ids
*/
private ArrayList backtracking(long srcNodeId, long dstNodeId) {
- ArrayList path = new ArrayList();
+ ArrayList path = new ArrayList<>();
long currentNodeId = dstNodeId;
while (currentNodeId != srcNodeId) {
path.add(currentNodeId);
currentNodeId = parentNode.get(currentNodeId);
}
path.add(srcNodeId);
Collections.reverse(path);
return path;
}
+ /**
+ * Find a common descendant between two given nodes using two parallel BFS
+ *
+ * @param lhsNode the first node
+ * @param rhsNode the second node
+ * @return the found path, as a list of node ids
+ */
public Long findCommonDescendant(long lhsNode, long rhsNode) {
Queue lhsStack = new ArrayDeque<>();
Queue rhsStack = new ArrayDeque<>();
HashSet lhsVisited = new HashSet<>();
HashSet rhsVisited = new HashSet<>();
lhsStack.add(lhsNode);
rhsStack.add(rhsNode);
lhsVisited.add(lhsNode);
rhsVisited.add(rhsNode);
this.nbEdgesAccessed = 0;
Long curNode;
while (!lhsStack.isEmpty() || !rhsStack.isEmpty()) {
if (!lhsStack.isEmpty()) {
curNode = lhsStack.poll();
- nbEdgesAccessed += graph.degree(curNode, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, curNode)) {
+ nbEdgesAccessed += graph.outdegree(curNode);
+ for (long neighborNodeId : new Neighbors(graph, edges, curNode)) {
if (!lhsVisited.contains(neighborNodeId)) {
if (rhsVisited.contains(neighborNodeId))
return neighborNodeId;
lhsStack.add(neighborNodeId);
lhsVisited.add(neighborNodeId);
}
}
}
if (!rhsStack.isEmpty()) {
curNode = rhsStack.poll();
- nbEdgesAccessed += graph.degree(curNode, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, curNode)) {
+ nbEdgesAccessed += graph.outdegree(curNode);
+ for (long neighborNodeId : new Neighbors(graph, edges, curNode)) {
if (!rhsVisited.contains(neighborNodeId)) {
if (lhsVisited.contains(neighborNodeId))
return neighborNodeId;
rhsStack.add(neighborNodeId);
rhsVisited.add(neighborNodeId);
}
}
}
}
return null;
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java b/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java
index c080a93..1f57665 100644
--- a/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java
@@ -1,216 +1,217 @@
package org.softwareheritage.graph.backend;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;
import java.util.concurrent.*;
import it.unimi.dsi.bits.LongArrayBitVector;
+import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.ObjectBigArrays;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.backend.NodeTypesMap;
/**
* Create maps needed at runtime by the graph service, in particular:
*
* - SWH PID → WebGraph long node id
* - WebGraph long node id → SWH PID (converse of the former)
* - WebGraph long node id → SWH node type (enum)
*
* @author The Software Heritage developers
*/
public class MapBuilder {
final static String SORT_BUFFER_SIZE = "40%";
final static Logger logger = LoggerFactory.getLogger(MapBuilder.class);
/**
* Main entrypoint.
*
* @param args command line arguments
*/
public static void main(String[] args) throws IOException {
if (args.length != 2) {
logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV");
System.exit(1);
}
String graphPath = args[0];
String tmpDir = args[1];
logger.info("starting maps generation...");
precomputeNodeIdMap(graphPath, tmpDir);
logger.info("maps generation completed");
}
/**
* Computes and dumps on disk mapping files.
*
* @param graphPath path of the compressed graph
*/
// Suppress warning for Object2LongFunction cast
@SuppressWarnings("unchecked")
static void precomputeNodeIdMap(String graphPath, String tmpDir)
throws IOException
{
ProgressLogger plPid2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
ProgressLogger plNode2Pid = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
plPid2Node.itemsName = "pid→node";
plNode2Pid.itemsName = "node→pid";
// avg speed for pid→node is sometime skewed due to write to the sort
// pipe hanging when sort is sorting; hence also desplay local speed
plPid2Node.displayLocalSpeed = true;
// first half of PID->node mapping: PID -> WebGraph MPH (long)
Object2LongFunction mphMap = null;
try {
logger.info("loading MPH function...");
mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph");
logger.info("MPH function loaded");
} catch (ClassNotFoundException e) {
logger.error("unknown class object in .mph file: " + e);
System.exit(2);
}
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
plPid2Node.expectedUpdates = nbIds;
plNode2Pid.expectedUpdates = nbIds;
// second half of PID->node mapping: WebGraph MPH (long) -> BFS order (long)
long[][] bfsMap = LongBigArrays.newBigArray(nbIds);
logger.info("loading BFS order file...");
long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap);
logger.info("BFS order file loaded");
if (loaded != nbIds) {
logger.error("graph contains " + nbIds + " nodes, but read " + loaded);
System.exit(2);
}
// Create mapping SWH PID -> WebGraph node id, by sequentially reading
// nodes, hashing them with MPH, and permuting according to BFS order
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in,
StandardCharsets.US_ASCII));
LineIterator swhPIDIterator = new LineIterator(buffer);
// The WebGraph node id -> SWH PID mapping can be obtained from the
// PID->node one by numerically sorting on node id and sequentially
// writing obtained PIDs to a binary map. Delegates the sorting job to
// /usr/bin/sort via pipes
ProcessBuilder processBuilder = new ProcessBuilder();
processBuilder.command("sort", "--numeric-sort", "--key", "2",
"--buffer-size", SORT_BUFFER_SIZE,
"--temporary-directory", tmpDir);
Process sort = processBuilder.start();
BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream());
BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream());
// for the binary format of pidToNodeMap, see Python module swh.graph.pid:PidToIntMap
// for the binary format of nodeToPidMap, see Python module swh.graph.pid:IntToPidMap
try (DataOutputStream pidToNodeMap = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(graphPath + Graph.PID_TO_NODE)));
BufferedOutputStream nodeToPidMap = new BufferedOutputStream(new FileOutputStream(graphPath + Graph.NODE_TO_PID))) {
// background handler for sort output, it will be fed PID/node
// pairs while pidToNodeMap is being filled, and will itself fill
// nodeToPidMap as soon as data from sort is ready
SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToPidMap, plNode2Pid);
outputHandler.start();
// Type map from WebGraph node ID to SWH type. Used at runtime by
// pure Java graph traversals to efficiently check edge
// restrictions.
final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length)
/ Math.log(2));
final int nbBitsPerNodeType = log2NbTypes;
LongArrayBitVector nodeTypesBitVector =
LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds);
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType);
plPid2Node.start("filling pid2node map");
for (long iNode = 0; iNode < nbIds && swhPIDIterator.hasNext(); iNode++) {
String strSwhPID = swhPIDIterator.next().toString();
SwhPID swhPID = new SwhPID(strSwhPID);
byte[] swhPIDBin = swhPID.toBytes();
long mphId = mphMap.getLong(strSwhPID);
- long nodeId = LongBigArrays.get(bfsMap, mphId);
+ long nodeId = BigArrays.get(bfsMap, mphId);
pidToNodeMap.write(swhPIDBin, 0, swhPIDBin.length);
pidToNodeMap.writeLong(nodeId);
sort_stdin.write((strSwhPID + "\t" + nodeId + "\n")
.getBytes(StandardCharsets.US_ASCII));
nodeTypesMap.set(nodeId, swhPID.getType().ordinal());
plPid2Node.lightUpdate();
}
plPid2Node.done();
sort_stdin.close();
// write type map
logger.info("storing type map");
BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE);
logger.info("type map stored");
// wait for nodeToPidMap filling
try {
logger.info("waiting for node2pid map...");
int sortExitCode = sort.waitFor();
if (sortExitCode != 0) {
logger.error("sort returned non-zero exit code: " + sortExitCode);
System.exit(2);
}
outputHandler.join();
} catch (InterruptedException e) {
logger.error("processing of sort output failed with: " + e);
System.exit(2);
}
}
}
private static class SortOutputHandler extends Thread {
private Scanner input;
private OutputStream output;
private ProgressLogger pl;
SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) {
this.input = new Scanner(input, StandardCharsets.US_ASCII);
this.output = output;
this.pl = pl;
}
public void run() {
boolean sortDone = false;
logger.info("node2pid: waiting for sort output...");
while (input.hasNextLine()) {
if (! sortDone) {
sortDone = true;
this.pl.start("filling node2pid map");
}
String line = input.nextLine(); // format: SWH_PID NODE_ID
SwhPID swhPID = new SwhPID(line.split("\\t")[0]); // get PID
try {
output.write((byte[]) swhPID.toBytes());
} catch (IOException e) {
logger.error("writing to node->PID map failed with: " + e);
}
this.pl.lightUpdate();
}
this.pl.done();
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/backend/Pp.java b/java/src/main/java/org/softwareheritage/graph/backend/Pp.java
index 3cd0eb7..af495a7 100644
--- a/java/src/main/java/org/softwareheritage/graph/backend/Pp.java
+++ b/java/src/main/java/org/softwareheritage/graph/backend/Pp.java
@@ -1,42 +1,23 @@
package org.softwareheritage.graph.backend;
-import java.io.BufferedWriter;
-import java.io.FileInputStream;
-import java.io.FileWriter;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Writer;
-import java.util.zip.GZIPInputStream;
-
-import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
-import it.unimi.dsi.fastutil.longs.LongBigArrays;
-import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
-import it.unimi.dsi.fastutil.objects.ObjectBigArrays;
-import it.unimi.dsi.io.FastBufferedReader;
-import it.unimi.dsi.io.LineIterator;
-
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
-import org.softwareheritage.graph.SwhPID;
-import org.softwareheritage.graph.backend.NodeTypesMap;
public class Pp {
-
+ @SuppressWarnings("unchecked")
public static void main(String[] args) throws IOException {
Object2LongFunction mphMap = null;
try {
mphMap = (Object2LongFunction) BinIO.loadObject("all.mph");
} catch (ClassNotFoundException e) {
throw new IllegalArgumentException("The .mph file contains unknown class object: " + e);
}
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
System.out.println("mph size: " + nbIds);
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
index 27334f9..5d5069b 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
@@ -1,130 +1,115 @@
package org.softwareheritage.graph.benchmark;
import com.google.common.primitives.Longs;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.io.ByteDiskQueue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.Graph;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.fastutil.Arrays;
import java.io.File;
import java.io.IOException;
public class BFS {
- private Graph graph;
-
- private void load_graph(String graphBasename) throws IOException {
- System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = new Graph(graphBasename);
- System.err.println("Graph loaded.");
- }
+ private final ImmutableGraph graph;
private final static Logger LOGGER = LoggerFactory.getLogger(BFS.class);
+ public BFS(ImmutableGraph graph) {
+ this.graph = graph;
+ }
+
// Partly inlined from it.unimi.dsi.law.big.graph.BFS
- private static void bfsperm(final ImmutableGraph graph) throws IOException {
+ private void bfsperm() throws IOException {
final long n = graph.numNodes();
// Allow enough memory to behave like in-memory queue
int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
// Use a disk based queue to store BFS frontier
final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue");
final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
final byte[] byteBuf = new byte[Long.BYTES];
// WARNING: no 64-bit version of this data-structure, but it can support
// indices up to 2^37
final LongArrayBitVector visited = LongArrayBitVector.ofLength(n);
final ProgressLogger pl = new ProgressLogger(LOGGER);
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting breadth-first visit...");
- long pos = 0;
-
for (long i = 0; i < n; i++) {
if (visited.getBoolean(i)) continue;
queue.enqueue(Longs.toByteArray(i));
visited.set(i);
while (!queue.isEmpty()) {
queue.dequeue(byteBuf);
final long currentNode = Longs.fromByteArray(byteBuf);
final LazyLongIterator iterator = graph.successors(currentNode);
long succ;
while((succ = iterator.nextLong()) != -1) {
if (!visited.getBoolean(succ)) {
visited.set(succ);
queue.enqueue(Longs.toByteArray(succ));
}
}
pl.update();
}
}
pl.done();
queue.close();
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(
BFS.class.getName(),
"",
new Parameter[] {
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
'g', "graph", "Basename of the compressed graph"),
new FlaggedOption("useTransposed", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED,
'T', "transposed", "Use transposed graph (default: false)"),
}
);
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
boolean useTransposed = config.getBoolean("useTransposed");
- ProgressLogger logger = new ProgressLogger();
- long startTime;
- double totalTime;
-
+ System.err.println("Loading graph " + graphPath + " ...");
+ Graph graph = new Graph(graphPath);
+ System.err.println("Graph loaded.");
- BFS bfs = new BFS();
- try {
- bfs.load_graph(graphPath);
- } catch (IOException e) {
- System.out.println("Could not load graph: " + e);
- System.exit(2);
- }
+ if (useTransposed)
+ graph = graph.transpose();
- logger.start("Parallel BFS visit...");
- try {
- BFS.bfsperm(bfs.graph.getBVGraph(useTransposed));
- } catch (IOException e) {
- e.printStackTrace();
- }
- logger.done();
+ BFS bfs = new BFS(graph);
+ bfs.bfsperm();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
index 0eb6c23..bdf51b6 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
@@ -1,170 +1,170 @@
package org.softwareheritage.graph.benchmark;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.StringJoiner;
import java.util.function.Function;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;
import org.softwareheritage.graph.Endpoint;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.benchmark.utils.Random;
import org.softwareheritage.graph.benchmark.utils.Statistics;
/**
* Benchmark common utility functions.
*
* @author The Software Heritage developers
*/
public class Benchmark {
/**
* Input arguments.
*/
public class Args {
/** Basename of the compressed graph */
public String graphPath;
/** Number of random nodes to use for the benchmark */
public int nbNodes;
/** File name for CSV format benchmark log */
public String logFile;
/** Random generator */
public Random random;
}
/** Command line arguments */
public Args args;
/** CSV separator for log file */
final String CSV_SEPARATOR = ";";
/**
* Constructor.
*/
public Benchmark() {
this.args = new Args();
}
/**
* Parses benchmark command line arguments.
*
* @param args command line arguments
*/
public void parseCommandLineArgs(String[] args) throws JSAPException {
SimpleJSAP jsap = new SimpleJSAP(Benchmark.class.getName(),
"Benchmark tool for Software Heritage use-cases scenarios.",
new Parameter[] {
new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
JSAP.NOT_GREEDY, "The basename of the compressed graph."),
new FlaggedOption("nbNodes", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n',
"nb-nodes", "Number of random nodes used to do the benchmark."),
new FlaggedOption("logFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'l',
"log-file", "File name to output CSV format benchmark log."),
new FlaggedOption("seed", JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's',
"seed", "Random generator seed."),
});
JSAPResult config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
this.args.graphPath = config.getString("graphPath");
this.args.nbNodes = config.getInt("nbNodes");
this.args.logFile = config.getString("logFile");
this.args.random = config.contains("seed") ? new Random(config.getLong("seed")) : new Random();
}
/**
* Creates CSV file for log output.
*/
public void createCSVLogFile() throws IOException {
try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile))) {
StringJoiner csvHeader = new StringJoiner(CSV_SEPARATOR);
csvHeader.add("use case name")
.add("SWH PID")
.add("number of edges accessed")
.add("traversal timing")
.add("pid2node timing")
.add("node2pid timing");
csvLog.write(csvHeader.toString() + "\n");
}
}
/**
* Times a specific endpoint and outputs individual datapoints along with aggregated statistics.
*
* @param useCaseName benchmark use-case name
* @param graph compressed graph used in the benchmark
* @param nodeIds node ids to use as starting point for the endpoint traversal
* @param operation endpoint function to benchmark
* @param dstFmt destination formatted string as described in the API
* @param algorithm traversal algorithm used in endpoint call (either "dfs" or "bfs")
*/
public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds,
Function operation, String dstFmt, String algorithm)
throws IOException {
ArrayList timings = new ArrayList<>();
ArrayList timingsNormalized = new ArrayList<>();
ArrayList nbEdgesAccessed = new ArrayList<>();
final boolean append = true;
try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile, append))) {
for (long nodeId : nodeIds) {
SwhPID swhPID = graph.getSwhPID(nodeId);
Endpoint.Output output = (dstFmt == null)
? operation.apply(new Endpoint.Input(swhPID))
: operation.apply(new Endpoint.Input(swhPID, dstFmt, algorithm));
StringJoiner csvLine = new StringJoiner(CSV_SEPARATOR);
csvLine.add(useCaseName)
.add(swhPID.toString())
.add(Long.toString(output.meta.nbEdgesAccessed))
.add(Double.toString(output.meta.timings.traversal))
.add(Double.toString(output.meta.timings.pid2node))
.add(Double.toString(output.meta.timings.node2pid));
csvLog.write(csvLine.toString() + "\n");
timings.add(output.meta.timings.traversal);
nbEdgesAccessed.add((double) output.meta.nbEdgesAccessed);
if (output.meta.nbEdgesAccessed != 0) {
timingsNormalized.add(output.meta.timings.traversal / output.meta.nbEdgesAccessed);
}
}
}
System.out.println("\n" + useCaseName + " use-case:");
System.out.println("timings:");
Statistics stats = new Statistics(timings);
stats.printAll();
System.out.println("timings normalized:");
Statistics statsNormalized = new Statistics(timingsNormalized);
statsNormalized.printAll();
System.out.println("nb edges accessed:");
Statistics statsNbEdgesAccessed = new Statistics(nbEdgesAccessed);
statsNbEdgesAccessed.printAll();
}
/**
- * Same as {@link timeEndpoint} but without destination or algorithm specified to endpoint call.
+ * Same as {@link #timeEndpoint} but without destination or algorithm specified to endpoint call.
*/
public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds,
Function operation) throws IOException {
timeEndpoint(useCaseName, graph, nodeIds, operation, null, null);
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java
index e6a5cd2..722dd74 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java
@@ -1,223 +1,216 @@
package org.softwareheritage.graph.benchmark;
import com.google.common.primitives.Longs;
import com.martiansoftware.jsap.*;
-import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import it.unimi.dsi.big.webgraph.Transform;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.logging.ProgressLogger;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class ForkCC {
public Boolean includeRootDir;
private Graph graph;
private Long emptySnapshot;
private LongArrayBitVector visited;
private LongArrayBitVector whitelist;
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = new Graph(graphBasename);
+ this.graph = new Graph(graphBasename).symmetrize();
System.err.println("Graph loaded.");
this.emptySnapshot = null;
this.whitelist = null;
this.visited = null;
this.includeRootDir = null;
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(
ForkCC.class.getName(),
"",
new Parameter[] {
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
'g', "graph", "Basename of the compressed graph"),
new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED,
't', "whitelist", "Whitelist of origins"),
new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED,
'w', "numthreads", "Number of threads"),
new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED,
'R', "includerootdir", "Include root directory (default: false)"),
}
);
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private boolean nodeIsEmptySnapshot(Long node) {
if (this.emptySnapshot == null
&& this.graph.getNodeType(node) == Node.Type.SNP
&& this.graph.outdegree(node) == 0) {
System.err.println("Found empty snapshot: " + node);
this.emptySnapshot = node;
}
return node.equals(this.emptySnapshot);
}
private Boolean shouldVisit(Long node){
Node.Type nt = this.graph.getNodeType(node);
if (nt == Node.Type.CNT) {
return false;
}
if (nt == Node.Type.DIR && !includeRootDir)
return false;
if (this.nodeIsEmptySnapshot(node))
return false;
if (visited.getBoolean(node))
return false;
return true;
}
- private ArrayList> compute(final ImmutableGraph graph, ProgressLogger pl) throws IOException {
+ private ArrayList> compute(ProgressLogger pl) throws IOException {
final long n = graph.numNodes();
// Allow enough memory to behave like in-memory queue
int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
// Use a disk based queue to store BFS frontier
final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue");
final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
final byte[] byteBuf = new byte[Long.BYTES];
// WARNING: no 64-bit version of this data-structure, but it can support
// indices up to 2^37
visited = LongArrayBitVector.ofLength(n);
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting connected components visit...");
ArrayList> components = new ArrayList<>();
for (long i = 0; i < n; i++) {
if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR) continue;
ArrayList component = new ArrayList<>();
queue.enqueue(Longs.toByteArray(i));
visited.set(i);
while (!queue.isEmpty()) {
queue.dequeue(byteBuf);
final long currentNode = Longs.fromByteArray(byteBuf);
Node.Type cur_nt = this.graph.getNodeType(currentNode);
if (cur_nt == Node.Type.ORI
&& (this.whitelist == null || this.whitelist.getBoolean(currentNode))) {
// TODO: add a check that the origin has >=1 non-empty snapshot
component.add(currentNode);
}
final LazyLongIterator iterator = graph.successors(currentNode);
long succ;
while ((succ = iterator.nextLong()) != -1) {
if (!shouldVisit(succ)) continue;
if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV) continue;
visited.set(succ);
queue.enqueue(Longs.toByteArray(succ));
}
pl.update();
}
if (component.size() > 0) {
components.add(component);
}
}
pl.done();
queue.close();
return components;
}
private static void printDistribution(ArrayList> components) {
TreeMap distribution = new TreeMap<>();
for (ArrayList component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry entry : distribution.entrySet()) {
System.out.format("%d %d\n", entry.getKey(), entry.getValue());
}
}
private static void printLargestComponent(ArrayList> components) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList component = components.get(indexLargest);
for (Long node : component) {
System.out.println(node);
}
}
private void parseWhitelist(String path) {
System.err.println("Loading whitelist " + path + " ...");
- this.whitelist = LongArrayBitVector.ofLength(this.graph.getNbNodes());
+ this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes());
Scanner scanner = null;
try {
scanner = new Scanner(new File(path));
while(scanner.hasNextLong()) {
whitelist.set(scanner.nextLong());
}
System.err.println("Whitelist loaded.");
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
int numThreads = config.getInt("numThreads");
String whitelistPath = config.getString("whitelistPath");
boolean includeRootDir = config.getBoolean("includeRootDir");
ForkCC forkCc = new ForkCC();
try {
forkCc.load_graph(graphPath);
forkCc.includeRootDir = includeRootDir;
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
if (whitelistPath != null) {
forkCc.parseWhitelist(whitelistPath);
}
- ImmutableGraph symmetric = Transform.union(
- forkCc.graph.getBVGraph(false),
- forkCc.graph.getBVGraph(true)
- );
-
ProgressLogger logger = new ProgressLogger();
try {
- ArrayList> components = forkCc.compute(symmetric, logger);
+ ArrayList> components = forkCc.compute(logger);
printDistribution(components);
// printLargestComponent(components);
} catch (IOException e) {
e.printStackTrace();
}
logger.done();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java
index a6f1c79..402632c 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java
@@ -1,93 +1,93 @@
package org.softwareheritage.graph.benchmark;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
import java.io.IOException;
import java.util.ArrayList;
public class ListEmptyOrigins {
private Graph graph;
private Long emptySnapshot;
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
this.graph = new Graph(graphBasename);
System.err.println("Graph loaded.");
this.emptySnapshot = null;
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(
ListEmptyOrigins.class.getName(),
"",
new Parameter[] {
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
'g', "graph", "Basename of the compressed graph"),
}
);
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private boolean nodeIsEmptySnapshot(Long node) {
System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node);
if (this.emptySnapshot == null
&& this.graph.getNodeType(node) == Node.Type.SNP
&& this.graph.outdegree(node) == 0) {
System.err.println("Found empty snapshot: " + node);
this.emptySnapshot = node;
}
return node.equals(this.emptySnapshot);
}
private ArrayList compute(ImmutableGraph graph) {
final long n = graph.numNodes();
ArrayList bad = new ArrayList<>();
for (long i = 0; i < n; i++) {
Node.Type nt = this.graph.getNodeType(i);
if (nt != Node.Type.ORI) continue;
final LazyLongIterator iterator = graph.successors(i);
long succ;
boolean found = false;
while ((succ = iterator.nextLong()) != -1) {
if (this.graph.outdegree(succ) > 0) {
found = true;
}
}
if (!found)
bad.add(i);
}
return bad;
}
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
ListEmptyOrigins leo = new ListEmptyOrigins();
try {
leo.load_graph(graphPath);
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
- ArrayList badlist = leo.compute(leo.graph.getBVGraph(false));
+ ArrayList badlist = leo.compute(leo.graph);
for (Long bad : badlist) {
System.out.println(bad);
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
index abcec76..df93c96 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
@@ -1,67 +1,67 @@
package org.softwareheritage.graph.benchmark.utils;
import java.util.PrimitiveIterator;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
/**
* Random related utility class.
*
* @author The Software Heritage developers
*/
public class Random {
/** Internal pseudorandom generator */
java.util.Random random;
/**
* Constructor.
*/
public Random() {
this.random = new java.util.Random();
}
/**
* Constructor.
*
* @param seed random generator seed
*/
public Random(long seed) {
this.random = new java.util.Random(seed);
}
/**
* Generates random node ids.
*
* @param graph graph used to pick node ids
* @param nbNodes number of node ids to generate
* @return an array of random node ids
*/
public long[] generateNodeIds(Graph graph, int nbNodes) {
- return random.longs(nbNodes, 0, graph.getNbNodes()).toArray();
+ return random.longs(nbNodes, 0, graph.numNodes()).toArray();
}
/**
* Generates random node ids with a specific type.
*
* @param graph graph used to pick node ids
* @param nbNodes number of node ids to generate
* @param expectedType specific node type to pick
* @return an array of random node ids
*/
public long[] generateNodeIdsOfType(Graph graph, int nbNodes, Node.Type expectedType) {
- PrimitiveIterator.OfLong nodes = random.longs(0, graph.getNbNodes()).iterator();
+ PrimitiveIterator.OfLong nodes = random.longs(0, graph.numNodes()).iterator();
long[] nodeIds = new long[nbNodes];
long nextId;
for (int i = 0; i < nbNodes; i++) {
do {
nextId = nodes.nextLong();
} while (graph.getNodeType(nextId) != expectedType);
nodeIds[i] = nextId;
}
return nodeIds;
}
}
diff --git a/swh/graph/graph.py b/swh/graph/graph.py
index fda4c68..fcc71ba 100644
--- a/swh/graph/graph.py
+++ b/swh/graph/graph.py
@@ -1,185 +1,185 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import asyncio
import contextlib
import functools
from swh.graph.backend import Backend
from swh.graph.dot import dot_to_svg, graph_dot, KIND_TO_SHAPE
BASE_URL = "https://archive.softwareheritage.org/browse"
KIND_TO_URL_FRAGMENT = {
"ori": "/origin/{}",
"snp": "/snapshot/{}",
"rel": "/release/{}",
"rev": "/revision/{}",
"dir": "/directory/{}",
"cnt": "/content/sha1_git:{}/",
}
def call_async_gen(generator, *args, **kwargs):
loop = asyncio.get_event_loop()
it = generator(*args, **kwargs).__aiter__()
while True:
try:
res = loop.run_until_complete(it.__anext__())
yield res
except StopAsyncIteration:
break
class Neighbors:
"""Neighbor iterator with custom O(1) length method"""
def __init__(self, graph, iterator, length_func):
self.graph = graph
self.iterator = iterator
self.length_func = length_func
def __iter__(self):
return self
def __next__(self):
succ = self.iterator.nextLong()
if succ == -1:
raise StopIteration
return GraphNode(self.graph, succ)
def __len__(self):
return self.length_func()
class GraphNode:
"""Node in the SWH graph"""
def __init__(self, graph, node_id):
self.graph = graph
self.id = node_id
def children(self):
return Neighbors(
self.graph,
self.graph.java_graph.successors(self.id),
lambda: self.graph.java_graph.outdegree(self.id),
)
def parents(self):
return Neighbors(
self.graph,
self.graph.java_graph.predecessors(self.id),
lambda: self.graph.java_graph.indegree(self.id),
)
def simple_traversal(self, ttype, direction="forward", edges="*"):
for node in call_async_gen(
self.graph.backend.simple_traversal, ttype, direction, edges, self.id
):
yield self.graph[node]
def leaves(self, *args, **kwargs):
yield from self.simple_traversal("leaves", *args, **kwargs)
def visit_nodes(self, *args, **kwargs):
yield from self.simple_traversal("visit_nodes", *args, **kwargs)
def visit_edges(self, direction="forward", edges="*"):
for src, dst in call_async_gen(
self.graph.backend.visit_edges, direction, edges, self.id
):
yield (self.graph[src], self.graph[dst])
def visit_paths(self, direction="forward", edges="*"):
for path in call_async_gen(
self.graph.backend.visit_paths, direction, edges, self.id
):
yield [self.graph[node] for node in path]
def walk(self, dst, direction="forward", edges="*", traversal="dfs"):
for node in call_async_gen(
self.graph.backend.walk, direction, edges, traversal, self.id, dst
):
yield self.graph[node]
def _count(self, ttype, direction="forward", edges="*"):
return self.graph.backend.count(ttype, direction, edges, self.id)
count_leaves = functools.partialmethod(_count, ttype="leaves")
count_neighbors = functools.partialmethod(_count, ttype="neighbors")
count_visit_nodes = functools.partialmethod(_count, ttype="visit_nodes")
@property
def pid(self):
return self.graph.node2pid[self.id]
@property
def kind(self):
return self.pid.split(":")[2]
def __str__(self):
return self.pid
def __repr__(self):
return "<{}>".format(self.pid)
def dot_fragment(self):
swh, version, kind, hash = self.pid.split(":")
label = "{}:{}..{}".format(kind, hash[0:2], hash[-2:])
url = BASE_URL + KIND_TO_URL_FRAGMENT[kind].format(hash)
shape = KIND_TO_SHAPE[kind]
return '{} [label="{}", href="{}", target="_blank", shape="{}"];'.format(
self.id, label, url, shape
)
def _repr_svg_(self):
nodes = [self, *list(self.children()), *list(self.parents())]
dot = graph_dot(nodes)
svg = dot_to_svg(dot)
return svg
class Graph:
def __init__(self, backend, node2pid, pid2node):
self.backend = backend
self.java_graph = backend.entry.get_graph()
self.node2pid = node2pid
self.pid2node = pid2node
def stats(self):
return self.backend.stats()
@property
def path(self):
return self.java_graph.getPath()
def __len__(self):
- return self.java_graph.getNbNodes()
+ return self.java_graph.numNodes()
def __getitem__(self, node_id):
if isinstance(node_id, int):
self.node2pid[node_id] # check existence
return GraphNode(self, node_id)
elif isinstance(node_id, str):
node_id = self.pid2node[node_id]
return GraphNode(self, node_id)
def __iter__(self):
for pid, pos in self.backend.pid2node:
yield self[pid]
def iter_prefix(self, prefix):
for pid, pos in self.backend.pid2node.iter_prefix(prefix):
yield self[pid]
def iter_type(self, pid_type):
for pid, pos in self.backend.pid2node.iter_type(pid_type):
yield self[pid]
@contextlib.contextmanager
def load(graph_path):
with Backend(graph_path) as backend:
yield Graph(backend, backend.node2pid, backend.pid2node)