Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345912
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
77 KB
Subscribers
None
View Options
diff --git a/java/src/main/java/org/softwareheritage/graph/Graph.java b/java/src/main/java/org/softwareheritage/graph/Graph.java
index f8fe5d4..db9a26f 100644
--- a/java/src/main/java/org/softwareheritage/graph/Graph.java
+++ b/java/src/main/java/org/softwareheritage/graph/Graph.java
@@ -1,227 +1,218 @@
package org.softwareheritage.graph;
import java.io.IOException;
import it.unimi.dsi.big.webgraph.BVGraph;
+import it.unimi.dsi.big.webgraph.ImmutableGraph;
+import it.unimi.dsi.big.webgraph.Transform;
import it.unimi.dsi.lang.FlyweightPrototype;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.backend.NodeIdMap;
import org.softwareheritage.graph.backend.NodeTypesMap;
/**
* Main class storing the compressed graph and node id mappings.
* <p>
* The compressed graph is stored using the <a href="http://webgraph.di.unimi.it/">WebGraph</a>
* ecosystem. Additional mappings are necessary because Software Heritage uses string based <a
* href="https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html">persistent
* identifiers</a> (PID) while WebGraph uses integers internally. These two mappings (long id ↔
* PID) are used for the input (users refer to the graph using PID) and the output (convert back to
* PID for users results). However, since graph traversal can be restricted depending on the node
* type (see {@link AllowedEdges}), a long id → node type map is stored as well to avoid a full
* PID lookup.
*
* @author The Software Heritage developers
* @see org.softwareheritage.graph.AllowedEdges
* @see org.softwareheritage.graph.backend.NodeIdMap
* @see org.softwareheritage.graph.backend.NodeTypesMap
*/
-public class Graph implements FlyweightPrototype<Graph> {
+public class Graph extends ImmutableGraph {
/** File extension for the SWH PID to long node id map */
public static final String PID_TO_NODE = ".pid2node.bin";
/** File extension for the long node id to SWH PID map */
public static final String NODE_TO_PID = ".node2pid.bin";
/** File extension for the long node id to node type map */
public static final String NODE_TO_TYPE = ".node2type.map";
/** Compressed graph stored as a {@link it.unimi.dsi.big.webgraph.BVGraph} */
- BVGraph graph;
+ ImmutableGraph graph;
/** Transposed compressed graph (used for backward traversals) */
- BVGraph graphTransposed;
+ ImmutableGraph graphTransposed;
/** Path and basename of the compressed graph */
String path;
/** Mapping long id ↔ SWH PIDs */
NodeIdMap nodeIdMap;
/** Mapping long id → node types */
NodeTypesMap nodeTypesMap;
/**
* Constructor.
*
* @param path path and basename of the compressed graph to load
*/
public Graph(String path) throws IOException {
this.graph = BVGraph.loadMapped(path);
this.graphTransposed = BVGraph.loadMapped(path + "-transposed");
this.path = path;
this.nodeTypesMap = new NodeTypesMap(path);
// TODO: we don't need to load the nodeIdMap now that all the
// translation between ints and PIDs is happening on the Python side.
// However, some parts of the code still depend on this, so it's
// commented out while we decide on what to do with it.
- this.nodeIdMap = null; // new NodeIdMap(path, getNbNodes());
+ this.nodeIdMap = null; // new NodeIdMap(path, numNodes());
}
// Protected empty constructor to implement copy()
protected Graph() { }
+ protected Graph(ImmutableGraph graph, ImmutableGraph graphTransposed, String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
+ this.graph = graph;
+ this.graphTransposed = graphTransposed;
+ this.path = path;
+ this.nodeIdMap = nodeIdMap;
+ this.nodeTypesMap = nodeTypesMap;
+ }
+
/**
* Return a flyweight copy of the graph.
*/
@Override
public Graph copy() {
- final Graph ng = new Graph();
- ng.graph = this.graph.copy();
- ng.graphTransposed = this.graphTransposed.copy();
- ng.path = path;
- ng.nodeIdMap = this.nodeIdMap;
- ng.nodeTypesMap = this.nodeTypesMap;
- return ng;
+ return new Graph(this.graph.copy(), this.graphTransposed.copy(), this.path, this.nodeIdMap, this.nodeTypesMap);
+ }
+
+ public Graph transpose() {
+ return new Graph(this.graphTransposed.copy(), this.graph.copy(), this.path, this.nodeIdMap, this.nodeTypesMap);
+ }
+
+ public Graph symmetrize() {
+ ImmutableGraph symmetric = Transform.union(graph, graphTransposed);
+ return new Graph(symmetric, symmetric, this.path, this.nodeIdMap, this.nodeTypesMap);
}
/**
* Cleans up graph resources after use.
*/
public void cleanUp() throws IOException {
nodeIdMap.close();
}
/**
* Returns the graph full path.
*
* @return graph full path
*/
public String getPath() {
return path;
}
/**
* Converts {@link SwhPID} node to long.
*
* @param swhPID node specified as a {@link SwhPID}
* @return internal long node id
* @see org.softwareheritage.graph.SwhPID
*/
public long getNodeId(SwhPID swhPID) {
return nodeIdMap.getNodeId(swhPID);
}
/**
* Converts long id node to {@link SwhPID}.
*
* @param nodeId node specified as a long id
* @return external SWH PID
* @see org.softwareheritage.graph.SwhPID
*/
public SwhPID getSwhPID(long nodeId) {
return nodeIdMap.getSwhPID(nodeId);
}
/**
* Returns node type.
*
* @param nodeId node specified as a long id
* @return corresponding node type
* @see org.softwareheritage.graph.Node.Type
*/
public Node.Type getNodeType(long nodeId) {
return nodeTypesMap.getType(nodeId);
}
+ public boolean randomAccess() { return graph.randomAccess() && graphTransposed.randomAccess(); }
+
/**
* Returns number of nodes in the graph.
*
* @return number of nodes in the graph
*/
- public long getNbNodes() {
+ public long numNodes() {
return graph.numNodes();
}
/**
* Returns number of edges in the graph.
*
* @return number of edges in the graph
*/
- public long getNbEdges() {
+ public long numArcs() {
return graph.numArcs();
}
/**
* Returns lazy iterator of successors of a node.
*
* @param nodeId node specified as a long id
* @return lazy iterator of successors of the node, specified as a <a
* href="http://webgraph.di.unimi.it/">WebGraph</a> LazyLongIterator
*/
public LazyLongIterator successors(long nodeId) {
return graph.successors(nodeId);
}
/**
* Returns the outdegree of a node.
*
* @param nodeId node specified as a long id
* @return outdegree of a node
*/
public long outdegree(long nodeId) {
return graph.outdegree(nodeId);
}
/**
* Returns lazy iterator of predecessors of a node.
*
* @param nodeId node specified as a long id
* @return lazy iterator of predecessors of the node, specified as a <a
* href="http://webgraph.di.unimi.it/">WebGraph</a> LazyLongIterator
*/
public LazyLongIterator predecessors(long nodeId) {
return graphTransposed.successors(nodeId);
}
/**
* Returns the indegree of a node.
*
* @param nodeId node specified as a long id
* @return indegree of a node
*/
public long indegree(long nodeId) {
return graphTransposed.outdegree(nodeId);
}
- /**
- * Returns the degree of a node, depending on graph orientation.
- *
- * @param nodeId node specified as a long id
- * @param useTransposed boolean value to use transposed graph
- * @return degree of a node
- */
- public long degree(long nodeId, boolean useTransposed) {
- return (useTransposed) ? indegree(nodeId) : outdegree(nodeId);
- }
-
- /**
- * Returns the neighbors of a node (as a lazy iterator), depending on graph orientation.
- *
- * @param nodeId node specified as a long id
- * @param useTransposed boolean value to use transposed graph
- * @return lazy iterator of neighbors of the node, specified as a <a
- * href="http://webgraph.di.unimi.it/">WebGraph</a> LazyLongIterator
- */
- public LazyLongIterator neighbors(long nodeId, boolean useTransposed) {
- return (useTransposed) ? predecessors(nodeId) : successors(nodeId);
- }
-
/**
* Returns the underlying BVGraph.
*
- * @param useTransposed boolean value to use transposed graph
* @return WebGraph BVGraph
*/
- public BVGraph getBVGraph(boolean useTransposed) {
- return (useTransposed) ? this.graphTransposed : this.graph;
+ public ImmutableGraph getGraph() {
+ return this.graph;
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/Neighbors.java b/java/src/main/java/org/softwareheritage/graph/Neighbors.java
index 1b13c88..d7514a1 100644
--- a/java/src/main/java/org/softwareheritage/graph/Neighbors.java
+++ b/java/src/main/java/org/softwareheritage/graph/Neighbors.java
@@ -1,84 +1,80 @@
package org.softwareheritage.graph;
import java.util.Iterator;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.AllowedEdges;
import org.softwareheritage.graph.Graph;
/**
* Iterator class to go over a node neighbors in the graph.
* <p>
* Wrapper iterator class to easily deal with {@link AllowedEdges} during traversals.
*
* @author The Software Heritage developers
* @see org.softwareheritage.graph.AllowedEdges
*/
public class Neighbors implements Iterable<Long> {
/** Graph used to explore neighbors */
Graph graph;
- /** Boolean to specify the use of the transposed graph */
- boolean useTransposed;
/** Graph edge restriction */
AllowedEdges edges;
/** Source node from which neighbors will be listed */
long srcNodeId;
/**
* Constructor.
*
* @param graph graph used to explore neighbors
- * @param useTransposed boolean value to use transposed graph
* @param edges edges allowed to be used in the graph
* @param srcNodeId source node from where to list neighbors
*/
- public Neighbors(Graph graph, boolean useTransposed, AllowedEdges edges, long srcNodeId) {
+ public Neighbors(Graph graph, AllowedEdges edges, long srcNodeId) {
this.graph = graph;
- this.useTransposed = useTransposed;
this.edges = edges;
this.srcNodeId = srcNodeId;
}
@Override
public Iterator<Long> iterator() {
return new NeighborsIterator();
}
/**
* Inner class for {@link Neighbors} iterator.
*
* @author The Software Heritage developers
*/
public class NeighborsIterator implements Iterator<Long> {
LazyLongIterator neighbors;
long nextNeighborId;
public NeighborsIterator() {
- this.neighbors = graph.neighbors(srcNodeId, useTransposed);
+ this.neighbors = graph.successors(srcNodeId);
this.nextNeighborId = -1;
}
public boolean hasNext() {
// Case 1: no edge restriction, bypass type checks and skip to next neighbor
if (edges.restrictedTo == null) {
nextNeighborId = neighbors.nextLong();
return (nextNeighborId != -1);
}
// Case 2: edge restriction, look ahead for next neighbor
while ((nextNeighborId = neighbors.nextLong()) != -1) {
if (edges.isAllowed(srcNodeId, nextNeighborId)) {
return true;
}
}
return false;
}
public Long next() {
return nextNeighborId;
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java b/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java
deleted file mode 100644
index cc4fd85..0000000
--- a/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java
+++ /dev/null
@@ -1,14 +0,0 @@
-package org.softwareheritage.graph.algo;
-
-import java.util.function.BiConsumer;
-
-public interface NodeIdsConsumer extends BiConsumer {
-
- /** Callback for returning a (potentially large) list of node identifiers.
- * The callback will be invoked repeatedly without reallocating the array.
- * At each invocation the array might contain more than size node
- * identifiers, but only identifiers located up to position size-1 are to
- * be considered during that specific invocation.
- */
- void accept(long nodeIds[], int size);
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java b/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java
index 0420a41..0c8a400 100644
--- a/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java
+++ b/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java
@@ -1,494 +1,495 @@
package org.softwareheritage.graph.algo;
import java.util.*;
-import it.unimi.dsi.bits.LongArrayBitVector;
-
import org.softwareheritage.graph.AllowedEdges;
import org.softwareheritage.graph.Endpoint;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Neighbors;
import org.softwareheritage.graph.Node;
/**
* Traversal algorithms on the compressed graph.
* <p>
* Internal implementation of the traversal API endpoints. These methods only input/output internal
* long ids, which are converted in the {@link Endpoint} higher-level class to Software Heritage
* PID.
*
* @author The Software Heritage developers
* @see org.softwareheritage.graph.Endpoint
*/
public class Traversal {
/** Graph used in the traversal */
Graph graph;
- /** Boolean to specify the use of the transposed graph */
- boolean useTransposed;
/** Graph edge restriction */
AllowedEdges edges;
/** Hash set storing if we have visited a node */
HashSet<Long> visited;
/** Hash map storing parent node id for each nodes during a traversal */
Map<Long, Long> parentNode;
/** Number of edges accessed during traversal */
long nbEdgesAccessed;
/** random number generator, for random walks */
Random rng;
/**
* Constructor.
*
* @param graph graph used in the traversal
* @param direction a string (either "forward" or "backward") specifying edge orientation
* @param edgesFmt a formatted string describing <a
* href="https://docs.softwareheritage.org/devel/swh-graph/api.html#terminology">allowed edges</a>
*/
public Traversal(Graph graph, String direction, String edgesFmt) {
if (!direction.matches("forward|backward")) {
throw new IllegalArgumentException("Unknown traversal direction: " + direction);
}
- this.graph = graph;
- this.useTransposed = (direction.equals("backward"));
+ if (direction.equals("backward")) {
+ this.graph = graph.transpose();
+ } else {
+ this.graph = graph;
+ }
this.edges = new AllowedEdges(graph, edgesFmt);
- long nbNodes = graph.getNbNodes();
this.visited = new HashSet<>();
this.parentNode = new HashMap<>();
this.nbEdgesAccessed = 0;
this.rng = new Random();
}
/**
* Returns number of accessed edges during traversal.
*
* @return number of edges accessed in last traversal
*/
public long getNbEdgesAccessed() {
return nbEdgesAccessed;
}
/**
* Returns number of accessed nodes during traversal.
*
* @return number of nodes accessed in last traversal
*/
public long getNbNodesAccessed() {
return this.visited.size();
}
/**
- * Push version of {@link leaves}: will fire passed callback for each leaf.
+ * Push version of {@link #leaves} will fire passed callback for each leaf.
*/
public void leavesVisitor(long srcNodeId, NodeIdConsumer cb) {
- Stack<Long> stack = new Stack<Long>();
+ Stack<Long> stack = new Stack<>();
this.nbEdgesAccessed = 0;
stack.push(srcNodeId);
visited.add(srcNodeId);
while (!stack.isEmpty()) {
long currentNodeId = stack.pop();
long neighborsCnt = 0;
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
neighborsCnt++;
if (!visited.contains(neighborNodeId)) {
stack.push(neighborNodeId);
visited.add(neighborNodeId);
}
}
if (neighborsCnt == 0) {
cb.accept(currentNodeId);
}
}
}
/**
* Returns the leaves of a subgraph rooted at the specified source node.
*
* @param srcNodeId source node
* @return list of node ids corresponding to the leaves
*/
public ArrayList<Long> leaves(long srcNodeId) {
- ArrayList<Long> nodeIds = new ArrayList<Long>();
- leavesVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId));
+ ArrayList<Long> nodeIds = new ArrayList<>();
+ leavesVisitor(srcNodeId, nodeIds::add);
return nodeIds;
}
/**
- * Push version of {@link neighbors}: will fire passed callback on each
+ * Push version of {@link #neighbors}: will fire passed callback on each
* neighbor.
*/
public void neighborsVisitor(long srcNodeId, NodeIdConsumer cb) {
- this.nbEdgesAccessed = graph.degree(srcNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, srcNodeId)) {
+ this.nbEdgesAccessed = graph.outdegree(srcNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, srcNodeId)) {
cb.accept(neighborNodeId);
}
}
/**
* Returns node direct neighbors (linked with exactly one edge).
*
* @param srcNodeId source node
* @return list of node ids corresponding to the neighbors
*/
public ArrayList<Long> neighbors(long srcNodeId) {
- ArrayList<Long> nodeIds = new ArrayList<Long>();
- neighborsVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId));
+ ArrayList<Long> nodeIds = new ArrayList<>();
+ neighborsVisitor(srcNodeId, nodeIds::add);
return nodeIds;
}
/**
- * Push version of {@link visitNodes}: will fire passed callback on each
+ * Push version of {@link #visitNodes}: will fire passed callback on each
* visited node.
*/
public void visitNodesVisitor(long srcNodeId, NodeIdConsumer nodeCb, EdgeIdConsumer edgeCb) {
- Stack<Long> stack = new Stack<Long>();
+ Stack<Long> stack = new Stack<>();
this.nbEdgesAccessed = 0;
stack.push(srcNodeId);
visited.add(srcNodeId);
while (!stack.isEmpty()) {
long currentNodeId = stack.pop();
if (nodeCb != null) {
nodeCb.accept(currentNodeId);
}
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
if (edgeCb != null) {
edgeCb.accept(currentNodeId, neighborNodeId);
}
if (!visited.contains(neighborNodeId)) {
stack.push(neighborNodeId);
visited.add(neighborNodeId);
}
}
}
}
/** One-argument version to handle callbacks properly */
public void visitNodesVisitor(long srcNodeId, NodeIdConsumer cb) {
visitNodesVisitor(srcNodeId, cb, null);
}
/**
* Performs a graph traversal and returns explored nodes.
*
* @param srcNodeId source node
* @return list of explored node ids
*/
public ArrayList<Long> visitNodes(long srcNodeId) {
- ArrayList<Long> nodeIds = new ArrayList<Long>();
- visitNodesVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId));
+ ArrayList<Long> nodeIds = new ArrayList<>();
+ visitNodesVisitor(srcNodeId, nodeIds::add);
return nodeIds;
}
/**
- * Push version of {@link visitPaths}: will fire passed callback on each
+ * Push version of {@link #visitPaths}: will fire passed callback on each
* discovered (complete) path.
*/
public void visitPathsVisitor(long srcNodeId, PathConsumer cb) {
- Stack<Long> currentPath = new Stack<Long>();
+ Stack<Long> currentPath = new Stack<>();
this.nbEdgesAccessed = 0;
visitPathsInternalVisitor(srcNodeId, currentPath, cb);
}
/**
* Performs a graph traversal and returns explored paths.
*
* @param srcNodeId source node
* @return list of explored paths (represented as a list of node ids)
*/
public ArrayList<ArrayList<Long>> visitPaths(long srcNodeId) {
ArrayList<ArrayList<Long>> paths = new ArrayList<>();
- visitPathsVisitor(srcNodeId, (path) -> paths.add(path));
+ visitPathsVisitor(srcNodeId, paths::add);
return paths;
}
private void visitPathsInternalVisitor(long currentNodeId,
Stack<Long> currentPath,
PathConsumer cb) {
currentPath.push(currentNodeId);
long visitedNeighbors = 0;
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
visitPathsInternalVisitor(neighborNodeId, currentPath, cb);
visitedNeighbors++;
}
if (visitedNeighbors == 0) {
- ArrayList<Long> path = new ArrayList<Long>();
- for (long nodeId : currentPath) {
- path.add(nodeId);
- }
+ ArrayList<Long> path = new ArrayList<>(currentPath);
cb.accept(path);
}
currentPath.pop();
}
/**
* Performs a graph traversal with backtracking, and returns the first
* found path from source to destination.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @return found path as a list of node ids
*/
public <T> ArrayList<Long> walk(long srcNodeId, T dst, String visitOrder) {
- long dstNodeId = -1;
+ long dstNodeId;
if (visitOrder.equals("dfs")) {
dstNodeId = walkInternalDFS(srcNodeId, dst);
} else if (visitOrder.equals("bfs")) {
dstNodeId = walkInternalBFS(srcNodeId, dst);
} else {
throw new IllegalArgumentException("Unknown visit order: " + visitOrder);
}
if (dstNodeId == -1) {
throw new IllegalArgumentException("Cannot find destination: " + dst);
}
- ArrayList<Long> nodeIds = backtracking(srcNodeId, dstNodeId);
- return nodeIds;
+ return backtracking(srcNodeId, dstNodeId);
}
/**
* Performs a random walk (picking a random successor at each step) from
* source to destination.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @return found path as a list of node ids or an empty path to indicate
* that no suitable path have been found
*/
public <T> ArrayList<Long> randomWalk(long srcNodeId, T dst) {
return randomWalk(srcNodeId, dst, 0);
}
/**
* Performs a stubborn random walk (picking a random successor at each
* step) from source to destination. The walk is "stubborn" in the sense
* that it will not give up the first time if a satisfying target node is
* found, but it will retry up to a limited amount of times.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @param retries number of times to retry; 0 means no retries (single walk)
* @return found path as a list of node ids or an empty path to indicate
* that no suitable path have been found
*/
public <T> ArrayList<Long> randomWalk(long srcNodeId, T dst, int retries) {
long curNodeId = srcNodeId;
- ArrayList<Long> path = new ArrayList<Long>();
+ ArrayList<Long> path = new ArrayList<>();
this.nbEdgesAccessed = 0;
boolean found;
if (retries < 0) {
throw new IllegalArgumentException("Negative number of retries given: " + retries);
}
while (true) {
path.add(curNodeId);
- Neighbors neighbors = new Neighbors(graph, useTransposed, edges, curNodeId);
+ Neighbors neighbors = new Neighbors(graph, edges, curNodeId);
curNodeId = randomPick(neighbors.iterator());
if (curNodeId < 0) {
found = false;
break;
}
if (isDstNode(curNodeId, dst)) {
path.add(curNodeId);
found = true;
break;
}
}
if (found) {
return path;
} else if (retries > 0) { // try again
return randomWalk(srcNodeId, dst, retries - 1);
} else { // not found and no retries left
path.clear();
return path;
}
}
/**
* Randomly choose an element from an iterator over Longs using reservoir
* sampling
*
* @param elements iterator over selection domain
* @return randomly chosen element or -1 if no suitable element was found
*/
private long randomPick(Iterator<Long> elements) {
long curPick = -1;
long seenCandidates = 0;
while (elements.hasNext()) {
seenCandidates++;
if (Math.round(rng.nextFloat() * (seenCandidates - 1)) == 0) {
curPick = elements.next();
}
}
return curPick;
}
/**
* Internal DFS function of {@link #walk}.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @return final destination node or -1 if no path found
*/
private <T> long walkInternalDFS(long srcNodeId, T dst) {
- Stack<Long> stack = new Stack<Long>();
+ Stack<Long> stack = new Stack<>();
this.nbEdgesAccessed = 0;
stack.push(srcNodeId);
visited.add(srcNodeId);
while (!stack.isEmpty()) {
long currentNodeId = stack.pop();
if (isDstNode(currentNodeId, dst)) {
return currentNodeId;
}
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
if (!visited.contains(neighborNodeId)) {
stack.push(neighborNodeId);
visited.add(neighborNodeId);
parentNode.put(neighborNodeId, currentNodeId);
}
}
}
return -1;
}
/**
* Internal BFS function of {@link #walk}.
*
* @param srcNodeId source node
* @param dst destination (either a node or a node type)
* @return final destination node or -1 if no path found
*/
private <T> long walkInternalBFS(long srcNodeId, T dst) {
- Queue<Long> queue = new LinkedList<Long>();
+ Queue<Long> queue = new LinkedList<>();
this.nbEdgesAccessed = 0;
queue.add(srcNodeId);
visited.add(srcNodeId);
while (!queue.isEmpty()) {
long currentNodeId = queue.poll();
if (isDstNode(currentNodeId, dst)) {
return currentNodeId;
}
- nbEdgesAccessed += graph.degree(currentNodeId, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) {
+ nbEdgesAccessed += graph.outdegree(currentNodeId);
+ for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) {
if (!visited.contains(neighborNodeId)) {
queue.add(neighborNodeId);
visited.add(neighborNodeId);
parentNode.put(neighborNodeId, currentNodeId);
}
}
}
return -1;
}
/**
* Internal function of {@link #walk} to check if a node corresponds to the destination.
*
* @param nodeId current node
* @param dst destination (either a node or a node type)
* @return true if the node is a destination, or false otherwise
*/
private <T> boolean isDstNode(long nodeId, T dst) {
if (dst instanceof Long) {
long dstNodeId = (Long) dst;
return nodeId == dstNodeId;
} else if (dst instanceof Node.Type) {
Node.Type dstType = (Node.Type) dst;
return graph.getNodeType(nodeId) == dstType;
} else {
return false;
}
}
/**
* Internal backtracking function of {@link #walk}.
*
* @param srcNodeId source node
* @param dstNodeId destination node
* @return the found path, as a list of node ids
*/
private ArrayList<Long> backtracking(long srcNodeId, long dstNodeId) {
- ArrayList<Long> path = new ArrayList<Long>();
+ ArrayList<Long> path = new ArrayList<>();
long currentNodeId = dstNodeId;
while (currentNodeId != srcNodeId) {
path.add(currentNodeId);
currentNodeId = parentNode.get(currentNodeId);
}
path.add(srcNodeId);
Collections.reverse(path);
return path;
}
+ /**
+ * Find a common descendant between two given nodes using two parallel BFS
+ *
+ * @param lhsNode the first node
+ * @param rhsNode the second node
+ * @return the found path, as a list of node ids
+ */
public Long findCommonDescendant(long lhsNode, long rhsNode) {
Queue<Long> lhsStack = new ArrayDeque<>();
Queue<Long> rhsStack = new ArrayDeque<>();
HashSet<Long> lhsVisited = new HashSet<>();
HashSet<Long> rhsVisited = new HashSet<>();
lhsStack.add(lhsNode);
rhsStack.add(rhsNode);
lhsVisited.add(lhsNode);
rhsVisited.add(rhsNode);
this.nbEdgesAccessed = 0;
Long curNode;
while (!lhsStack.isEmpty() || !rhsStack.isEmpty()) {
if (!lhsStack.isEmpty()) {
curNode = lhsStack.poll();
- nbEdgesAccessed += graph.degree(curNode, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, curNode)) {
+ nbEdgesAccessed += graph.outdegree(curNode);
+ for (long neighborNodeId : new Neighbors(graph, edges, curNode)) {
if (!lhsVisited.contains(neighborNodeId)) {
if (rhsVisited.contains(neighborNodeId))
return neighborNodeId;
lhsStack.add(neighborNodeId);
lhsVisited.add(neighborNodeId);
}
}
}
if (!rhsStack.isEmpty()) {
curNode = rhsStack.poll();
- nbEdgesAccessed += graph.degree(curNode, useTransposed);
- for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, curNode)) {
+ nbEdgesAccessed += graph.outdegree(curNode);
+ for (long neighborNodeId : new Neighbors(graph, edges, curNode)) {
if (!rhsVisited.contains(neighborNodeId)) {
if (lhsVisited.contains(neighborNodeId))
return neighborNodeId;
rhsStack.add(neighborNodeId);
rhsVisited.add(neighborNodeId);
}
}
}
}
return null;
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java b/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java
index c080a93..1f57665 100644
--- a/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java
@@ -1,216 +1,217 @@
package org.softwareheritage.graph.backend;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;
import java.util.concurrent.*;
import it.unimi.dsi.bits.LongArrayBitVector;
+import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.ObjectBigArrays;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.backend.NodeTypesMap;
/**
* Create maps needed at runtime by the graph service, in particular:
*
* - SWH PID → WebGraph long node id
* - WebGraph long node id → SWH PID (converse of the former)
* - WebGraph long node id → SWH node type (enum)
*
* @author The Software Heritage developers
*/
public class MapBuilder {
final static String SORT_BUFFER_SIZE = "40%";
final static Logger logger = LoggerFactory.getLogger(MapBuilder.class);
/**
* Main entrypoint.
*
* @param args command line arguments
*/
public static void main(String[] args) throws IOException {
if (args.length != 2) {
logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV");
System.exit(1);
}
String graphPath = args[0];
String tmpDir = args[1];
logger.info("starting maps generation...");
precomputeNodeIdMap(graphPath, tmpDir);
logger.info("maps generation completed");
}
/**
* Computes and dumps on disk mapping files.
*
* @param graphPath path of the compressed graph
*/
// Suppress warning for Object2LongFunction cast
@SuppressWarnings("unchecked")
static void precomputeNodeIdMap(String graphPath, String tmpDir)
throws IOException
{
ProgressLogger plPid2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
ProgressLogger plNode2Pid = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
plPid2Node.itemsName = "pid→node";
plNode2Pid.itemsName = "node→pid";
// avg speed for pid→node is sometime skewed due to write to the sort
// pipe hanging when sort is sorting; hence also desplay local speed
plPid2Node.displayLocalSpeed = true;
// first half of PID->node mapping: PID -> WebGraph MPH (long)
Object2LongFunction<String> mphMap = null;
try {
logger.info("loading MPH function...");
mphMap = (Object2LongFunction<String>) BinIO.loadObject(graphPath + ".mph");
logger.info("MPH function loaded");
} catch (ClassNotFoundException e) {
logger.error("unknown class object in .mph file: " + e);
System.exit(2);
}
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
plPid2Node.expectedUpdates = nbIds;
plNode2Pid.expectedUpdates = nbIds;
// second half of PID->node mapping: WebGraph MPH (long) -> BFS order (long)
long[][] bfsMap = LongBigArrays.newBigArray(nbIds);
logger.info("loading BFS order file...");
long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap);
logger.info("BFS order file loaded");
if (loaded != nbIds) {
logger.error("graph contains " + nbIds + " nodes, but read " + loaded);
System.exit(2);
}
// Create mapping SWH PID -> WebGraph node id, by sequentially reading
// nodes, hashing them with MPH, and permuting according to BFS order
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in,
StandardCharsets.US_ASCII));
LineIterator swhPIDIterator = new LineIterator(buffer);
// The WebGraph node id -> SWH PID mapping can be obtained from the
// PID->node one by numerically sorting on node id and sequentially
// writing obtained PIDs to a binary map. Delegates the sorting job to
// /usr/bin/sort via pipes
ProcessBuilder processBuilder = new ProcessBuilder();
processBuilder.command("sort", "--numeric-sort", "--key", "2",
"--buffer-size", SORT_BUFFER_SIZE,
"--temporary-directory", tmpDir);
Process sort = processBuilder.start();
BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream());
BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream());
// for the binary format of pidToNodeMap, see Python module swh.graph.pid:PidToIntMap
// for the binary format of nodeToPidMap, see Python module swh.graph.pid:IntToPidMap
try (DataOutputStream pidToNodeMap = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(graphPath + Graph.PID_TO_NODE)));
BufferedOutputStream nodeToPidMap = new BufferedOutputStream(new FileOutputStream(graphPath + Graph.NODE_TO_PID))) {
// background handler for sort output, it will be fed PID/node
// pairs while pidToNodeMap is being filled, and will itself fill
// nodeToPidMap as soon as data from sort is ready
SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToPidMap, plNode2Pid);
outputHandler.start();
// Type map from WebGraph node ID to SWH type. Used at runtime by
// pure Java graph traversals to efficiently check edge
// restrictions.
final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length)
/ Math.log(2));
final int nbBitsPerNodeType = log2NbTypes;
LongArrayBitVector nodeTypesBitVector =
LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds);
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType);
plPid2Node.start("filling pid2node map");
for (long iNode = 0; iNode < nbIds && swhPIDIterator.hasNext(); iNode++) {
String strSwhPID = swhPIDIterator.next().toString();
SwhPID swhPID = new SwhPID(strSwhPID);
byte[] swhPIDBin = swhPID.toBytes();
long mphId = mphMap.getLong(strSwhPID);
- long nodeId = LongBigArrays.get(bfsMap, mphId);
+ long nodeId = BigArrays.get(bfsMap, mphId);
pidToNodeMap.write(swhPIDBin, 0, swhPIDBin.length);
pidToNodeMap.writeLong(nodeId);
sort_stdin.write((strSwhPID + "\t" + nodeId + "\n")
.getBytes(StandardCharsets.US_ASCII));
nodeTypesMap.set(nodeId, swhPID.getType().ordinal());
plPid2Node.lightUpdate();
}
plPid2Node.done();
sort_stdin.close();
// write type map
logger.info("storing type map");
BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE);
logger.info("type map stored");
// wait for nodeToPidMap filling
try {
logger.info("waiting for node2pid map...");
int sortExitCode = sort.waitFor();
if (sortExitCode != 0) {
logger.error("sort returned non-zero exit code: " + sortExitCode);
System.exit(2);
}
outputHandler.join();
} catch (InterruptedException e) {
logger.error("processing of sort output failed with: " + e);
System.exit(2);
}
}
}
private static class SortOutputHandler extends Thread {
private Scanner input;
private OutputStream output;
private ProgressLogger pl;
SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) {
this.input = new Scanner(input, StandardCharsets.US_ASCII);
this.output = output;
this.pl = pl;
}
public void run() {
boolean sortDone = false;
logger.info("node2pid: waiting for sort output...");
while (input.hasNextLine()) {
if (! sortDone) {
sortDone = true;
this.pl.start("filling node2pid map");
}
String line = input.nextLine(); // format: SWH_PID <TAB> NODE_ID
SwhPID swhPID = new SwhPID(line.split("\\t")[0]); // get PID
try {
output.write((byte[]) swhPID.toBytes());
} catch (IOException e) {
logger.error("writing to node->PID map failed with: " + e);
}
this.pl.lightUpdate();
}
this.pl.done();
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/backend/Pp.java b/java/src/main/java/org/softwareheritage/graph/backend/Pp.java
index 3cd0eb7..af495a7 100644
--- a/java/src/main/java/org/softwareheritage/graph/backend/Pp.java
+++ b/java/src/main/java/org/softwareheritage/graph/backend/Pp.java
@@ -1,42 +1,23 @@
package org.softwareheritage.graph.backend;
-import java.io.BufferedWriter;
-import java.io.FileInputStream;
-import java.io.FileWriter;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Writer;
-import java.util.zip.GZIPInputStream;
-
-import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
-import it.unimi.dsi.fastutil.longs.LongBigArrays;
-import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
-import it.unimi.dsi.fastutil.objects.ObjectBigArrays;
-import it.unimi.dsi.io.FastBufferedReader;
-import it.unimi.dsi.io.LineIterator;
-
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
-import org.softwareheritage.graph.SwhPID;
-import org.softwareheritage.graph.backend.NodeTypesMap;
public class Pp {
-
+ @SuppressWarnings("unchecked")
public static void main(String[] args) throws IOException {
Object2LongFunction<String> mphMap = null;
try {
mphMap = (Object2LongFunction<String>) BinIO.loadObject("all.mph");
} catch (ClassNotFoundException e) {
throw new IllegalArgumentException("The .mph file contains unknown class object: " + e);
}
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
System.out.println("mph size: " + nbIds);
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
index 27334f9..5d5069b 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
@@ -1,130 +1,115 @@
package org.softwareheritage.graph.benchmark;
import com.google.common.primitives.Longs;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.io.ByteDiskQueue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.Graph;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.fastutil.Arrays;
import java.io.File;
import java.io.IOException;
public class BFS {
- private Graph graph;
-
- private void load_graph(String graphBasename) throws IOException {
- System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = new Graph(graphBasename);
- System.err.println("Graph loaded.");
- }
+ private final ImmutableGraph graph;
private final static Logger LOGGER = LoggerFactory.getLogger(BFS.class);
+ public BFS(ImmutableGraph graph) {
+ this.graph = graph;
+ }
+
// Partly inlined from it.unimi.dsi.law.big.graph.BFS
- private static void bfsperm(final ImmutableGraph graph) throws IOException {
+ private void bfsperm() throws IOException {
final long n = graph.numNodes();
// Allow enough memory to behave like in-memory queue
int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
// Use a disk based queue to store BFS frontier
final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue");
final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
final byte[] byteBuf = new byte[Long.BYTES];
// WARNING: no 64-bit version of this data-structure, but it can support
// indices up to 2^37
final LongArrayBitVector visited = LongArrayBitVector.ofLength(n);
final ProgressLogger pl = new ProgressLogger(LOGGER);
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting breadth-first visit...");
- long pos = 0;
-
for (long i = 0; i < n; i++) {
if (visited.getBoolean(i)) continue;
queue.enqueue(Longs.toByteArray(i));
visited.set(i);
while (!queue.isEmpty()) {
queue.dequeue(byteBuf);
final long currentNode = Longs.fromByteArray(byteBuf);
final LazyLongIterator iterator = graph.successors(currentNode);
long succ;
while((succ = iterator.nextLong()) != -1) {
if (!visited.getBoolean(succ)) {
visited.set(succ);
queue.enqueue(Longs.toByteArray(succ));
}
}
pl.update();
}
}
pl.done();
queue.close();
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(
BFS.class.getName(),
"",
new Parameter[] {
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
'g', "graph", "Basename of the compressed graph"),
new FlaggedOption("useTransposed", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED,
'T', "transposed", "Use transposed graph (default: false)"),
}
);
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
boolean useTransposed = config.getBoolean("useTransposed");
- ProgressLogger logger = new ProgressLogger();
- long startTime;
- double totalTime;
-
+ System.err.println("Loading graph " + graphPath + " ...");
+ Graph graph = new Graph(graphPath);
+ System.err.println("Graph loaded.");
- BFS bfs = new BFS();
- try {
- bfs.load_graph(graphPath);
- } catch (IOException e) {
- System.out.println("Could not load graph: " + e);
- System.exit(2);
- }
+ if (useTransposed)
+ graph = graph.transpose();
- logger.start("Parallel BFS visit...");
- try {
- BFS.bfsperm(bfs.graph.getBVGraph(useTransposed));
- } catch (IOException e) {
- e.printStackTrace();
- }
- logger.done();
+ BFS bfs = new BFS(graph);
+ bfs.bfsperm();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
index 0eb6c23..bdf51b6 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
@@ -1,170 +1,170 @@
package org.softwareheritage.graph.benchmark;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.StringJoiner;
import java.util.function.Function;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;
import org.softwareheritage.graph.Endpoint;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.benchmark.utils.Random;
import org.softwareheritage.graph.benchmark.utils.Statistics;
/**
* Benchmark common utility functions.
*
* @author The Software Heritage developers
*/
public class Benchmark {
/**
* Input arguments.
*/
public class Args {
/** Basename of the compressed graph */
public String graphPath;
/** Number of random nodes to use for the benchmark */
public int nbNodes;
/** File name for CSV format benchmark log */
public String logFile;
/** Random generator */
public Random random;
}
/** Command line arguments */
public Args args;
/** CSV separator for log file */
final String CSV_SEPARATOR = ";";
/**
* Constructor.
*/
public Benchmark() {
this.args = new Args();
}
/**
* Parses benchmark command line arguments.
*
* @param args command line arguments
*/
public void parseCommandLineArgs(String[] args) throws JSAPException {
SimpleJSAP jsap = new SimpleJSAP(Benchmark.class.getName(),
"Benchmark tool for Software Heritage use-cases scenarios.",
new Parameter[] {
new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
JSAP.NOT_GREEDY, "The basename of the compressed graph."),
new FlaggedOption("nbNodes", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n',
"nb-nodes", "Number of random nodes used to do the benchmark."),
new FlaggedOption("logFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'l',
"log-file", "File name to output CSV format benchmark log."),
new FlaggedOption("seed", JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's',
"seed", "Random generator seed."),
});
JSAPResult config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
this.args.graphPath = config.getString("graphPath");
this.args.nbNodes = config.getInt("nbNodes");
this.args.logFile = config.getString("logFile");
this.args.random = config.contains("seed") ? new Random(config.getLong("seed")) : new Random();
}
/**
* Creates CSV file for log output.
*/
public void createCSVLogFile() throws IOException {
try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile))) {
StringJoiner csvHeader = new StringJoiner(CSV_SEPARATOR);
csvHeader.add("use case name")
.add("SWH PID")
.add("number of edges accessed")
.add("traversal timing")
.add("pid2node timing")
.add("node2pid timing");
csvLog.write(csvHeader.toString() + "\n");
}
}
/**
* Times a specific endpoint and outputs individual datapoints along with aggregated statistics.
*
* @param useCaseName benchmark use-case name
* @param graph compressed graph used in the benchmark
* @param nodeIds node ids to use as starting point for the endpoint traversal
* @param operation endpoint function to benchmark
* @param dstFmt destination formatted string as described in the <a
* href="https://docs.softwareheritage.org/devel/swh-graph/api.html#walk">API</a>
* @param algorithm traversal algorithm used in endpoint call (either "dfs" or "bfs")
*/
public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds,
Function<Endpoint.Input, Endpoint.Output> operation, String dstFmt, String algorithm)
throws IOException {
ArrayList<Double> timings = new ArrayList<>();
ArrayList<Double> timingsNormalized = new ArrayList<>();
ArrayList<Double> nbEdgesAccessed = new ArrayList<>();
final boolean append = true;
try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile, append))) {
for (long nodeId : nodeIds) {
SwhPID swhPID = graph.getSwhPID(nodeId);
Endpoint.Output output = (dstFmt == null)
? operation.apply(new Endpoint.Input(swhPID))
: operation.apply(new Endpoint.Input(swhPID, dstFmt, algorithm));
StringJoiner csvLine = new StringJoiner(CSV_SEPARATOR);
csvLine.add(useCaseName)
.add(swhPID.toString())
.add(Long.toString(output.meta.nbEdgesAccessed))
.add(Double.toString(output.meta.timings.traversal))
.add(Double.toString(output.meta.timings.pid2node))
.add(Double.toString(output.meta.timings.node2pid));
csvLog.write(csvLine.toString() + "\n");
timings.add(output.meta.timings.traversal);
nbEdgesAccessed.add((double) output.meta.nbEdgesAccessed);
if (output.meta.nbEdgesAccessed != 0) {
timingsNormalized.add(output.meta.timings.traversal / output.meta.nbEdgesAccessed);
}
}
}
System.out.println("\n" + useCaseName + " use-case:");
System.out.println("timings:");
Statistics stats = new Statistics(timings);
stats.printAll();
System.out.println("timings normalized:");
Statistics statsNormalized = new Statistics(timingsNormalized);
statsNormalized.printAll();
System.out.println("nb edges accessed:");
Statistics statsNbEdgesAccessed = new Statistics(nbEdgesAccessed);
statsNbEdgesAccessed.printAll();
}
/**
- * Same as {@link timeEndpoint} but without destination or algorithm specified to endpoint call.
+ * Same as {@link #timeEndpoint} but without destination or algorithm specified to endpoint call.
*/
public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds,
Function<Endpoint.Input, Endpoint.Output> operation) throws IOException {
timeEndpoint(useCaseName, graph, nodeIds, operation, null, null);
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java
index e6a5cd2..722dd74 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java
@@ -1,223 +1,216 @@
package org.softwareheritage.graph.benchmark;
import com.google.common.primitives.Longs;
import com.martiansoftware.jsap.*;
-import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import it.unimi.dsi.big.webgraph.Transform;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.logging.ProgressLogger;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class ForkCC {
public Boolean includeRootDir;
private Graph graph;
private Long emptySnapshot;
private LongArrayBitVector visited;
private LongArrayBitVector whitelist;
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = new Graph(graphBasename);
+ this.graph = new Graph(graphBasename).symmetrize();
System.err.println("Graph loaded.");
this.emptySnapshot = null;
this.whitelist = null;
this.visited = null;
this.includeRootDir = null;
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(
ForkCC.class.getName(),
"",
new Parameter[] {
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
'g', "graph", "Basename of the compressed graph"),
new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED,
't', "whitelist", "Whitelist of origins"),
new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED,
'w', "numthreads", "Number of threads"),
new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED,
'R', "includerootdir", "Include root directory (default: false)"),
}
);
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private boolean nodeIsEmptySnapshot(Long node) {
if (this.emptySnapshot == null
&& this.graph.getNodeType(node) == Node.Type.SNP
&& this.graph.outdegree(node) == 0) {
System.err.println("Found empty snapshot: " + node);
this.emptySnapshot = node;
}
return node.equals(this.emptySnapshot);
}
private Boolean shouldVisit(Long node){
Node.Type nt = this.graph.getNodeType(node);
if (nt == Node.Type.CNT) {
return false;
}
if (nt == Node.Type.DIR && !includeRootDir)
return false;
if (this.nodeIsEmptySnapshot(node))
return false;
if (visited.getBoolean(node))
return false;
return true;
}
- private ArrayList<ArrayList<Long>> compute(final ImmutableGraph graph, ProgressLogger pl) throws IOException {
+ private ArrayList<ArrayList<Long>> compute(ProgressLogger pl) throws IOException {
final long n = graph.numNodes();
// Allow enough memory to behave like in-memory queue
int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
// Use a disk based queue to store BFS frontier
final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue");
final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
final byte[] byteBuf = new byte[Long.BYTES];
// WARNING: no 64-bit version of this data-structure, but it can support
// indices up to 2^37
visited = LongArrayBitVector.ofLength(n);
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting connected components visit...");
ArrayList<ArrayList<Long>> components = new ArrayList<>();
for (long i = 0; i < n; i++) {
if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR) continue;
ArrayList<Long> component = new ArrayList<>();
queue.enqueue(Longs.toByteArray(i));
visited.set(i);
while (!queue.isEmpty()) {
queue.dequeue(byteBuf);
final long currentNode = Longs.fromByteArray(byteBuf);
Node.Type cur_nt = this.graph.getNodeType(currentNode);
if (cur_nt == Node.Type.ORI
&& (this.whitelist == null || this.whitelist.getBoolean(currentNode))) {
// TODO: add a check that the origin has >=1 non-empty snapshot
component.add(currentNode);
}
final LazyLongIterator iterator = graph.successors(currentNode);
long succ;
while ((succ = iterator.nextLong()) != -1) {
if (!shouldVisit(succ)) continue;
if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV) continue;
visited.set(succ);
queue.enqueue(Longs.toByteArray(succ));
}
pl.update();
}
if (component.size() > 0) {
components.add(component);
}
}
pl.done();
queue.close();
return components;
}
private static void printDistribution(ArrayList<ArrayList<Long>> components) {
TreeMap<Long, Long> distribution = new TreeMap<>();
for (ArrayList<Long> component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry<Long, Long> entry : distribution.entrySet()) {
System.out.format("%d %d\n", entry.getKey(), entry.getValue());
}
}
private static void printLargestComponent(ArrayList<ArrayList<Long>> components) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList<Long> component = components.get(indexLargest);
for (Long node : component) {
System.out.println(node);
}
}
private void parseWhitelist(String path) {
System.err.println("Loading whitelist " + path + " ...");
- this.whitelist = LongArrayBitVector.ofLength(this.graph.getNbNodes());
+ this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes());
Scanner scanner = null;
try {
scanner = new Scanner(new File(path));
while(scanner.hasNextLong()) {
whitelist.set(scanner.nextLong());
}
System.err.println("Whitelist loaded.");
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
int numThreads = config.getInt("numThreads");
String whitelistPath = config.getString("whitelistPath");
boolean includeRootDir = config.getBoolean("includeRootDir");
ForkCC forkCc = new ForkCC();
try {
forkCc.load_graph(graphPath);
forkCc.includeRootDir = includeRootDir;
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
if (whitelistPath != null) {
forkCc.parseWhitelist(whitelistPath);
}
- ImmutableGraph symmetric = Transform.union(
- forkCc.graph.getBVGraph(false),
- forkCc.graph.getBVGraph(true)
- );
-
ProgressLogger logger = new ProgressLogger();
try {
- ArrayList<ArrayList<Long>> components = forkCc.compute(symmetric, logger);
+ ArrayList<ArrayList<Long>> components = forkCc.compute(logger);
printDistribution(components);
// printLargestComponent(components);
} catch (IOException e) {
e.printStackTrace();
}
logger.done();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java
index a6f1c79..402632c 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java
@@ -1,93 +1,93 @@
package org.softwareheritage.graph.benchmark;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
import java.io.IOException;
import java.util.ArrayList;
public class ListEmptyOrigins {
private Graph graph;
private Long emptySnapshot;
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
this.graph = new Graph(graphBasename);
System.err.println("Graph loaded.");
this.emptySnapshot = null;
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(
ListEmptyOrigins.class.getName(),
"",
new Parameter[] {
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
'g', "graph", "Basename of the compressed graph"),
}
);
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private boolean nodeIsEmptySnapshot(Long node) {
System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node);
if (this.emptySnapshot == null
&& this.graph.getNodeType(node) == Node.Type.SNP
&& this.graph.outdegree(node) == 0) {
System.err.println("Found empty snapshot: " + node);
this.emptySnapshot = node;
}
return node.equals(this.emptySnapshot);
}
private ArrayList<Long> compute(ImmutableGraph graph) {
final long n = graph.numNodes();
ArrayList<Long> bad = new ArrayList<>();
for (long i = 0; i < n; i++) {
Node.Type nt = this.graph.getNodeType(i);
if (nt != Node.Type.ORI) continue;
final LazyLongIterator iterator = graph.successors(i);
long succ;
boolean found = false;
while ((succ = iterator.nextLong()) != -1) {
if (this.graph.outdegree(succ) > 0) {
found = true;
}
}
if (!found)
bad.add(i);
}
return bad;
}
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
ListEmptyOrigins leo = new ListEmptyOrigins();
try {
leo.load_graph(graphPath);
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
- ArrayList<Long> badlist = leo.compute(leo.graph.getBVGraph(false));
+ ArrayList<Long> badlist = leo.compute(leo.graph);
for (Long bad : badlist) {
System.out.println(bad);
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
index abcec76..df93c96 100644
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
+++ b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
@@ -1,67 +1,67 @@
package org.softwareheritage.graph.benchmark.utils;
import java.util.PrimitiveIterator;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.Node;
/**
* Random related utility class.
*
* @author The Software Heritage developers
*/
public class Random {
/** Internal pseudorandom generator */
java.util.Random random;
/**
* Constructor.
*/
public Random() {
this.random = new java.util.Random();
}
/**
* Constructor.
*
* @param seed random generator seed
*/
public Random(long seed) {
this.random = new java.util.Random(seed);
}
/**
* Generates random node ids.
*
* @param graph graph used to pick node ids
* @param nbNodes number of node ids to generate
* @return an array of random node ids
*/
public long[] generateNodeIds(Graph graph, int nbNodes) {
- return random.longs(nbNodes, 0, graph.getNbNodes()).toArray();
+ return random.longs(nbNodes, 0, graph.numNodes()).toArray();
}
/**
* Generates random node ids with a specific type.
*
* @param graph graph used to pick node ids
* @param nbNodes number of node ids to generate
* @param expectedType specific node type to pick
* @return an array of random node ids
*/
public long[] generateNodeIdsOfType(Graph graph, int nbNodes, Node.Type expectedType) {
- PrimitiveIterator.OfLong nodes = random.longs(0, graph.getNbNodes()).iterator();
+ PrimitiveIterator.OfLong nodes = random.longs(0, graph.numNodes()).iterator();
long[] nodeIds = new long[nbNodes];
long nextId;
for (int i = 0; i < nbNodes; i++) {
do {
nextId = nodes.nextLong();
} while (graph.getNodeType(nextId) != expectedType);
nodeIds[i] = nextId;
}
return nodeIds;
}
}
diff --git a/swh/graph/graph.py b/swh/graph/graph.py
index fda4c68..fcc71ba 100644
--- a/swh/graph/graph.py
+++ b/swh/graph/graph.py
@@ -1,185 +1,185 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import asyncio
import contextlib
import functools
from swh.graph.backend import Backend
from swh.graph.dot import dot_to_svg, graph_dot, KIND_TO_SHAPE
BASE_URL = "https://archive.softwareheritage.org/browse"
KIND_TO_URL_FRAGMENT = {
"ori": "/origin/{}",
"snp": "/snapshot/{}",
"rel": "/release/{}",
"rev": "/revision/{}",
"dir": "/directory/{}",
"cnt": "/content/sha1_git:{}/",
}
def call_async_gen(generator, *args, **kwargs):
loop = asyncio.get_event_loop()
it = generator(*args, **kwargs).__aiter__()
while True:
try:
res = loop.run_until_complete(it.__anext__())
yield res
except StopAsyncIteration:
break
class Neighbors:
"""Neighbor iterator with custom O(1) length method"""
def __init__(self, graph, iterator, length_func):
self.graph = graph
self.iterator = iterator
self.length_func = length_func
def __iter__(self):
return self
def __next__(self):
succ = self.iterator.nextLong()
if succ == -1:
raise StopIteration
return GraphNode(self.graph, succ)
def __len__(self):
return self.length_func()
class GraphNode:
"""Node in the SWH graph"""
def __init__(self, graph, node_id):
self.graph = graph
self.id = node_id
def children(self):
return Neighbors(
self.graph,
self.graph.java_graph.successors(self.id),
lambda: self.graph.java_graph.outdegree(self.id),
)
def parents(self):
return Neighbors(
self.graph,
self.graph.java_graph.predecessors(self.id),
lambda: self.graph.java_graph.indegree(self.id),
)
def simple_traversal(self, ttype, direction="forward", edges="*"):
for node in call_async_gen(
self.graph.backend.simple_traversal, ttype, direction, edges, self.id
):
yield self.graph[node]
def leaves(self, *args, **kwargs):
yield from self.simple_traversal("leaves", *args, **kwargs)
def visit_nodes(self, *args, **kwargs):
yield from self.simple_traversal("visit_nodes", *args, **kwargs)
def visit_edges(self, direction="forward", edges="*"):
for src, dst in call_async_gen(
self.graph.backend.visit_edges, direction, edges, self.id
):
yield (self.graph[src], self.graph[dst])
def visit_paths(self, direction="forward", edges="*"):
for path in call_async_gen(
self.graph.backend.visit_paths, direction, edges, self.id
):
yield [self.graph[node] for node in path]
def walk(self, dst, direction="forward", edges="*", traversal="dfs"):
for node in call_async_gen(
self.graph.backend.walk, direction, edges, traversal, self.id, dst
):
yield self.graph[node]
def _count(self, ttype, direction="forward", edges="*"):
return self.graph.backend.count(ttype, direction, edges, self.id)
count_leaves = functools.partialmethod(_count, ttype="leaves")
count_neighbors = functools.partialmethod(_count, ttype="neighbors")
count_visit_nodes = functools.partialmethod(_count, ttype="visit_nodes")
@property
def pid(self):
return self.graph.node2pid[self.id]
@property
def kind(self):
return self.pid.split(":")[2]
def __str__(self):
return self.pid
def __repr__(self):
return "<{}>".format(self.pid)
def dot_fragment(self):
swh, version, kind, hash = self.pid.split(":")
label = "{}:{}..{}".format(kind, hash[0:2], hash[-2:])
url = BASE_URL + KIND_TO_URL_FRAGMENT[kind].format(hash)
shape = KIND_TO_SHAPE[kind]
return '{} [label="{}", href="{}", target="_blank", shape="{}"];'.format(
self.id, label, url, shape
)
def _repr_svg_(self):
nodes = [self, *list(self.children()), *list(self.parents())]
dot = graph_dot(nodes)
svg = dot_to_svg(dot)
return svg
class Graph:
def __init__(self, backend, node2pid, pid2node):
self.backend = backend
self.java_graph = backend.entry.get_graph()
self.node2pid = node2pid
self.pid2node = pid2node
def stats(self):
return self.backend.stats()
@property
def path(self):
return self.java_graph.getPath()
def __len__(self):
- return self.java_graph.getNbNodes()
+ return self.java_graph.numNodes()
def __getitem__(self, node_id):
if isinstance(node_id, int):
self.node2pid[node_id] # check existence
return GraphNode(self, node_id)
elif isinstance(node_id, str):
node_id = self.pid2node[node_id]
return GraphNode(self, node_id)
def __iter__(self):
for pid, pos in self.backend.pid2node:
yield self[pid]
def iter_prefix(self, prefix):
for pid, pos in self.backend.pid2node.iter_prefix(prefix):
yield self[pid]
def iter_type(self, pid_type):
for pid, pos in self.backend.pid2node.iter_type(pid_type):
yield self[pid]
@contextlib.contextmanager
def load(graph_path):
with Backend(graph_path) as backend:
yield Graph(backend, backend.node2pid, backend.pid2node)
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:36 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3273204
Attached To
rDGRPH Compressed graph representation
Event Timeline
Log In to Comment