diff --git a/java/src/main/java/org/softwareheritage/graph/Graph.java b/java/src/main/java/org/softwareheritage/graph/Graph.java index f8fe5d4..db9a26f 100644 --- a/java/src/main/java/org/softwareheritage/graph/Graph.java +++ b/java/src/main/java/org/softwareheritage/graph/Graph.java @@ -1,227 +1,218 @@ package org.softwareheritage.graph; import java.io.IOException; import it.unimi.dsi.big.webgraph.BVGraph; +import it.unimi.dsi.big.webgraph.ImmutableGraph; +import it.unimi.dsi.big.webgraph.Transform; import it.unimi.dsi.lang.FlyweightPrototype; import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SwhPID; import org.softwareheritage.graph.backend.NodeIdMap; import org.softwareheritage.graph.backend.NodeTypesMap; /** * Main class storing the compressed graph and node id mappings. *

* The compressed graph is stored using the WebGraph * ecosystem. Additional mappings are necessary because Software Heritage uses string based persistent * identifiers (PID) while WebGraph uses integers internally. These two mappings (long id ↔ * PID) are used for the input (users refer to the graph using PID) and the output (convert back to * PID for users results). However, since graph traversal can be restricted depending on the node * type (see {@link AllowedEdges}), a long id → node type map is stored as well to avoid a full * PID lookup. * * @author The Software Heritage developers * @see org.softwareheritage.graph.AllowedEdges * @see org.softwareheritage.graph.backend.NodeIdMap * @see org.softwareheritage.graph.backend.NodeTypesMap */ -public class Graph implements FlyweightPrototype { +public class Graph extends ImmutableGraph { /** File extension for the SWH PID to long node id map */ public static final String PID_TO_NODE = ".pid2node.bin"; /** File extension for the long node id to SWH PID map */ public static final String NODE_TO_PID = ".node2pid.bin"; /** File extension for the long node id to node type map */ public static final String NODE_TO_TYPE = ".node2type.map"; /** Compressed graph stored as a {@link it.unimi.dsi.big.webgraph.BVGraph} */ - BVGraph graph; + ImmutableGraph graph; /** Transposed compressed graph (used for backward traversals) */ - BVGraph graphTransposed; + ImmutableGraph graphTransposed; /** Path and basename of the compressed graph */ String path; /** Mapping long id ↔ SWH PIDs */ NodeIdMap nodeIdMap; /** Mapping long id → node types */ NodeTypesMap nodeTypesMap; /** * Constructor. * * @param path path and basename of the compressed graph to load */ public Graph(String path) throws IOException { this.graph = BVGraph.loadMapped(path); this.graphTransposed = BVGraph.loadMapped(path + "-transposed"); this.path = path; this.nodeTypesMap = new NodeTypesMap(path); // TODO: we don't need to load the nodeIdMap now that all the // translation between ints and PIDs is happening on the Python side. // However, some parts of the code still depend on this, so it's // commented out while we decide on what to do with it. - this.nodeIdMap = null; // new NodeIdMap(path, getNbNodes()); + this.nodeIdMap = null; // new NodeIdMap(path, numNodes()); } // Protected empty constructor to implement copy() protected Graph() { } + protected Graph(ImmutableGraph graph, ImmutableGraph graphTransposed, String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) { + this.graph = graph; + this.graphTransposed = graphTransposed; + this.path = path; + this.nodeIdMap = nodeIdMap; + this.nodeTypesMap = nodeTypesMap; + } + /** * Return a flyweight copy of the graph. */ @Override public Graph copy() { - final Graph ng = new Graph(); - ng.graph = this.graph.copy(); - ng.graphTransposed = this.graphTransposed.copy(); - ng.path = path; - ng.nodeIdMap = this.nodeIdMap; - ng.nodeTypesMap = this.nodeTypesMap; - return ng; + return new Graph(this.graph.copy(), this.graphTransposed.copy(), this.path, this.nodeIdMap, this.nodeTypesMap); + } + + public Graph transpose() { + return new Graph(this.graphTransposed.copy(), this.graph.copy(), this.path, this.nodeIdMap, this.nodeTypesMap); + } + + public Graph symmetrize() { + ImmutableGraph symmetric = Transform.union(graph, graphTransposed); + return new Graph(symmetric, symmetric, this.path, this.nodeIdMap, this.nodeTypesMap); } /** * Cleans up graph resources after use. */ public void cleanUp() throws IOException { nodeIdMap.close(); } /** * Returns the graph full path. * * @return graph full path */ public String getPath() { return path; } /** * Converts {@link SwhPID} node to long. * * @param swhPID node specified as a {@link SwhPID} * @return internal long node id * @see org.softwareheritage.graph.SwhPID */ public long getNodeId(SwhPID swhPID) { return nodeIdMap.getNodeId(swhPID); } /** * Converts long id node to {@link SwhPID}. * * @param nodeId node specified as a long id * @return external SWH PID * @see org.softwareheritage.graph.SwhPID */ public SwhPID getSwhPID(long nodeId) { return nodeIdMap.getSwhPID(nodeId); } /** * Returns node type. * * @param nodeId node specified as a long id * @return corresponding node type * @see org.softwareheritage.graph.Node.Type */ public Node.Type getNodeType(long nodeId) { return nodeTypesMap.getType(nodeId); } + public boolean randomAccess() { return graph.randomAccess() && graphTransposed.randomAccess(); } + /** * Returns number of nodes in the graph. * * @return number of nodes in the graph */ - public long getNbNodes() { + public long numNodes() { return graph.numNodes(); } /** * Returns number of edges in the graph. * * @return number of edges in the graph */ - public long getNbEdges() { + public long numArcs() { return graph.numArcs(); } /** * Returns lazy iterator of successors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of successors of the node, specified as a WebGraph LazyLongIterator */ public LazyLongIterator successors(long nodeId) { return graph.successors(nodeId); } /** * Returns the outdegree of a node. * * @param nodeId node specified as a long id * @return outdegree of a node */ public long outdegree(long nodeId) { return graph.outdegree(nodeId); } /** * Returns lazy iterator of predecessors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of predecessors of the node, specified as a WebGraph LazyLongIterator */ public LazyLongIterator predecessors(long nodeId) { return graphTransposed.successors(nodeId); } /** * Returns the indegree of a node. * * @param nodeId node specified as a long id * @return indegree of a node */ public long indegree(long nodeId) { return graphTransposed.outdegree(nodeId); } - /** - * Returns the degree of a node, depending on graph orientation. - * - * @param nodeId node specified as a long id - * @param useTransposed boolean value to use transposed graph - * @return degree of a node - */ - public long degree(long nodeId, boolean useTransposed) { - return (useTransposed) ? indegree(nodeId) : outdegree(nodeId); - } - - /** - * Returns the neighbors of a node (as a lazy iterator), depending on graph orientation. - * - * @param nodeId node specified as a long id - * @param useTransposed boolean value to use transposed graph - * @return lazy iterator of neighbors of the node, specified as a WebGraph LazyLongIterator - */ - public LazyLongIterator neighbors(long nodeId, boolean useTransposed) { - return (useTransposed) ? predecessors(nodeId) : successors(nodeId); - } - /** * Returns the underlying BVGraph. * - * @param useTransposed boolean value to use transposed graph * @return WebGraph BVGraph */ - public BVGraph getBVGraph(boolean useTransposed) { - return (useTransposed) ? this.graphTransposed : this.graph; + public ImmutableGraph getGraph() { + return this.graph; } } diff --git a/java/src/main/java/org/softwareheritage/graph/Neighbors.java b/java/src/main/java/org/softwareheritage/graph/Neighbors.java index 1b13c88..d7514a1 100644 --- a/java/src/main/java/org/softwareheritage/graph/Neighbors.java +++ b/java/src/main/java/org/softwareheritage/graph/Neighbors.java @@ -1,84 +1,80 @@ package org.softwareheritage.graph; import java.util.Iterator; import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.AllowedEdges; import org.softwareheritage.graph.Graph; /** * Iterator class to go over a node neighbors in the graph. *

* Wrapper iterator class to easily deal with {@link AllowedEdges} during traversals. * * @author The Software Heritage developers * @see org.softwareheritage.graph.AllowedEdges */ public class Neighbors implements Iterable { /** Graph used to explore neighbors */ Graph graph; - /** Boolean to specify the use of the transposed graph */ - boolean useTransposed; /** Graph edge restriction */ AllowedEdges edges; /** Source node from which neighbors will be listed */ long srcNodeId; /** * Constructor. * * @param graph graph used to explore neighbors - * @param useTransposed boolean value to use transposed graph * @param edges edges allowed to be used in the graph * @param srcNodeId source node from where to list neighbors */ - public Neighbors(Graph graph, boolean useTransposed, AllowedEdges edges, long srcNodeId) { + public Neighbors(Graph graph, AllowedEdges edges, long srcNodeId) { this.graph = graph; - this.useTransposed = useTransposed; this.edges = edges; this.srcNodeId = srcNodeId; } @Override public Iterator iterator() { return new NeighborsIterator(); } /** * Inner class for {@link Neighbors} iterator. * * @author The Software Heritage developers */ public class NeighborsIterator implements Iterator { LazyLongIterator neighbors; long nextNeighborId; public NeighborsIterator() { - this.neighbors = graph.neighbors(srcNodeId, useTransposed); + this.neighbors = graph.successors(srcNodeId); this.nextNeighborId = -1; } public boolean hasNext() { // Case 1: no edge restriction, bypass type checks and skip to next neighbor if (edges.restrictedTo == null) { nextNeighborId = neighbors.nextLong(); return (nextNeighborId != -1); } // Case 2: edge restriction, look ahead for next neighbor while ((nextNeighborId = neighbors.nextLong()) != -1) { if (edges.isAllowed(srcNodeId, nextNeighborId)) { return true; } } return false; } public Long next() { return nextNeighborId; } } } diff --git a/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java b/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java deleted file mode 100644 index cc4fd85..0000000 --- a/java/src/main/java/org/softwareheritage/graph/algo/NodeIdsConsumer.java +++ /dev/null @@ -1,14 +0,0 @@ -package org.softwareheritage.graph.algo; - -import java.util.function.BiConsumer; - -public interface NodeIdsConsumer extends BiConsumer { - - /** Callback for returning a (potentially large) list of node identifiers. - * The callback will be invoked repeatedly without reallocating the array. - * At each invocation the array might contain more than size node - * identifiers, but only identifiers located up to position size-1 are to - * be considered during that specific invocation. - */ - void accept(long nodeIds[], int size); -} diff --git a/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java b/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java index 0420a41..0c8a400 100644 --- a/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java +++ b/java/src/main/java/org/softwareheritage/graph/algo/Traversal.java @@ -1,494 +1,495 @@ package org.softwareheritage.graph.algo; import java.util.*; -import it.unimi.dsi.bits.LongArrayBitVector; - import org.softwareheritage.graph.AllowedEdges; import org.softwareheritage.graph.Endpoint; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Neighbors; import org.softwareheritage.graph.Node; /** * Traversal algorithms on the compressed graph. *

* Internal implementation of the traversal API endpoints. These methods only input/output internal * long ids, which are converted in the {@link Endpoint} higher-level class to Software Heritage * PID. * * @author The Software Heritage developers * @see org.softwareheritage.graph.Endpoint */ public class Traversal { /** Graph used in the traversal */ Graph graph; - /** Boolean to specify the use of the transposed graph */ - boolean useTransposed; /** Graph edge restriction */ AllowedEdges edges; /** Hash set storing if we have visited a node */ HashSet visited; /** Hash map storing parent node id for each nodes during a traversal */ Map parentNode; /** Number of edges accessed during traversal */ long nbEdgesAccessed; /** random number generator, for random walks */ Random rng; /** * Constructor. * * @param graph graph used in the traversal * @param direction a string (either "forward" or "backward") specifying edge orientation * @param edgesFmt a formatted string describing allowed edges */ public Traversal(Graph graph, String direction, String edgesFmt) { if (!direction.matches("forward|backward")) { throw new IllegalArgumentException("Unknown traversal direction: " + direction); } - this.graph = graph; - this.useTransposed = (direction.equals("backward")); + if (direction.equals("backward")) { + this.graph = graph.transpose(); + } else { + this.graph = graph; + } this.edges = new AllowedEdges(graph, edgesFmt); - long nbNodes = graph.getNbNodes(); this.visited = new HashSet<>(); this.parentNode = new HashMap<>(); this.nbEdgesAccessed = 0; this.rng = new Random(); } /** * Returns number of accessed edges during traversal. * * @return number of edges accessed in last traversal */ public long getNbEdgesAccessed() { return nbEdgesAccessed; } /** * Returns number of accessed nodes during traversal. * * @return number of nodes accessed in last traversal */ public long getNbNodesAccessed() { return this.visited.size(); } /** - * Push version of {@link leaves}: will fire passed callback for each leaf. + * Push version of {@link #leaves} will fire passed callback for each leaf. */ public void leavesVisitor(long srcNodeId, NodeIdConsumer cb) { - Stack stack = new Stack(); + Stack stack = new Stack<>(); this.nbEdgesAccessed = 0; stack.push(srcNodeId); visited.add(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); long neighborsCnt = 0; - nbEdgesAccessed += graph.degree(currentNodeId, useTransposed); - for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { + nbEdgesAccessed += graph.outdegree(currentNodeId); + for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) { neighborsCnt++; if (!visited.contains(neighborNodeId)) { stack.push(neighborNodeId); visited.add(neighborNodeId); } } if (neighborsCnt == 0) { cb.accept(currentNodeId); } } } /** * Returns the leaves of a subgraph rooted at the specified source node. * * @param srcNodeId source node * @return list of node ids corresponding to the leaves */ public ArrayList leaves(long srcNodeId) { - ArrayList nodeIds = new ArrayList(); - leavesVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId)); + ArrayList nodeIds = new ArrayList<>(); + leavesVisitor(srcNodeId, nodeIds::add); return nodeIds; } /** - * Push version of {@link neighbors}: will fire passed callback on each + * Push version of {@link #neighbors}: will fire passed callback on each * neighbor. */ public void neighborsVisitor(long srcNodeId, NodeIdConsumer cb) { - this.nbEdgesAccessed = graph.degree(srcNodeId, useTransposed); - for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, srcNodeId)) { + this.nbEdgesAccessed = graph.outdegree(srcNodeId); + for (long neighborNodeId : new Neighbors(graph, edges, srcNodeId)) { cb.accept(neighborNodeId); } } /** * Returns node direct neighbors (linked with exactly one edge). * * @param srcNodeId source node * @return list of node ids corresponding to the neighbors */ public ArrayList neighbors(long srcNodeId) { - ArrayList nodeIds = new ArrayList(); - neighborsVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId)); + ArrayList nodeIds = new ArrayList<>(); + neighborsVisitor(srcNodeId, nodeIds::add); return nodeIds; } /** - * Push version of {@link visitNodes}: will fire passed callback on each + * Push version of {@link #visitNodes}: will fire passed callback on each * visited node. */ public void visitNodesVisitor(long srcNodeId, NodeIdConsumer nodeCb, EdgeIdConsumer edgeCb) { - Stack stack = new Stack(); + Stack stack = new Stack<>(); this.nbEdgesAccessed = 0; stack.push(srcNodeId); visited.add(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); if (nodeCb != null) { nodeCb.accept(currentNodeId); } - nbEdgesAccessed += graph.degree(currentNodeId, useTransposed); - for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { + nbEdgesAccessed += graph.outdegree(currentNodeId); + for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) { if (edgeCb != null) { edgeCb.accept(currentNodeId, neighborNodeId); } if (!visited.contains(neighborNodeId)) { stack.push(neighborNodeId); visited.add(neighborNodeId); } } } } /** One-argument version to handle callbacks properly */ public void visitNodesVisitor(long srcNodeId, NodeIdConsumer cb) { visitNodesVisitor(srcNodeId, cb, null); } /** * Performs a graph traversal and returns explored nodes. * * @param srcNodeId source node * @return list of explored node ids */ public ArrayList visitNodes(long srcNodeId) { - ArrayList nodeIds = new ArrayList(); - visitNodesVisitor(srcNodeId, (nodeId) -> nodeIds.add(nodeId)); + ArrayList nodeIds = new ArrayList<>(); + visitNodesVisitor(srcNodeId, nodeIds::add); return nodeIds; } /** - * Push version of {@link visitPaths}: will fire passed callback on each + * Push version of {@link #visitPaths}: will fire passed callback on each * discovered (complete) path. */ public void visitPathsVisitor(long srcNodeId, PathConsumer cb) { - Stack currentPath = new Stack(); + Stack currentPath = new Stack<>(); this.nbEdgesAccessed = 0; visitPathsInternalVisitor(srcNodeId, currentPath, cb); } /** * Performs a graph traversal and returns explored paths. * * @param srcNodeId source node * @return list of explored paths (represented as a list of node ids) */ public ArrayList> visitPaths(long srcNodeId) { ArrayList> paths = new ArrayList<>(); - visitPathsVisitor(srcNodeId, (path) -> paths.add(path)); + visitPathsVisitor(srcNodeId, paths::add); return paths; } private void visitPathsInternalVisitor(long currentNodeId, Stack currentPath, PathConsumer cb) { currentPath.push(currentNodeId); long visitedNeighbors = 0; - nbEdgesAccessed += graph.degree(currentNodeId, useTransposed); - for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { + nbEdgesAccessed += graph.outdegree(currentNodeId); + for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) { visitPathsInternalVisitor(neighborNodeId, currentPath, cb); visitedNeighbors++; } if (visitedNeighbors == 0) { - ArrayList path = new ArrayList(); - for (long nodeId : currentPath) { - path.add(nodeId); - } + ArrayList path = new ArrayList<>(currentPath); cb.accept(path); } currentPath.pop(); } /** * Performs a graph traversal with backtracking, and returns the first * found path from source to destination. * * @param srcNodeId source node * @param dst destination (either a node or a node type) * @return found path as a list of node ids */ public ArrayList walk(long srcNodeId, T dst, String visitOrder) { - long dstNodeId = -1; + long dstNodeId; if (visitOrder.equals("dfs")) { dstNodeId = walkInternalDFS(srcNodeId, dst); } else if (visitOrder.equals("bfs")) { dstNodeId = walkInternalBFS(srcNodeId, dst); } else { throw new IllegalArgumentException("Unknown visit order: " + visitOrder); } if (dstNodeId == -1) { throw new IllegalArgumentException("Cannot find destination: " + dst); } - ArrayList nodeIds = backtracking(srcNodeId, dstNodeId); - return nodeIds; + return backtracking(srcNodeId, dstNodeId); } /** * Performs a random walk (picking a random successor at each step) from * source to destination. * * @param srcNodeId source node * @param dst destination (either a node or a node type) * @return found path as a list of node ids or an empty path to indicate * that no suitable path have been found */ public ArrayList randomWalk(long srcNodeId, T dst) { return randomWalk(srcNodeId, dst, 0); } /** * Performs a stubborn random walk (picking a random successor at each * step) from source to destination. The walk is "stubborn" in the sense * that it will not give up the first time if a satisfying target node is * found, but it will retry up to a limited amount of times. * * @param srcNodeId source node * @param dst destination (either a node or a node type) * @param retries number of times to retry; 0 means no retries (single walk) * @return found path as a list of node ids or an empty path to indicate * that no suitable path have been found */ public ArrayList randomWalk(long srcNodeId, T dst, int retries) { long curNodeId = srcNodeId; - ArrayList path = new ArrayList(); + ArrayList path = new ArrayList<>(); this.nbEdgesAccessed = 0; boolean found; if (retries < 0) { throw new IllegalArgumentException("Negative number of retries given: " + retries); } while (true) { path.add(curNodeId); - Neighbors neighbors = new Neighbors(graph, useTransposed, edges, curNodeId); + Neighbors neighbors = new Neighbors(graph, edges, curNodeId); curNodeId = randomPick(neighbors.iterator()); if (curNodeId < 0) { found = false; break; } if (isDstNode(curNodeId, dst)) { path.add(curNodeId); found = true; break; } } if (found) { return path; } else if (retries > 0) { // try again return randomWalk(srcNodeId, dst, retries - 1); } else { // not found and no retries left path.clear(); return path; } } /** * Randomly choose an element from an iterator over Longs using reservoir * sampling * * @param elements iterator over selection domain * @return randomly chosen element or -1 if no suitable element was found */ private long randomPick(Iterator elements) { long curPick = -1; long seenCandidates = 0; while (elements.hasNext()) { seenCandidates++; if (Math.round(rng.nextFloat() * (seenCandidates - 1)) == 0) { curPick = elements.next(); } } return curPick; } /** * Internal DFS function of {@link #walk}. * * @param srcNodeId source node * @param dst destination (either a node or a node type) * @return final destination node or -1 if no path found */ private long walkInternalDFS(long srcNodeId, T dst) { - Stack stack = new Stack(); + Stack stack = new Stack<>(); this.nbEdgesAccessed = 0; stack.push(srcNodeId); visited.add(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); if (isDstNode(currentNodeId, dst)) { return currentNodeId; } - nbEdgesAccessed += graph.degree(currentNodeId, useTransposed); - for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { + nbEdgesAccessed += graph.outdegree(currentNodeId); + for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) { if (!visited.contains(neighborNodeId)) { stack.push(neighborNodeId); visited.add(neighborNodeId); parentNode.put(neighborNodeId, currentNodeId); } } } return -1; } /** * Internal BFS function of {@link #walk}. * * @param srcNodeId source node * @param dst destination (either a node or a node type) * @return final destination node or -1 if no path found */ private long walkInternalBFS(long srcNodeId, T dst) { - Queue queue = new LinkedList(); + Queue queue = new LinkedList<>(); this.nbEdgesAccessed = 0; queue.add(srcNodeId); visited.add(srcNodeId); while (!queue.isEmpty()) { long currentNodeId = queue.poll(); if (isDstNode(currentNodeId, dst)) { return currentNodeId; } - nbEdgesAccessed += graph.degree(currentNodeId, useTransposed); - for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { + nbEdgesAccessed += graph.outdegree(currentNodeId); + for (long neighborNodeId : new Neighbors(graph, edges, currentNodeId)) { if (!visited.contains(neighborNodeId)) { queue.add(neighborNodeId); visited.add(neighborNodeId); parentNode.put(neighborNodeId, currentNodeId); } } } return -1; } /** * Internal function of {@link #walk} to check if a node corresponds to the destination. * * @param nodeId current node * @param dst destination (either a node or a node type) * @return true if the node is a destination, or false otherwise */ private boolean isDstNode(long nodeId, T dst) { if (dst instanceof Long) { long dstNodeId = (Long) dst; return nodeId == dstNodeId; } else if (dst instanceof Node.Type) { Node.Type dstType = (Node.Type) dst; return graph.getNodeType(nodeId) == dstType; } else { return false; } } /** * Internal backtracking function of {@link #walk}. * * @param srcNodeId source node * @param dstNodeId destination node * @return the found path, as a list of node ids */ private ArrayList backtracking(long srcNodeId, long dstNodeId) { - ArrayList path = new ArrayList(); + ArrayList path = new ArrayList<>(); long currentNodeId = dstNodeId; while (currentNodeId != srcNodeId) { path.add(currentNodeId); currentNodeId = parentNode.get(currentNodeId); } path.add(srcNodeId); Collections.reverse(path); return path; } + /** + * Find a common descendant between two given nodes using two parallel BFS + * + * @param lhsNode the first node + * @param rhsNode the second node + * @return the found path, as a list of node ids + */ public Long findCommonDescendant(long lhsNode, long rhsNode) { Queue lhsStack = new ArrayDeque<>(); Queue rhsStack = new ArrayDeque<>(); HashSet lhsVisited = new HashSet<>(); HashSet rhsVisited = new HashSet<>(); lhsStack.add(lhsNode); rhsStack.add(rhsNode); lhsVisited.add(lhsNode); rhsVisited.add(rhsNode); this.nbEdgesAccessed = 0; Long curNode; while (!lhsStack.isEmpty() || !rhsStack.isEmpty()) { if (!lhsStack.isEmpty()) { curNode = lhsStack.poll(); - nbEdgesAccessed += graph.degree(curNode, useTransposed); - for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, curNode)) { + nbEdgesAccessed += graph.outdegree(curNode); + for (long neighborNodeId : new Neighbors(graph, edges, curNode)) { if (!lhsVisited.contains(neighborNodeId)) { if (rhsVisited.contains(neighborNodeId)) return neighborNodeId; lhsStack.add(neighborNodeId); lhsVisited.add(neighborNodeId); } } } if (!rhsStack.isEmpty()) { curNode = rhsStack.poll(); - nbEdgesAccessed += graph.degree(curNode, useTransposed); - for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, curNode)) { + nbEdgesAccessed += graph.outdegree(curNode); + for (long neighborNodeId : new Neighbors(graph, edges, curNode)) { if (!rhsVisited.contains(neighborNodeId)) { if (lhsVisited.contains(neighborNodeId)) return neighborNodeId; rhsStack.add(neighborNodeId); rhsVisited.add(neighborNodeId); } } } } return null; } } diff --git a/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java b/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java index c080a93..1f57665 100644 --- a/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/backend/MapBuilder.java @@ -1,216 +1,217 @@ package org.softwareheritage.graph.backend; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.Scanner; import java.util.concurrent.*; import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectBigArrays; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SwhPID; import org.softwareheritage.graph.backend.NodeTypesMap; /** * Create maps needed at runtime by the graph service, in particular: * * - SWH PID → WebGraph long node id * - WebGraph long node id → SWH PID (converse of the former) * - WebGraph long node id → SWH node type (enum) * * @author The Software Heritage developers */ public class MapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(MapBuilder.class); /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException { if (args.length != 2) { logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV"); System.exit(1); } String graphPath = args[0]; String tmpDir = args[1]; logger.info("starting maps generation..."); precomputeNodeIdMap(graphPath, tmpDir); logger.info("maps generation completed"); } /** * Computes and dumps on disk mapping files. * * @param graphPath path of the compressed graph */ // Suppress warning for Object2LongFunction cast @SuppressWarnings("unchecked") static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { ProgressLogger plPid2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); ProgressLogger plNode2Pid = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plPid2Node.itemsName = "pid→node"; plNode2Pid.itemsName = "node→pid"; // avg speed for pid→node is sometime skewed due to write to the sort // pipe hanging when sort is sorting; hence also desplay local speed plPid2Node.displayLocalSpeed = true; // first half of PID->node mapping: PID -> WebGraph MPH (long) Object2LongFunction mphMap = null; try { logger.info("loading MPH function..."); mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); logger.info("MPH function loaded"); } catch (ClassNotFoundException e) { logger.error("unknown class object in .mph file: " + e); System.exit(2); } long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); plPid2Node.expectedUpdates = nbIds; plNode2Pid.expectedUpdates = nbIds; // second half of PID->node mapping: WebGraph MPH (long) -> BFS order (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); logger.info("loading BFS order file..."); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); logger.info("BFS order file loaded"); if (loaded != nbIds) { logger.error("graph contains " + nbIds + " nodes, but read " + loaded); System.exit(2); } // Create mapping SWH PID -> WebGraph node id, by sequentially reading // nodes, hashing them with MPH, and permuting according to BFS order FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator swhPIDIterator = new LineIterator(buffer); // The WebGraph node id -> SWH PID mapping can be obtained from the // PID->node one by numerically sorting on node id and sequentially // writing obtained PIDs to a binary map. Delegates the sorting job to // /usr/bin/sort via pipes ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); // for the binary format of pidToNodeMap, see Python module swh.graph.pid:PidToIntMap // for the binary format of nodeToPidMap, see Python module swh.graph.pid:IntToPidMap try (DataOutputStream pidToNodeMap = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(graphPath + Graph.PID_TO_NODE))); BufferedOutputStream nodeToPidMap = new BufferedOutputStream(new FileOutputStream(graphPath + Graph.NODE_TO_PID))) { // background handler for sort output, it will be fed PID/node // pairs while pidToNodeMap is being filled, and will itself fill // nodeToPidMap as soon as data from sort is ready SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToPidMap, plNode2Pid); outputHandler.start(); // Type map from WebGraph node ID to SWH type. Used at runtime by // pure Java graph traversals to efficiently check edge // restrictions. final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); final int nbBitsPerNodeType = log2NbTypes; LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); plPid2Node.start("filling pid2node map"); for (long iNode = 0; iNode < nbIds && swhPIDIterator.hasNext(); iNode++) { String strSwhPID = swhPIDIterator.next().toString(); SwhPID swhPID = new SwhPID(strSwhPID); byte[] swhPIDBin = swhPID.toBytes(); long mphId = mphMap.getLong(strSwhPID); - long nodeId = LongBigArrays.get(bfsMap, mphId); + long nodeId = BigArrays.get(bfsMap, mphId); pidToNodeMap.write(swhPIDBin, 0, swhPIDBin.length); pidToNodeMap.writeLong(nodeId); sort_stdin.write((strSwhPID + "\t" + nodeId + "\n") .getBytes(StandardCharsets.US_ASCII)); nodeTypesMap.set(nodeId, swhPID.getType().ordinal()); plPid2Node.lightUpdate(); } plPid2Node.done(); sort_stdin.close(); // write type map logger.info("storing type map"); BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); logger.info("type map stored"); // wait for nodeToPidMap filling try { logger.info("waiting for node2pid map..."); int sortExitCode = sort.waitFor(); if (sortExitCode != 0) { logger.error("sort returned non-zero exit code: " + sortExitCode); System.exit(2); } outputHandler.join(); } catch (InterruptedException e) { logger.error("processing of sort output failed with: " + e); System.exit(2); } } } private static class SortOutputHandler extends Thread { private Scanner input; private OutputStream output; private ProgressLogger pl; SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) { this.input = new Scanner(input, StandardCharsets.US_ASCII); this.output = output; this.pl = pl; } public void run() { boolean sortDone = false; logger.info("node2pid: waiting for sort output..."); while (input.hasNextLine()) { if (! sortDone) { sortDone = true; this.pl.start("filling node2pid map"); } String line = input.nextLine(); // format: SWH_PID NODE_ID SwhPID swhPID = new SwhPID(line.split("\\t")[0]); // get PID try { output.write((byte[]) swhPID.toBytes()); } catch (IOException e) { logger.error("writing to node->PID map failed with: " + e); } this.pl.lightUpdate(); } this.pl.done(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/backend/Pp.java b/java/src/main/java/org/softwareheritage/graph/backend/Pp.java index 3cd0eb7..af495a7 100644 --- a/java/src/main/java/org/softwareheritage/graph/backend/Pp.java +++ b/java/src/main/java/org/softwareheritage/graph/backend/Pp.java @@ -1,42 +1,23 @@ package org.softwareheritage.graph.backend; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.FileWriter; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Writer; -import java.util.zip.GZIPInputStream; - -import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; -import it.unimi.dsi.fastutil.longs.LongBigArrays; -import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; -import it.unimi.dsi.fastutil.objects.ObjectBigArrays; -import it.unimi.dsi.io.FastBufferedReader; -import it.unimi.dsi.io.LineIterator; - -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; -import org.softwareheritage.graph.SwhPID; -import org.softwareheritage.graph.backend.NodeTypesMap; public class Pp { - + @SuppressWarnings("unchecked") public static void main(String[] args) throws IOException { Object2LongFunction mphMap = null; try { mphMap = (Object2LongFunction) BinIO.loadObject("all.mph"); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); } long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); System.out.println("mph size: " + nbIds); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java index 27334f9..5d5069b 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java @@ -1,130 +1,115 @@ package org.softwareheritage.graph.benchmark; import com.google.common.primitives.Longs; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.io.ByteDiskQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.Graph; import com.martiansoftware.jsap.*; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.fastutil.Arrays; import java.io.File; import java.io.IOException; public class BFS { - private Graph graph; - - private void load_graph(String graphBasename) throws IOException { - System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = new Graph(graphBasename); - System.err.println("Graph loaded."); - } + private final ImmutableGraph graph; private final static Logger LOGGER = LoggerFactory.getLogger(BFS.class); + public BFS(ImmutableGraph graph) { + this.graph = graph; + } + // Partly inlined from it.unimi.dsi.law.big.graph.BFS - private static void bfsperm(final ImmutableGraph graph) throws IOException { + private void bfsperm() throws IOException { final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 final LongArrayBitVector visited = LongArrayBitVector.ofLength(n); final ProgressLogger pl = new ProgressLogger(LOGGER); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting breadth-first visit..."); - long pos = 0; - for (long i = 0; i < n; i++) { if (visited.getBoolean(i)) continue; queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); final LazyLongIterator iterator = graph.successors(currentNode); long succ; while((succ = iterator.nextLong()) != -1) { if (!visited.getBoolean(succ)) { visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } } pl.update(); } } pl.done(); queue.close(); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( BFS.class.getName(), "", new Parameter[] { new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("useTransposed", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, 'T', "transposed", "Use transposed graph (default: false)"), } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } - public static void main(String[] args) { + public static void main(String[] args) throws IOException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); boolean useTransposed = config.getBoolean("useTransposed"); - ProgressLogger logger = new ProgressLogger(); - long startTime; - double totalTime; - + System.err.println("Loading graph " + graphPath + " ..."); + Graph graph = new Graph(graphPath); + System.err.println("Graph loaded."); - BFS bfs = new BFS(); - try { - bfs.load_graph(graphPath); - } catch (IOException e) { - System.out.println("Could not load graph: " + e); - System.exit(2); - } + if (useTransposed) + graph = graph.transpose(); - logger.start("Parallel BFS visit..."); - try { - BFS.bfsperm(bfs.graph.getBVGraph(useTransposed)); - } catch (IOException e) { - e.printStackTrace(); - } - logger.done(); + BFS bfs = new BFS(graph); + bfs.bfsperm(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java index 0eb6c23..bdf51b6 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java @@ -1,170 +1,170 @@ package org.softwareheritage.graph.benchmark; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import java.util.StringJoiner; import java.util.function.Function; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.UnflaggedOption; import org.softwareheritage.graph.Endpoint; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SwhPID; import org.softwareheritage.graph.benchmark.utils.Random; import org.softwareheritage.graph.benchmark.utils.Statistics; /** * Benchmark common utility functions. * * @author The Software Heritage developers */ public class Benchmark { /** * Input arguments. */ public class Args { /** Basename of the compressed graph */ public String graphPath; /** Number of random nodes to use for the benchmark */ public int nbNodes; /** File name for CSV format benchmark log */ public String logFile; /** Random generator */ public Random random; } /** Command line arguments */ public Args args; /** CSV separator for log file */ final String CSV_SEPARATOR = ";"; /** * Constructor. */ public Benchmark() { this.args = new Args(); } /** * Parses benchmark command line arguments. * * @param args command line arguments */ public void parseCommandLineArgs(String[] args) throws JSAPException { SimpleJSAP jsap = new SimpleJSAP(Benchmark.class.getName(), "Benchmark tool for Software Heritage use-cases scenarios.", new Parameter[] { new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the compressed graph."), new FlaggedOption("nbNodes", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n', "nb-nodes", "Number of random nodes used to do the benchmark."), new FlaggedOption("logFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'l', "log-file", "File name to output CSV format benchmark log."), new FlaggedOption("seed", JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "seed", "Random generator seed."), }); JSAPResult config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } this.args.graphPath = config.getString("graphPath"); this.args.nbNodes = config.getInt("nbNodes"); this.args.logFile = config.getString("logFile"); this.args.random = config.contains("seed") ? new Random(config.getLong("seed")) : new Random(); } /** * Creates CSV file for log output. */ public void createCSVLogFile() throws IOException { try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile))) { StringJoiner csvHeader = new StringJoiner(CSV_SEPARATOR); csvHeader.add("use case name") .add("SWH PID") .add("number of edges accessed") .add("traversal timing") .add("pid2node timing") .add("node2pid timing"); csvLog.write(csvHeader.toString() + "\n"); } } /** * Times a specific endpoint and outputs individual datapoints along with aggregated statistics. * * @param useCaseName benchmark use-case name * @param graph compressed graph used in the benchmark * @param nodeIds node ids to use as starting point for the endpoint traversal * @param operation endpoint function to benchmark * @param dstFmt destination formatted string as described in the API * @param algorithm traversal algorithm used in endpoint call (either "dfs" or "bfs") */ public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds, Function operation, String dstFmt, String algorithm) throws IOException { ArrayList timings = new ArrayList<>(); ArrayList timingsNormalized = new ArrayList<>(); ArrayList nbEdgesAccessed = new ArrayList<>(); final boolean append = true; try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile, append))) { for (long nodeId : nodeIds) { SwhPID swhPID = graph.getSwhPID(nodeId); Endpoint.Output output = (dstFmt == null) ? operation.apply(new Endpoint.Input(swhPID)) : operation.apply(new Endpoint.Input(swhPID, dstFmt, algorithm)); StringJoiner csvLine = new StringJoiner(CSV_SEPARATOR); csvLine.add(useCaseName) .add(swhPID.toString()) .add(Long.toString(output.meta.nbEdgesAccessed)) .add(Double.toString(output.meta.timings.traversal)) .add(Double.toString(output.meta.timings.pid2node)) .add(Double.toString(output.meta.timings.node2pid)); csvLog.write(csvLine.toString() + "\n"); timings.add(output.meta.timings.traversal); nbEdgesAccessed.add((double) output.meta.nbEdgesAccessed); if (output.meta.nbEdgesAccessed != 0) { timingsNormalized.add(output.meta.timings.traversal / output.meta.nbEdgesAccessed); } } } System.out.println("\n" + useCaseName + " use-case:"); System.out.println("timings:"); Statistics stats = new Statistics(timings); stats.printAll(); System.out.println("timings normalized:"); Statistics statsNormalized = new Statistics(timingsNormalized); statsNormalized.printAll(); System.out.println("nb edges accessed:"); Statistics statsNbEdgesAccessed = new Statistics(nbEdgesAccessed); statsNbEdgesAccessed.printAll(); } /** - * Same as {@link timeEndpoint} but without destination or algorithm specified to endpoint call. + * Same as {@link #timeEndpoint} but without destination or algorithm specified to endpoint call. */ public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds, Function operation) throws IOException { timeEndpoint(useCaseName, graph, nodeIds, operation, null, null); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java index e6a5cd2..722dd74 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/ForkCC.java @@ -1,223 +1,216 @@ package org.softwareheritage.graph.benchmark; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; -import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.big.webgraph.Transform; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; public class ForkCC { public Boolean includeRootDir; private Graph graph; private Long emptySnapshot; private LongArrayBitVector visited; private LongArrayBitVector whitelist; private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = new Graph(graphBasename); + this.graph = new Graph(graphBasename).symmetrize(); System.err.println("Graph loaded."); this.emptySnapshot = null; this.whitelist = null; this.visited = null; this.includeRootDir = null; } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( ForkCC.class.getName(), "", new Parameter[] { new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't', "whitelist", "Whitelist of origins"), new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, 'w', "numthreads", "Number of threads"), new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, 'R', "includerootdir", "Include root directory (default: false)"), } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private boolean nodeIsEmptySnapshot(Long node) { if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } private Boolean shouldVisit(Long node){ Node.Type nt = this.graph.getNodeType(node); if (nt == Node.Type.CNT) { return false; } if (nt == Node.Type.DIR && !includeRootDir) return false; if (this.nodeIsEmptySnapshot(node)) return false; if (visited.getBoolean(node)) return false; return true; } - private ArrayList> compute(final ImmutableGraph graph, ProgressLogger pl) throws IOException { + private ArrayList> compute(ProgressLogger pl) throws IOException { final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 visited = LongArrayBitVector.ofLength(n); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting connected components visit..."); ArrayList> components = new ArrayList<>(); for (long i = 0; i < n; i++) { if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR) continue; ArrayList component = new ArrayList<>(); queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); Node.Type cur_nt = this.graph.getNodeType(currentNode); if (cur_nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(currentNode))) { // TODO: add a check that the origin has >=1 non-empty snapshot component.add(currentNode); } final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (!shouldVisit(succ)) continue; if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } if (component.size() > 0) { components.add(component); } } pl.done(); queue.close(); return components; } private static void printDistribution(ArrayList> components) { TreeMap distribution = new TreeMap<>(); for (ArrayList component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry entry : distribution.entrySet()) { System.out.format("%d %d\n", entry.getKey(), entry.getValue()); } } private static void printLargestComponent(ArrayList> components) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList component = components.get(indexLargest); for (Long node : component) { System.out.println(node); } } private void parseWhitelist(String path) { System.err.println("Loading whitelist " + path + " ..."); - this.whitelist = LongArrayBitVector.ofLength(this.graph.getNbNodes()); + this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes()); Scanner scanner = null; try { scanner = new Scanner(new File(path)); while(scanner.hasNextLong()) { whitelist.set(scanner.nextLong()); } System.err.println("Whitelist loaded."); } catch (FileNotFoundException e) { e.printStackTrace(); } } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); int numThreads = config.getInt("numThreads"); String whitelistPath = config.getString("whitelistPath"); boolean includeRootDir = config.getBoolean("includeRootDir"); ForkCC forkCc = new ForkCC(); try { forkCc.load_graph(graphPath); forkCc.includeRootDir = includeRootDir; } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } if (whitelistPath != null) { forkCc.parseWhitelist(whitelistPath); } - ImmutableGraph symmetric = Transform.union( - forkCc.graph.getBVGraph(false), - forkCc.graph.getBVGraph(true) - ); - ProgressLogger logger = new ProgressLogger(); try { - ArrayList> components = forkCc.compute(symmetric, logger); + ArrayList> components = forkCc.compute(logger); printDistribution(components); // printLargestComponent(components); } catch (IOException e) { e.printStackTrace(); } logger.done(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java index a6f1c79..402632c 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java @@ -1,93 +1,93 @@ package org.softwareheritage.graph.benchmark; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import java.io.IOException; import java.util.ArrayList; public class ListEmptyOrigins { private Graph graph; private Long emptySnapshot; private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); this.graph = new Graph(graphBasename); System.err.println("Graph loaded."); this.emptySnapshot = null; } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( ListEmptyOrigins.class.getName(), "", new Parameter[] { new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private boolean nodeIsEmptySnapshot(Long node) { System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node); if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } private ArrayList compute(ImmutableGraph graph) { final long n = graph.numNodes(); ArrayList bad = new ArrayList<>(); for (long i = 0; i < n; i++) { Node.Type nt = this.graph.getNodeType(i); if (nt != Node.Type.ORI) continue; final LazyLongIterator iterator = graph.successors(i); long succ; boolean found = false; while ((succ = iterator.nextLong()) != -1) { if (this.graph.outdegree(succ) > 0) { found = true; } } if (!found) bad.add(i); } return bad; } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); ListEmptyOrigins leo = new ListEmptyOrigins(); try { leo.load_graph(graphPath); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } - ArrayList badlist = leo.compute(leo.graph.getBVGraph(false)); + ArrayList badlist = leo.compute(leo.graph); for (Long bad : badlist) { System.out.println(bad); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java index abcec76..df93c96 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java @@ -1,67 +1,67 @@ package org.softwareheritage.graph.benchmark.utils; import java.util.PrimitiveIterator; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; /** * Random related utility class. * * @author The Software Heritage developers */ public class Random { /** Internal pseudorandom generator */ java.util.Random random; /** * Constructor. */ public Random() { this.random = new java.util.Random(); } /** * Constructor. * * @param seed random generator seed */ public Random(long seed) { this.random = new java.util.Random(seed); } /** * Generates random node ids. * * @param graph graph used to pick node ids * @param nbNodes number of node ids to generate * @return an array of random node ids */ public long[] generateNodeIds(Graph graph, int nbNodes) { - return random.longs(nbNodes, 0, graph.getNbNodes()).toArray(); + return random.longs(nbNodes, 0, graph.numNodes()).toArray(); } /** * Generates random node ids with a specific type. * * @param graph graph used to pick node ids * @param nbNodes number of node ids to generate * @param expectedType specific node type to pick * @return an array of random node ids */ public long[] generateNodeIdsOfType(Graph graph, int nbNodes, Node.Type expectedType) { - PrimitiveIterator.OfLong nodes = random.longs(0, graph.getNbNodes()).iterator(); + PrimitiveIterator.OfLong nodes = random.longs(0, graph.numNodes()).iterator(); long[] nodeIds = new long[nbNodes]; long nextId; for (int i = 0; i < nbNodes; i++) { do { nextId = nodes.nextLong(); } while (graph.getNodeType(nextId) != expectedType); nodeIds[i] = nextId; } return nodeIds; } } diff --git a/swh/graph/graph.py b/swh/graph/graph.py index fda4c68..fcc71ba 100644 --- a/swh/graph/graph.py +++ b/swh/graph/graph.py @@ -1,185 +1,185 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import asyncio import contextlib import functools from swh.graph.backend import Backend from swh.graph.dot import dot_to_svg, graph_dot, KIND_TO_SHAPE BASE_URL = "https://archive.softwareheritage.org/browse" KIND_TO_URL_FRAGMENT = { "ori": "/origin/{}", "snp": "/snapshot/{}", "rel": "/release/{}", "rev": "/revision/{}", "dir": "/directory/{}", "cnt": "/content/sha1_git:{}/", } def call_async_gen(generator, *args, **kwargs): loop = asyncio.get_event_loop() it = generator(*args, **kwargs).__aiter__() while True: try: res = loop.run_until_complete(it.__anext__()) yield res except StopAsyncIteration: break class Neighbors: """Neighbor iterator with custom O(1) length method""" def __init__(self, graph, iterator, length_func): self.graph = graph self.iterator = iterator self.length_func = length_func def __iter__(self): return self def __next__(self): succ = self.iterator.nextLong() if succ == -1: raise StopIteration return GraphNode(self.graph, succ) def __len__(self): return self.length_func() class GraphNode: """Node in the SWH graph""" def __init__(self, graph, node_id): self.graph = graph self.id = node_id def children(self): return Neighbors( self.graph, self.graph.java_graph.successors(self.id), lambda: self.graph.java_graph.outdegree(self.id), ) def parents(self): return Neighbors( self.graph, self.graph.java_graph.predecessors(self.id), lambda: self.graph.java_graph.indegree(self.id), ) def simple_traversal(self, ttype, direction="forward", edges="*"): for node in call_async_gen( self.graph.backend.simple_traversal, ttype, direction, edges, self.id ): yield self.graph[node] def leaves(self, *args, **kwargs): yield from self.simple_traversal("leaves", *args, **kwargs) def visit_nodes(self, *args, **kwargs): yield from self.simple_traversal("visit_nodes", *args, **kwargs) def visit_edges(self, direction="forward", edges="*"): for src, dst in call_async_gen( self.graph.backend.visit_edges, direction, edges, self.id ): yield (self.graph[src], self.graph[dst]) def visit_paths(self, direction="forward", edges="*"): for path in call_async_gen( self.graph.backend.visit_paths, direction, edges, self.id ): yield [self.graph[node] for node in path] def walk(self, dst, direction="forward", edges="*", traversal="dfs"): for node in call_async_gen( self.graph.backend.walk, direction, edges, traversal, self.id, dst ): yield self.graph[node] def _count(self, ttype, direction="forward", edges="*"): return self.graph.backend.count(ttype, direction, edges, self.id) count_leaves = functools.partialmethod(_count, ttype="leaves") count_neighbors = functools.partialmethod(_count, ttype="neighbors") count_visit_nodes = functools.partialmethod(_count, ttype="visit_nodes") @property def pid(self): return self.graph.node2pid[self.id] @property def kind(self): return self.pid.split(":")[2] def __str__(self): return self.pid def __repr__(self): return "<{}>".format(self.pid) def dot_fragment(self): swh, version, kind, hash = self.pid.split(":") label = "{}:{}..{}".format(kind, hash[0:2], hash[-2:]) url = BASE_URL + KIND_TO_URL_FRAGMENT[kind].format(hash) shape = KIND_TO_SHAPE[kind] return '{} [label="{}", href="{}", target="_blank", shape="{}"];'.format( self.id, label, url, shape ) def _repr_svg_(self): nodes = [self, *list(self.children()), *list(self.parents())] dot = graph_dot(nodes) svg = dot_to_svg(dot) return svg class Graph: def __init__(self, backend, node2pid, pid2node): self.backend = backend self.java_graph = backend.entry.get_graph() self.node2pid = node2pid self.pid2node = pid2node def stats(self): return self.backend.stats() @property def path(self): return self.java_graph.getPath() def __len__(self): - return self.java_graph.getNbNodes() + return self.java_graph.numNodes() def __getitem__(self, node_id): if isinstance(node_id, int): self.node2pid[node_id] # check existence return GraphNode(self, node_id) elif isinstance(node_id, str): node_id = self.pid2node[node_id] return GraphNode(self, node_id) def __iter__(self): for pid, pos in self.backend.pid2node: yield self[pid] def iter_prefix(self, prefix): for pid, pos in self.backend.pid2node.iter_prefix(prefix): yield self[pid] def iter_type(self, pid_type): for pid, pos in self.backend.pid2node.iter_type(pid_type): yield self[pid] @contextlib.contextmanager def load(graph_path): with Backend(graph_path) as backend: yield Graph(backend, backend.node2pid, backend.pid2node)