diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
index 1d63774..dadf2b6 100644
--- a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
+++ b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
@@ -1,91 +1,88 @@
 package org.softwareheritage.graph;

 import java.util.ArrayList;

-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
-
 /**
  * Edge restriction based on node types, used when visiting the graph.
  *
  * The Software Heritage graph contains multiple node types (contents, directories, revisions, ...)
  * and restricting the traversal to specific node types is necessary for many querying use cases.
  *
  * @author The Software Heritage developers
  */
 public class AllowedEdges {
-    /** Graph on which edge restriction is performed */
-    Graph graph;
     /**
      * 2D boolean matrix storing access rights for all combinations of src/dst node types (first
      * dimension is source, second dimension is destination); when edge restriction is not enforced,
      * this array is set to null for early bypass.
      */
     public boolean[][] restrictedTo;
+    /** Graph on which edge restriction is performed */
+    Graph graph;

     /**
      * Constructor.
      *
      * @param graph the graph on which to perform edge restriction
      * @param edgesFmt a formatted string describing allowed edges
      */
     public AllowedEdges(Graph graph, String edgesFmt) {
         this.graph = graph;

         int nbNodeTypes = Node.Type.values().length;
         this.restrictedTo = new boolean[nbNodeTypes][nbNodeTypes];

         // Special values (null, empty, "*")
         if (edgesFmt == null || edgesFmt.isEmpty()) {
             return;
         }
         if (edgesFmt.equals("*")) {
             // Allows for quick bypass (with simple null check) when no edge restriction
             restrictedTo = null;
             return;
         }

         // Format: "src1:dst1,src2:dst2,[...]"
         String[] edgeTypes = edgesFmt.split(",");
         for (String edgeType : edgeTypes) {
             String[] nodeTypes = edgeType.split(":");
             if (nodeTypes.length != 2) {
                 throw new IllegalArgumentException("Cannot parse edge type: " + edgeType);
             }

             ArrayList<Node.Type> srcTypes = Node.Type.parse(nodeTypes[0]);
             ArrayList<Node.Type> dstTypes = Node.Type.parse(nodeTypes[1]);
             for (Node.Type srcType : srcTypes) {
                 for (Node.Type dstType : dstTypes) {
                     restrictedTo[srcType.ordinal()][dstType.ordinal()] = true;
                 }
             }
         }
     }

     /**
      * Checks if a given edge can be followed during graph traversal.
      *
      * @param srcNodeId edge source node
      * @param dstNodeId edge destination node
      * @return true if allowed and false otherwise
      */
     public boolean isAllowed(long srcNodeId, long dstNodeId) {
         Node.Type srcType = graph.getNodeType(srcNodeId);
         Node.Type dstType = graph.getNodeType(dstNodeId);
         return restrictedTo[srcType.ordinal()][dstType.ordinal()];
     }

     /**
      * Works like {@link AllowedEdges#isAllowed(long, long)} but with node types directly instead of
      * node ids.
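For illustration only, not part of this patch: a minimal, self-contained sketch of how an edge-restriction string such as "rev:rev,rev:dir" expands into the src/dst boolean matrix that AllowedEdges builds above. The NodeType enum below is a stand-in for Node.Type, so class and method names here are assumptions rather than the project's API.

import java.util.Locale;

public class EdgeMatrixSketch {
    // Stand-in for org.softwareheritage.graph.Node.Type (same six types, declaration order assumed).
    enum NodeType { CNT, DIR, ORI, REL, REV, SNP }

    // Expand "src1:dst1,src2:dst2" into an allowed[src][dst] matrix; "*" on either side means all types.
    static boolean[][] parse(String edgesFmt) {
        int n = NodeType.values().length;
        boolean[][] allowed = new boolean[n][n];
        if (edgesFmt == null || edgesFmt.isEmpty()) {
            return allowed; // nothing allowed
        }
        for (String edgeType : edgesFmt.split(",")) {
            String[] pair = edgeType.split(":");
            if (pair.length != 2) {
                throw new IllegalArgumentException("Cannot parse edge type: " + edgeType);
            }
            for (NodeType src : expand(pair[0])) {
                for (NodeType dst : expand(pair[1])) {
                    allowed[src.ordinal()][dst.ordinal()] = true;
                }
            }
        }
        return allowed;
    }

    private static NodeType[] expand(String s) {
        return s.equals("*") ? NodeType.values()
                : new NodeType[]{NodeType.valueOf(s.toUpperCase(Locale.ROOT))};
    }

    public static void main(String[] args) {
        boolean[][] allowed = parse("rev:rev,rev:dir");
        System.out.println(allowed[NodeType.REV.ordinal()][NodeType.DIR.ordinal()]); // true
        System.out.println(allowed[NodeType.DIR.ordinal()][NodeType.CNT.ordinal()]); // false
    }
}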
* * @see AllowedEdges#isAllowed(long, long) */ public boolean isAllowed(Node.Type srcType, Node.Type dstType) { return restrictedTo[srcType.ordinal()][dstType.ordinal()]; } } diff --git a/java/src/main/java/org/softwareheritage/graph/Entry.java b/java/src/main/java/org/softwareheritage/graph/Entry.java index b496687..efd4915 100644 --- a/java/src/main/java/org/softwareheritage/graph/Entry.java +++ b/java/src/main/java/org/softwareheritage/graph/Entry.java @@ -1,192 +1,193 @@ package org.softwareheritage.graph; -import java.util.ArrayList; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.PropertyNamingStrategy; + import java.io.DataOutputStream; import java.io.FileOutputStream; import java.io.IOException; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.PropertyNamingStrategy; +import java.util.ArrayList; public class Entry { - private Graph graph; - private final long PATH_SEPARATOR_ID = -1; + private Graph graph; public void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); this.graph = new Graph(graphBasename); System.err.println("Graph loaded."); } public Graph get_graph() { return graph.copy(); } public String stats() { try { Stats stats = new Stats(graph.getPath()); ObjectMapper objectMapper = new ObjectMapper(); objectMapper.setPropertyNamingStrategy(PropertyNamingStrategy.SNAKE_CASE); return objectMapper.writeValueAsString(stats); } catch (IOException e) { throw new RuntimeException("Cannot read stats: " + e); } } - private interface NodeCountVisitor { - void accept(long nodeId, Traversal.NodeIdConsumer consumer); - } - private int count_visitor(NodeCountVisitor f, long srcNodeId) { - int[] count = { 0 }; - f.accept(srcNodeId, (node) -> { count[0]++; }); + int[] count = {0}; + f.accept(srcNodeId, (node) -> { + count[0]++; + }); return count[0]; } public int count_leaves(String direction, String edgesFmt, long srcNodeId) { Traversal t = new Traversal(this.graph.copy(), direction, edgesFmt); return count_visitor(t::leavesVisitor, srcNodeId); } public int count_neighbors(String direction, String edgesFmt, long srcNodeId) { Traversal t = new Traversal(this.graph.copy(), direction, edgesFmt); return count_visitor(t::neighborsVisitor, srcNodeId); } public int count_visit_nodes(String direction, String edgesFmt, long srcNodeId) { Traversal t = new Traversal(this.graph.copy(), direction, edgesFmt); return count_visitor(t::visitNodesVisitor, srcNodeId); } public QueryHandler get_handler(String clientFIFO) { return new QueryHandler(this.graph.copy(), clientFIFO); } + private interface NodeCountVisitor { + void accept(long nodeId, Traversal.NodeIdConsumer consumer); + } + public class QueryHandler { Graph graph; DataOutputStream out; String clientFIFO; public QueryHandler(Graph graph, String clientFIFO) { this.graph = graph; this.clientFIFO = clientFIFO; this.out = null; } public void writeNode(long nodeId) { try { out.writeLong(nodeId); } catch (IOException e) { throw new RuntimeException("Cannot write response to client: " + e); } } public void writeEdge(long srcId, long dstId) { writeNode(srcId); writeNode(dstId); } public void writePath(ArrayList path) { for (Long nodeId : path) { writeNode(nodeId); } writeNode(PATH_SEPARATOR_ID); } public void open() { try { FileOutputStream file = new FileOutputStream(this.clientFIFO); this.out = new DataOutputStream(file); } catch (IOException e) { throw new RuntimeException("Cannot open client FIFO: 
" + e); } } public void close() { try { out.close(); } catch (IOException e) { throw new RuntimeException("Cannot write response to client: " + e); } } public void leaves(String direction, String edgesFmt, long srcNodeId) { open(); Traversal t = new Traversal(this.graph, direction, edgesFmt); t.leavesVisitor(srcNodeId, this::writeNode); close(); } public void neighbors(String direction, String edgesFmt, long srcNodeId) { open(); Traversal t = new Traversal(this.graph, direction, edgesFmt); t.neighborsVisitor(srcNodeId, this::writeNode); close(); } public void visit_nodes(String direction, String edgesFmt, long srcNodeId) { open(); Traversal t = new Traversal(this.graph, direction, edgesFmt); t.visitNodesVisitor(srcNodeId, this::writeNode); close(); } public void visit_edges(String direction, String edgesFmt, long srcNodeId) { open(); Traversal t = new Traversal(this.graph, direction, edgesFmt); t.visitNodesVisitor(srcNodeId, null, this::writeEdge); close(); } public void visit_paths(String direction, String edgesFmt, long srcNodeId) { open(); Traversal t = new Traversal(this.graph, direction, edgesFmt); t.visitPathsVisitor(srcNodeId, this::writePath); close(); } public void walk(String direction, String edgesFmt, String algorithm, long srcNodeId, long dstNodeId) { open(); Traversal t = new Traversal(this.graph, direction, edgesFmt); for (Long nodeId : t.walk(srcNodeId, dstNodeId, algorithm)) { writeNode(nodeId); } close(); } public void walk_type(String direction, String edgesFmt, String algorithm, long srcNodeId, String dst) { open(); Node.Type dstType = Node.Type.fromStr(dst); Traversal t = new Traversal(this.graph, direction, edgesFmt); for (Long nodeId : t.walk(srcNodeId, dstType, algorithm)) { writeNode(nodeId); } close(); } public void random_walk(String direction, String edgesFmt, int retries, long srcNodeId, long dstNodeId) { open(); Traversal t = new Traversal(this.graph, direction, edgesFmt); for (Long nodeId : t.randomWalk(srcNodeId, dstNodeId, retries)) { writeNode(nodeId); } close(); } public void random_walk_type(String direction, String edgesFmt, int retries, long srcNodeId, String dst) { open(); Node.Type dstType = Node.Type.fromStr(dst); Traversal t = new Traversal(this.graph, direction, edgesFmt); for (Long nodeId : t.randomWalk(srcNodeId, dstType, retries)) { writeNode(nodeId); } close(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/Graph.java b/java/src/main/java/org/softwareheritage/graph/Graph.java index 2fce7cb..8103229 100644 --- a/java/src/main/java/org/softwareheritage/graph/Graph.java +++ b/java/src/main/java/org/softwareheritage/graph/Graph.java @@ -1,253 +1,256 @@ package org.softwareheritage.graph; -import java.io.IOException; - -import it.unimi.dsi.big.webgraph.*; - +import it.unimi.dsi.big.webgraph.BVGraph; +import it.unimi.dsi.big.webgraph.ImmutableGraph; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import it.unimi.dsi.big.webgraph.Transform; import org.softwareheritage.graph.maps.NodeIdMap; import org.softwareheritage.graph.maps.NodeTypesMap; +import java.io.IOException; + /** * Main class storing the compressed graph and node id mappings. *

* The compressed graph is stored using the WebGraph * ecosystem. Additional mappings are necessary because Software Heritage uses string based persistent * identifiers (PID) while WebGraph uses integers internally. These two mappings (long id ↔ * PID) are used for the input (users refer to the graph using PID) and the output (convert back to * PID for users results). However, since graph traversal can be restricted depending on the node * type (see {@link AllowedEdges}), a long id → node type map is stored as well to avoid a full * PID lookup. * * @author The Software Heritage developers * @see org.softwareheritage.graph.AllowedEdges * @see org.softwareheritage.graph.maps.NodeIdMap * @see org.softwareheritage.graph.maps.NodeTypesMap */ public class Graph extends ImmutableGraph { /** File extension for the SWH PID to long node id map */ public static final String PID_TO_NODE = ".pid2node.bin"; /** File extension for the long node id to SWH PID map */ public static final String NODE_TO_PID = ".node2pid.bin"; /** File extension for the long node id to node type map */ public static final String NODE_TO_TYPE = ".node2type.map"; /** Compressed graph stored as a {@link it.unimi.dsi.big.webgraph.BVGraph} */ ImmutableGraph graph; /** Transposed compressed graph (used for backward traversals) */ ImmutableGraph graphTransposed; /** Path and basename of the compressed graph */ String path; /** Mapping long id ↔ SWH PIDs */ NodeIdMap nodeIdMap; /** Mapping long id → node types */ NodeTypesMap nodeTypesMap; /** * Constructor. * * @param path path and basename of the compressed graph to load */ public Graph(String path) throws IOException { this.path = path; this.graph = BVGraph.loadMapped(path); this.graphTransposed = BVGraph.loadMapped(path + "-transposed"); this.nodeTypesMap = new NodeTypesMap(path); this.nodeIdMap = new NodeIdMap(path, numNodes()); } protected Graph(ImmutableGraph graph, ImmutableGraph graphTransposed, String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) { this.graph = graph; this.graphTransposed = graphTransposed; this.path = path; this.nodeIdMap = nodeIdMap; this.nodeTypesMap = nodeTypesMap; } /** * Return a flyweight copy of the graph. */ @Override public Graph copy() { return new Graph(this.graph.copy(), this.graphTransposed.copy(), this.path, this.nodeIdMap, this.nodeTypesMap); } @Override public boolean randomAccess() { return graph.randomAccess() && graphTransposed.randomAccess(); } /** * Return a transposed version of the graph. */ public Graph transpose() { return new Graph(this.graphTransposed, this.graph, this.path, this.nodeIdMap, this.nodeTypesMap); } /** * Return a symmetric version of the graph. */ public Graph symmetrize() { ImmutableGraph symmetric = Transform.union(graph, graphTransposed); return new Graph(symmetric, symmetric, this.path, this.nodeIdMap, this.nodeTypesMap); } /** * Cleans up graph resources after use. */ public void cleanUp() throws IOException { nodeIdMap.close(); } + /** * Returns number of nodes in the graph. * * @return number of nodes in the graph */ @Override public long numNodes() { return graph.numNodes(); } /** * Returns number of edges in the graph. * * @return number of edges in the graph */ @Override public long numArcs() { return graph.numArcs(); } /** * Returns lazy iterator of successors of a node. 
* * @param nodeId node specified as a long id * @return lazy iterator of successors of the node, specified as a WebGraph LazyLongIterator */ @Override public LazyLongIterator successors(long nodeId) { return graph.successors(nodeId); } /** * Returns lazy iterator of successors of a node while following a specific set of edge types. * * @param nodeId node specified as a long id * @param allowedEdges the specification of which edges can be traversed * @return lazy iterator of successors of the node, specified as a WebGraph LazyLongIterator */ public LazyLongIterator successors(long nodeId, AllowedEdges allowedEdges) { if (allowedEdges.restrictedTo == null) { // All edges are allowed, bypass edge check return this.successors(nodeId); } else { LazyLongIterator allSuccessors = this.successors(nodeId); return new LazyLongIterator() { @Override public long nextLong() { long neighbor; while ((neighbor = allSuccessors.nextLong()) != -1) { if (allowedEdges.isAllowed(nodeId, neighbor)) { return neighbor; } } return -1; } @Override public long skip(final long n) { long i; - for (i = 0; i < n && nextLong() != -1; i++); + for (i = 0; i < n && nextLong() != -1; i++) ; return i; } }; } } /** * Returns the outdegree of a node. * * @param nodeId node specified as a long id * @return outdegree of a node */ @Override public long outdegree(long nodeId) { return graph.outdegree(nodeId); } /** * Returns lazy iterator of predecessors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of predecessors of the node, specified as a WebGraph LazyLongIterator */ public LazyLongIterator predecessors(long nodeId) { return this.transpose().successors(nodeId); } /** * Returns the indegree of a node. * * @param nodeId node specified as a long id * @return indegree of a node */ public long indegree(long nodeId) { return this.transpose().outdegree(nodeId); } /** * Returns the underlying BVGraph. * * @return WebGraph BVGraph */ public ImmutableGraph getGraph() { return this.graph; } /** * Returns the graph full path. * * @return graph full path */ public String getPath() { return path; } /** * Converts {@link SwhPID} node to long. * * @param swhPID node specified as a {@link SwhPID} * @return internal long node id * @see org.softwareheritage.graph.SwhPID */ public long getNodeId(SwhPID swhPID) { return nodeIdMap.getNodeId(swhPID); } /** * Converts long id node to {@link SwhPID}. * * @param nodeId node specified as a long id * @return external SWH PID * @see org.softwareheritage.graph.SwhPID */ public SwhPID getSwhPID(long nodeId) { return nodeIdMap.getSwhPID(nodeId); } /** * Returns node type. * * @param nodeId node specified as a long id * @return corresponding node type * @see org.softwareheritage.graph.Node.Type */ public Node.Type getNodeType(long nodeId) { return nodeTypesMap.getType(nodeId); } } diff --git a/java/src/main/java/org/softwareheritage/graph/Node.java b/java/src/main/java/org/softwareheritage/graph/Node.java index 0f18ee0..6f10d4f 100644 --- a/java/src/main/java/org/softwareheritage/graph/Node.java +++ b/java/src/main/java/org/softwareheritage/graph/Node.java @@ -1,105 +1,117 @@ package org.softwareheritage.graph; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * A node in the Software Heritage graph. * * @author The Software Heritage developers */ public class Node { /** * Software Heritage graph node types, as described in the * data model. 
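The filtered successors(nodeId, allowedEdges) method above wraps WebGraph's LazyLongIterator and silently skips neighbors whose edge type is not allowed. A dependency-free sketch of the same wrapping pattern follows; the LongSeq interface is only a stand-in for LazyLongIterator, not a real WebGraph type.

import java.util.function.LongPredicate;

public class FilteredIteratorSketch {
    // Minimal stand-in for WebGraph's LazyLongIterator: nextLong() returns -1 when exhausted.
    interface LongSeq {
        long nextLong();
    }

    // Wrap an iterator so only values accepted by the predicate are returned,
    // mirroring how the filtered successors iterator skips forbidden edges.
    static LongSeq filtered(LongSeq base, LongPredicate keep) {
        return () -> {
            long v;
            while ((v = base.nextLong()) != -1) {
                if (keep.test(v)) {
                    return v;
                }
            }
            return -1;
        };
    }

    public static void main(String[] args) {
        long[] successors = {2, 3, 4, 5};
        int[] pos = {0};
        LongSeq base = () -> pos[0] < successors.length ? successors[pos[0]++] : -1;
        LongSeq evenOnly = filtered(base, v -> v % 2 == 0);
        for (long v; (v = evenOnly.nextLong()) != -1; ) {
            System.out.println(v); // prints 2 then 4
        }
    }
}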
*/ public enum Type { /** Content node */ CNT, /** Directory node */ DIR, /** Origin node */ ORI, /** Release node */ REL, /** Revision node */ REV, /** Snapshot node */ SNP; /** * Converts integer to corresponding SWH node type. * * @param intType node type represented as an integer * @return the corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public static Node.Type fromInt(int intType) { switch (intType) { - case 0: return CNT; - case 1: return DIR; - case 2: return ORI; - case 3: return REL; - case 4: return REV; - case 5: return SNP; + case 0: + return CNT; + case 1: + return DIR; + case 2: + return ORI; + case 3: + return REL; + case 4: + return REV; + case 5: + return SNP; } return null; } /** * Converts node types to the corresponding int value * * @param type node type as an enum * @return the corresponding int value */ public static int toInt(Node.Type type) { - switch (type) { - case CNT: return 0; - case DIR: return 1; - case ORI: return 2; - case REL: return 3; - case REV: return 4; - case SNP: return 5; + switch (type) { + case CNT: + return 0; + case DIR: + return 1; + case ORI: + return 2; + case REL: + return 3; + case REV: + return 4; + case SNP: + return 5; } throw new IllegalArgumentException("Unknown node type: " + type); } /** * Converts string to corresponding SWH node type. * * @param strType node type represented as a string * @return the corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public static Node.Type fromStr(String strType) { if (!strType.matches("cnt|dir|ori|rel|rev|snp")) { throw new IllegalArgumentException("Unknown node type: " + strType); } return Node.Type.valueOf(strType.toUpperCase()); } /** * Parses SWH node type possible values from formatted string (see the API * syntax). * * @param strFmtType node types represented as a formatted string * @return a list containing the {@link Node.Type} values * @see org.softwareheritage.graph.Node.Type */ public static ArrayList parse(String strFmtType) { ArrayList types = new ArrayList<>(); if (strFmtType.equals("*")) { List nodeTypes = Arrays.asList(Node.Type.values()); types.addAll(nodeTypes); } else { types.add(Node.Type.fromStr(strFmtType)); } return types; } } } diff --git a/java/src/main/java/org/softwareheritage/graph/Stats.java b/java/src/main/java/org/softwareheritage/graph/Stats.java index b7200d6..92d4fd8 100644 --- a/java/src/main/java/org/softwareheritage/graph/Stats.java +++ b/java/src/main/java/org/softwareheritage/graph/Stats.java @@ -1,68 +1,67 @@ package org.softwareheritage.graph; import java.io.FileInputStream; import java.io.IOException; import java.util.Properties; /** * Statistics on the compressed graph. *

* These statistics are not computed but directly read from WebGraph generated .stats and .properties files. * * @author The Software Heritage developers */ public class Stats { - public static class Counts { - public long nodes; - public long edges; - } - - public static class Ratios { - public double compression; - public double bitsPerNode; - public double bitsPerEdge; - public double avgLocality; - } - - public static class Degree { - public long min; - public long max; - public double avg; - } - public Counts counts; public Ratios ratios; public Degree indegree; public Degree outdegree; - /** * Constructor. * * @param graphPath path and basename of compressed graph */ public Stats(String graphPath) throws IOException { Properties properties = new Properties(); properties.load(new FileInputStream(graphPath + ".properties")); properties.load(new FileInputStream(graphPath + ".stats")); this.counts = new Counts(); this.ratios = new Ratios(); this.indegree = new Degree(); this.outdegree = new Degree(); this.counts.nodes = Long.parseLong(properties.getProperty("nodes")); this.counts.edges = Long.parseLong(properties.getProperty("arcs")); this.ratios.compression = Double.parseDouble(properties.getProperty("compratio")); this.ratios.bitsPerNode = Double.parseDouble(properties.getProperty("bitspernode")); this.ratios.bitsPerEdge = Double.parseDouble(properties.getProperty("bitsperlink")); this.ratios.avgLocality = Double.parseDouble(properties.getProperty("avglocality")); this.indegree.min = Long.parseLong(properties.getProperty("minindegree")); this.indegree.max = Long.parseLong(properties.getProperty("maxindegree")); this.indegree.avg = Double.parseDouble(properties.getProperty("avgindegree")); this.outdegree.min = Long.parseLong(properties.getProperty("minoutdegree")); this.outdegree.max = Long.parseLong(properties.getProperty("maxoutdegree")); this.outdegree.avg = Double.parseDouble(properties.getProperty("avgoutdegree")); } + + public static class Counts { + public long nodes; + public long edges; + } + + public static class Ratios { + public double compression; + public double bitsPerNode; + public double bitsPerEdge; + public double avgLocality; + } + + public static class Degree { + public long min; + public long max; + public double avg; + } } diff --git a/java/src/main/java/org/softwareheritage/graph/SwhPID.java b/java/src/main/java/org/softwareheritage/graph/SwhPID.java index c355834..c44abdc 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhPID.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhPID.java @@ -1,126 +1,124 @@ package org.softwareheritage.graph; -import java.lang.System; - import com.fasterxml.jackson.annotation.JsonValue; -import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.DecoderException; - -import org.softwareheritage.graph.Node; +import org.apache.commons.codec.binary.Hex; /** * A Software Heritage PID, see persistent * identifier documentation. * * @author The Software Heritage developers */ public class SwhPID { /** Fixed hash length of the PID */ public static final int HASH_LENGTH = 40; /** Full PID as a string */ String swhPID; /** PID node type */ Node.Type type; /** * Constructor. 
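The Stats constructor above does no computation of its own; it parses the WebGraph-generated .properties/.stats key/value files with java.util.Properties. A tiny self-contained sketch of that parsing step, with made-up values standing in for real graph statistics:

import java.io.IOException;
import java.io.StringReader;
import java.util.Properties;

public class StatsSketch {
    public static void main(String[] args) throws IOException {
        // Stand-in for the contents of the .properties/.stats files (values are made up).
        String props = "nodes=1000\narcs=5000\ncompratio=1.25\navgindegree=5.0\n";
        Properties properties = new Properties();
        properties.load(new StringReader(props));
        long nodes = Long.parseLong(properties.getProperty("nodes"));
        long arcs = Long.parseLong(properties.getProperty("arcs"));
        double avgIn = Double.parseDouble(properties.getProperty("avgindegree"));
        System.out.println(nodes + " nodes, " + arcs + " arcs, average indegree " + avgIn);
    }
}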
* * @param swhPID full PID as a string */ public SwhPID(String swhPID) { this.swhPID = swhPID; // PID format: 'swh:1:type:hash' String[] parts = swhPID.split(":"); if (parts.length != 4 || !parts[0].equals("swh") || !parts[1].equals("1")) { throw new IllegalArgumentException("malformed SWH PID: " + swhPID); } this.type = Node.Type.fromStr(parts[2]); if (!parts[3].matches("[0-9a-f]{" + HASH_LENGTH + "}")) { throw new IllegalArgumentException("malformed SWH PID: " + swhPID); } } + /** + * Creates a SwhPID from a compact binary representation. + *

+ * The binary format is specified in the Python module + * swh.graph.pid:str_to_bytes . + */ + public static SwhPID fromBytes(byte[] input) { + byte[] digest = new byte[20]; + System.arraycopy(input, 2, digest, 0, digest.length); + + String pidStr = String.format( + "swh:%d:%s:%s", + input[0], + Node.Type.fromInt(input[1]).toString().toLowerCase(), + Hex.encodeHexString(digest) + ); + return new SwhPID(pidStr); + } + @Override public boolean equals(Object otherObj) { if (otherObj == this) return true; if (!(otherObj instanceof SwhPID)) return false; SwhPID other = (SwhPID) otherObj; return swhPID.equals(other.getSwhPID()); } @Override public int hashCode() { return swhPID.hashCode(); } @Override public String toString() { return swhPID; } - /** Converts PID to a compact binary representation. - * + /** + * Converts PID to a compact binary representation. + *

* The binary format is specified in the Python module * swh.graph.pid:str_to_bytes . */ public byte[] toBytes() { byte[] bytes = new byte[22]; byte[] digest; bytes[0] = (byte) 1; // namespace version bytes[1] = (byte) Node.Type.toInt(this.type); // PID type try { digest = Hex.decodeHex(this.swhPID.substring(10)); // SHA1 hash System.arraycopy(digest, 0, bytes, 2, digest.length); } catch (DecoderException e) { throw new IllegalArgumentException("invalid hex sequence in PID: " + this.swhPID); } return bytes; } - /** Creates a SwhPID from a compact binary representation. - * - * The binary format is specified in the Python module - * swh.graph.pid:str_to_bytes . - */ - public static SwhPID fromBytes(byte[] input) { - byte[] digest = new byte[20]; - System.arraycopy(input, 2, digest, 0, digest.length); - - String pidStr = String.format( - "swh:%d:%s:%s", - input[0], - Node.Type.fromInt(input[1]).toString().toLowerCase(), - Hex.encodeHexString(digest) - ); - return new SwhPID(pidStr); - } - /** * Returns full PID as a string. * * @return full PID string */ @JsonValue public String getSwhPID() { return swhPID; } /** * Returns PID node type. * * @return PID corresponding {@link Node.Type} * @see org.softwareheritage.graph.Node.Type */ public Node.Type getType() { return type; } } diff --git a/java/src/main/java/org/softwareheritage/graph/Traversal.java b/java/src/main/java/org/softwareheritage/graph/Traversal.java index 421c02e..5c4902f 100644 --- a/java/src/main/java/org/softwareheritage/graph/Traversal.java +++ b/java/src/main/java/org/softwareheritage/graph/Traversal.java @@ -1,523 +1,526 @@ package org.softwareheritage.graph; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import org.softwareheritage.graph.server.Endpoint; + import java.util.*; import java.util.function.Consumer; import java.util.function.LongConsumer; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import org.softwareheritage.graph.server.Endpoint; - /** * Traversal algorithms on the compressed graph. *
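The toBytes()/fromBytes() pair above packs a PID into 22 bytes: the namespace version, the node type as an integer (same numbering as Node.Type.toInt), then the 20-byte SHA1 digest. A self-contained round-trip sketch of that layout, using plain-Java hex handling instead of commons-codec (the NodeType enum is again a stand-in for Node.Type):

public class PidBytesSketch {
    // Same six types in the same order as Node.Type, so ordinal() matches Node.Type.toInt().
    enum NodeType { CNT, DIR, ORI, REL, REV, SNP }

    // "swh:1:rev:<40 hex chars>" -> [version, type, 20-byte digest]
    static byte[] encode(String pid) {
        String[] parts = pid.split(":");
        byte[] out = new byte[22];
        out[0] = Byte.parseByte(parts[1]); // namespace version
        out[1] = (byte) NodeType.valueOf(parts[2].toUpperCase()).ordinal(); // PID type
        for (int i = 0; i < 20; i++) { // SHA1 hash
            out[2 + i] = (byte) Integer.parseInt(parts[3].substring(2 * i, 2 * i + 2), 16);
        }
        return out;
    }

    static String decode(byte[] in) {
        StringBuilder hex = new StringBuilder();
        for (int i = 2; i < 22; i++) {
            hex.append(String.format("%02x", in[i] & 0xff));
        }
        return String.format("swh:%d:%s:%s", in[0], NodeType.values()[in[1]].name().toLowerCase(), hex);
    }

    public static void main(String[] args) {
        String pid = "swh:1:rev:0000000000000000000000000000000000000003";
        System.out.println(decode(encode(pid)).equals(pid)); // true
    }
}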

* Internal implementation of the traversal API endpoints. These methods only input/output internal * long ids, which are converted in the {@link Endpoint} higher-level class to Software Heritage * PID. * * @author The Software Heritage developers * @see Endpoint */ public class Traversal { /** Graph used in the traversal */ Graph graph; /** Graph edge restriction */ AllowedEdges edges; /** Hash set storing if we have visited a node */ HashSet visited; /** Hash map storing parent node id for each nodes during a traversal */ Map parentNode; /** Number of edges accessed during traversal */ long nbEdgesAccessed; /** random number generator, for random walks */ Random rng; /** * Constructor. * * @param graph graph used in the traversal * @param direction a string (either "forward" or "backward") specifying edge orientation * @param edgesFmt a formatted string describing allowed edges */ public Traversal(Graph graph, String direction, String edgesFmt) { if (!direction.matches("forward|backward")) { throw new IllegalArgumentException("Unknown traversal direction: " + direction); } if (direction.equals("backward")) { this.graph = graph.transpose(); } else { this.graph = graph; } this.edges = new AllowedEdges(graph, edgesFmt); this.visited = new HashSet<>(); this.parentNode = new HashMap<>(); this.nbEdgesAccessed = 0; this.rng = new Random(); } /** * Returns number of accessed edges during traversal. * * @return number of edges accessed in last traversal */ public long getNbEdgesAccessed() { return nbEdgesAccessed; } /** * Returns number of accessed nodes during traversal. * * @return number of nodes accessed in last traversal */ public long getNbNodesAccessed() { return this.visited.size(); } /** * Push version of {@link #leaves} will fire passed callback for each leaf. */ public void leavesVisitor(long srcNodeId, NodeIdConsumer cb) { Stack stack = new Stack<>(); this.nbEdgesAccessed = 0; stack.push(srcNodeId); visited.add(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); long neighborsCnt = 0; nbEdgesAccessed += graph.outdegree(currentNodeId); LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { neighborsCnt++; if (!visited.contains(neighborNodeId)) { stack.push(neighborNodeId); visited.add(neighborNodeId); } } if (neighborsCnt == 0) { cb.accept(currentNodeId); } } } /** * Returns the leaves of a subgraph rooted at the specified source node. * * @param srcNodeId source node * @return list of node ids corresponding to the leaves */ public ArrayList leaves(long srcNodeId) { ArrayList nodeIds = new ArrayList<>(); leavesVisitor(srcNodeId, nodeIds::add); return nodeIds; } /** * Push version of {@link #neighbors}: will fire passed callback on each * neighbor. */ public void neighborsVisitor(long srcNodeId, NodeIdConsumer cb) { this.nbEdgesAccessed = graph.outdegree(srcNodeId); LazyLongIterator it = graph.successors(srcNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { cb.accept(neighborNodeId); } } /** * Returns node direct neighbors (linked with exactly one edge). 
* * @param srcNodeId source node * @return list of node ids corresponding to the neighbors */ public ArrayList neighbors(long srcNodeId) { ArrayList nodeIds = new ArrayList<>(); neighborsVisitor(srcNodeId, nodeIds::add); return nodeIds; } /** * Push version of {@link #visitNodes}: will fire passed callback on each * visited node. */ public void visitNodesVisitor(long srcNodeId, NodeIdConsumer nodeCb, EdgeIdConsumer edgeCb) { Stack stack = new Stack<>(); this.nbEdgesAccessed = 0; stack.push(srcNodeId); visited.add(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); if (nodeCb != null) { nodeCb.accept(currentNodeId); } nbEdgesAccessed += graph.outdegree(currentNodeId); LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { if (edgeCb != null) { edgeCb.accept(currentNodeId, neighborNodeId); } if (!visited.contains(neighborNodeId)) { stack.push(neighborNodeId); visited.add(neighborNodeId); } } } } /** One-argument version to handle callbacks properly */ public void visitNodesVisitor(long srcNodeId, NodeIdConsumer cb) { visitNodesVisitor(srcNodeId, cb, null); } /** * Performs a graph traversal and returns explored nodes. * * @param srcNodeId source node * @return list of explored node ids */ public ArrayList visitNodes(long srcNodeId) { ArrayList nodeIds = new ArrayList<>(); visitNodesVisitor(srcNodeId, nodeIds::add); return nodeIds; } /** * Push version of {@link #visitPaths}: will fire passed callback on each * discovered (complete) path. */ public void visitPathsVisitor(long srcNodeId, PathConsumer cb) { Stack currentPath = new Stack<>(); this.nbEdgesAccessed = 0; visitPathsInternalVisitor(srcNodeId, currentPath, cb); } /** * Performs a graph traversal and returns explored paths. * * @param srcNodeId source node * @return list of explored paths (represented as a list of node ids) */ public ArrayList> visitPaths(long srcNodeId) { ArrayList> paths = new ArrayList<>(); visitPathsVisitor(srcNodeId, paths::add); return paths; } private void visitPathsInternalVisitor(long currentNodeId, Stack currentPath, PathConsumer cb) { currentPath.push(currentNodeId); long visitedNeighbors = 0; nbEdgesAccessed += graph.outdegree(currentNodeId); LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { visitPathsInternalVisitor(neighborNodeId, currentPath, cb); visitedNeighbors++; } if (visitedNeighbors == 0) { ArrayList path = new ArrayList<>(currentPath); cb.accept(path); } currentPath.pop(); } /** * Performs a graph traversal with backtracking, and returns the first * found path from source to destination. 
* * @param srcNodeId source node * @param dst destination (either a node or a node type) * @return found path as a list of node ids */ public ArrayList walk(long srcNodeId, T dst, String visitOrder) { long dstNodeId; if (visitOrder.equals("dfs")) { dstNodeId = walkInternalDFS(srcNodeId, dst); } else if (visitOrder.equals("bfs")) { dstNodeId = walkInternalBFS(srcNodeId, dst); } else { throw new IllegalArgumentException("Unknown visit order: " + visitOrder); } if (dstNodeId == -1) { throw new IllegalArgumentException("Cannot find destination: " + dst); } return backtracking(srcNodeId, dstNodeId); } /** * Performs a random walk (picking a random successor at each step) from * source to destination. * * @param srcNodeId source node * @param dst destination (either a node or a node type) * @return found path as a list of node ids or an empty path to indicate * that no suitable path have been found */ public ArrayList randomWalk(long srcNodeId, T dst) { return randomWalk(srcNodeId, dst, 0); } /** * Performs a stubborn random walk (picking a random successor at each * step) from source to destination. The walk is "stubborn" in the sense * that it will not give up the first time if a satisfying target node is * found, but it will retry up to a limited amount of times. * * @param srcNodeId source node * @param dst destination (either a node or a node type) * @param retries number of times to retry; 0 means no retries (single walk) * @return found path as a list of node ids or an empty path to indicate * that no suitable path have been found */ public ArrayList randomWalk(long srcNodeId, T dst, int retries) { long curNodeId = srcNodeId; ArrayList path = new ArrayList<>(); this.nbEdgesAccessed = 0; boolean found; if (retries < 0) { throw new IllegalArgumentException("Negative number of retries given: " + retries); } while (true) { path.add(curNodeId); LazyLongIterator successors = graph.successors(curNodeId, edges); curNodeId = randomPick(successors); if (curNodeId < 0) { found = false; break; } if (isDstNode(curNodeId, dst)) { path.add(curNodeId); found = true; break; } } if (found) { return path; } else if (retries > 0) { // try again return randomWalk(srcNodeId, dst, retries - 1); } else { // not found and no retries left path.clear(); return path; } } /** * Randomly choose an element from an iterator over Longs using reservoir * sampling * * @param elements iterator over selection domain * @return randomly chosen element or -1 if no suitable element was found */ private long randomPick(LazyLongIterator elements) { long curPick = -1; long seenCandidates = 0; - for (long element; (element = elements.nextLong()) != -1;) { + for (long element; (element = elements.nextLong()) != -1; ) { seenCandidates++; if (Math.round(rng.nextFloat() * (seenCandidates - 1)) == 0) { curPick = element; } } return curPick; } /** * Internal DFS function of {@link #walk}. 
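randomPick() above selects one successor uniformly in a single pass over the LazyLongIterator, i.e. single-element reservoir sampling. A standalone sketch of that idea follows; it uses a nextDouble() probability test rather than the rounding formulation in the patch, so treat it as the textbook variant, not the project's exact code.

import java.util.Random;

public class RandomPickSketch {
    // Single-element reservoir sampling: after seeing k elements, each has been kept
    // with probability 1/k, using O(1) memory over a single pass.
    static long pickUniform(long[] elements, Random rng) {
        long pick = -1;
        long seen = 0;
        for (long element : elements) {
            seen++;
            if (rng.nextDouble() < 1.0 / seen) { // replace with probability 1/seen
                pick = element;
            }
        }
        return pick; // -1 if the input was empty
    }

    public static void main(String[] args) {
        Random rng = new Random(42);
        int[] hits = new int[3];
        for (int i = 0; i < 30000; i++) {
            hits[(int) pickUniform(new long[]{0, 1, 2}, rng)]++;
        }
        System.out.println(hits[0] + " " + hits[1] + " " + hits[2]); // roughly 10000 each
    }
}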
* * @param srcNodeId source node * @param dst destination (either a node or a node type) * @return final destination node or -1 if no path found */ private long walkInternalDFS(long srcNodeId, T dst) { Stack stack = new Stack<>(); this.nbEdgesAccessed = 0; stack.push(srcNodeId); visited.add(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); if (isDstNode(currentNodeId, dst)) { return currentNodeId; } nbEdgesAccessed += graph.outdegree(currentNodeId); LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { if (!visited.contains(neighborNodeId)) { stack.push(neighborNodeId); visited.add(neighborNodeId); parentNode.put(neighborNodeId, currentNodeId); } } } return -1; } /** * Internal BFS function of {@link #walk}. * * @param srcNodeId source node * @param dst destination (either a node or a node type) * @return final destination node or -1 if no path found */ private long walkInternalBFS(long srcNodeId, T dst) { Queue queue = new LinkedList<>(); this.nbEdgesAccessed = 0; queue.add(srcNodeId); visited.add(srcNodeId); while (!queue.isEmpty()) { long currentNodeId = queue.poll(); if (isDstNode(currentNodeId, dst)) { return currentNodeId; } nbEdgesAccessed += graph.outdegree(currentNodeId); LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { if (!visited.contains(neighborNodeId)) { queue.add(neighborNodeId); visited.add(neighborNodeId); parentNode.put(neighborNodeId, currentNodeId); } } } return -1; } /** * Internal function of {@link #walk} to check if a node corresponds to the destination. * * @param nodeId current node * @param dst destination (either a node or a node type) * @return true if the node is a destination, or false otherwise */ private boolean isDstNode(long nodeId, T dst) { if (dst instanceof Long) { long dstNodeId = (Long) dst; return nodeId == dstNodeId; } else if (dst instanceof Node.Type) { Node.Type dstType = (Node.Type) dst; return graph.getNodeType(nodeId) == dstType; } else { return false; } } /** * Internal backtracking function of {@link #walk}. 
* * @param srcNodeId source node * @param dstNodeId destination node * @return the found path, as a list of node ids */ private ArrayList backtracking(long srcNodeId, long dstNodeId) { ArrayList path = new ArrayList<>(); long currentNodeId = dstNodeId; while (currentNodeId != srcNodeId) { path.add(currentNodeId); currentNodeId = parentNode.get(currentNodeId); } path.add(srcNodeId); Collections.reverse(path); return path; } /** * Find a common descendant between two given nodes using two parallel BFS * * @param lhsNode the first node * @param rhsNode the second node * @return the found path, as a list of node ids */ public Long findCommonDescendant(long lhsNode, long rhsNode) { Queue lhsStack = new ArrayDeque<>(); Queue rhsStack = new ArrayDeque<>(); HashSet lhsVisited = new HashSet<>(); HashSet rhsVisited = new HashSet<>(); lhsStack.add(lhsNode); rhsStack.add(rhsNode); lhsVisited.add(lhsNode); rhsVisited.add(rhsNode); this.nbEdgesAccessed = 0; Long curNode; while (!lhsStack.isEmpty() || !rhsStack.isEmpty()) { if (!lhsStack.isEmpty()) { curNode = lhsStack.poll(); nbEdgesAccessed += graph.outdegree(curNode); LazyLongIterator it = graph.successors(curNode, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { if (!lhsVisited.contains(neighborNodeId)) { if (rhsVisited.contains(neighborNodeId)) return neighborNodeId; lhsStack.add(neighborNodeId); lhsVisited.add(neighborNodeId); } } } if (!rhsStack.isEmpty()) { curNode = rhsStack.poll(); nbEdgesAccessed += graph.outdegree(curNode); LazyLongIterator it = graph.successors(curNode, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { if (!rhsVisited.contains(neighborNodeId)) { if (lhsVisited.contains(neighborNodeId)) return neighborNodeId; rhsStack.add(neighborNodeId); rhsVisited.add(neighborNodeId); } } } } return null; } public interface NodeIdConsumer extends LongConsumer { - /** Callback for incrementally receiving node identifiers during a graph + /** + * Callback for incrementally receiving node identifiers during a graph * visit. */ void accept(long nodeId); } public interface EdgeIdConsumer { - /** Callback for incrementally receiving edge identifiers during a graph + /** + * Callback for incrementally receiving edge identifiers during a graph * visit. */ void accept(long srcId, long dstId); } public interface PathConsumer extends Consumer> { - /** Callback for incrementally receiving node paths (made of node + /** + * Callback for incrementally receiving node paths (made of node * identifiers) during a graph visit. 
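walkInternalDFS/walkInternalBFS above only return the destination id; the actual path is rebuilt afterwards by backtracking() through the parentNode map. A compact, self-contained illustration of that record-parents-then-backtrack pattern on a toy adjacency map (not the project's API):

import java.util.*;

public class BfsBacktrackSketch {
    // BFS from src, recording each node's parent; the path to dst is then rebuilt by
    // walking the parent map backwards and reversing, as in Traversal.backtracking().
    static List<Integer> shortestPath(Map<Integer, List<Integer>> adj, int src, int dst) {
        Map<Integer, Integer> parent = new HashMap<>();
        Set<Integer> visited = new HashSet<>();
        Deque<Integer> queue = new ArrayDeque<>();
        queue.add(src);
        visited.add(src);
        while (!queue.isEmpty()) {
            int cur = queue.poll();
            if (cur == dst) {
                List<Integer> path = new ArrayList<>();
                for (Integer n = dst; n != null; n = parent.get(n)) {
                    path.add(n);
                }
                Collections.reverse(path);
                return path;
            }
            for (int next : adj.getOrDefault(cur, List.of())) {
                if (visited.add(next)) {
                    parent.put(next, cur);
                    queue.add(next);
                }
            }
        }
        return List.of(); // no path
    }

    public static void main(String[] args) {
        Map<Integer, List<Integer>> adj = Map.of(1, List.of(2, 3), 2, List.of(4), 3, List.of(4));
        System.out.println(shortestPath(adj, 1, 4)); // [1, 2, 4]
    }
}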
*/ void accept(ArrayList path); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java b/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java index dae2af1..038a3e6 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java @@ -1,47 +1,45 @@ package org.softwareheritage.graph.benchmark; -import java.io.IOException; -import java.util.ArrayList; - import com.martiansoftware.jsap.JSAPException; import it.unimi.dsi.big.webgraph.LazyLongIterator; - import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.benchmark.Benchmark; import org.softwareheritage.graph.benchmark.utils.Statistics; import org.softwareheritage.graph.benchmark.utils.Timing; +import java.io.IOException; +import java.util.ArrayList; + /** * Benchmark to time edge access time. * * @author The Software Heritage developers */ public class AccessEdge { /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException, JSAPException { Benchmark bench = new Benchmark(); bench.parseCommandLineArgs(args); Graph graph = new Graph(bench.args.graphPath); long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes); ArrayList timings = new ArrayList<>(); for (long nodeId : nodeIds) { long startTime = Timing.start(); LazyLongIterator neighbors = graph.successors(nodeId); long firstNeighbor = neighbors.nextLong(); double duration = Timing.stop(startTime); timings.add(duration); } System.out.println("Used " + bench.args.nbNodes + " random edges (results are in seconds):"); Statistics stats = new Statistics(timings); stats.printAll(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java index 5d5069b..44f9b92 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java @@ -1,115 +1,111 @@ package org.softwareheritage.graph.benchmark; import com.google.common.primitives.Longs; +import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; +import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.Graph; -import com.martiansoftware.jsap.*; - -import it.unimi.dsi.logging.ProgressLogger; -import it.unimi.dsi.fastutil.Arrays; - import java.io.File; import java.io.IOException; public class BFS { - private final ImmutableGraph graph; - private final static Logger LOGGER = LoggerFactory.getLogger(BFS.class); + private final ImmutableGraph graph; public BFS(ImmutableGraph graph) { this.graph = graph; } + private static JSAPResult parse_args(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP( + BFS.class.getName(), + "", + new Parameter[]{ + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + + new FlaggedOption("useTransposed", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, + 'T', "transposed", "Use transposed graph (default: false)"), + } + ); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch 
(JSAPException e) { + e.printStackTrace(); + } + return config; + } + + public static void main(String[] args) throws IOException { + JSAPResult config = parse_args(args); + String graphPath = config.getString("graphPath"); + boolean useTransposed = config.getBoolean("useTransposed"); + + System.err.println("Loading graph " + graphPath + " ..."); + Graph graph = new Graph(graphPath); + System.err.println("Graph loaded."); + + if (useTransposed) + graph = graph.transpose(); + + BFS bfs = new BFS(graph); + bfs.bfsperm(); + } + // Partly inlined from it.unimi.dsi.law.big.graph.BFS private void bfsperm() throws IOException { final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue - int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); + int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 final LongArrayBitVector visited = LongArrayBitVector.ofLength(n); final ProgressLogger pl = new ProgressLogger(LOGGER); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting breadth-first visit..."); for (long i = 0; i < n; i++) { if (visited.getBoolean(i)) continue; queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); final LazyLongIterator iterator = graph.successors(currentNode); long succ; - while((succ = iterator.nextLong()) != -1) { + while ((succ = iterator.nextLong()) != -1) { if (!visited.getBoolean(succ)) { visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } } pl.update(); } } pl.done(); queue.close(); } - - - private static JSAPResult parse_args(String[] args) { - JSAPResult config = null; - try { - SimpleJSAP jsap = new SimpleJSAP( - BFS.class.getName(), - "", - new Parameter[] { - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 'g', "graph", "Basename of the compressed graph"), - - new FlaggedOption("useTransposed", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, - 'T', "transposed", "Use transposed graph (default: false)"), - } - ); - - config = jsap.parse(args); - if (jsap.messagePrinted()) { - System.exit(1); - } - } catch (JSAPException e) { - e.printStackTrace(); - } - return config; - } - - public static void main(String[] args) throws IOException { - JSAPResult config = parse_args(args); - String graphPath = config.getString("graphPath"); - boolean useTransposed = config.getBoolean("useTransposed"); - - System.err.println("Loading graph " + graphPath + " ..."); - Graph graph = new Graph(graphPath); - System.err.println("Graph loaded."); - - if (useTransposed) - graph = graph.transpose(); - - BFS bfs = new BFS(graph); - bfs.bfsperm(); - } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java index 005d3f1..884dea2 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java @@ -1,170 +1,162 @@ package org.softwareheritage.graph.benchmark; +import com.martiansoftware.jsap.*; +import org.softwareheritage.graph.Graph; 
+import org.softwareheritage.graph.SwhPID; +import org.softwareheritage.graph.benchmark.utils.Random; +import org.softwareheritage.graph.benchmark.utils.Statistics; +import org.softwareheritage.graph.server.Endpoint; + import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import java.util.StringJoiner; import java.util.function.Function; -import com.martiansoftware.jsap.FlaggedOption; -import com.martiansoftware.jsap.JSAP; -import com.martiansoftware.jsap.JSAPException; -import com.martiansoftware.jsap.JSAPResult; -import com.martiansoftware.jsap.Parameter; -import com.martiansoftware.jsap.SimpleJSAP; -import com.martiansoftware.jsap.UnflaggedOption; - -import org.softwareheritage.graph.server.Endpoint; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.SwhPID; -import org.softwareheritage.graph.benchmark.utils.Random; -import org.softwareheritage.graph.benchmark.utils.Statistics; - /** * Benchmark common utility functions. * * @author The Software Heritage developers */ public class Benchmark { - /** - * Input arguments. - */ - public class Args { - /** Basename of the compressed graph */ - public String graphPath; - /** Number of random nodes to use for the benchmark */ - public int nbNodes; - /** File name for CSV format benchmark log */ - public String logFile; - /** Random generator */ - public Random random; - } - - /** Command line arguments */ - public Args args; /** CSV separator for log file */ final String CSV_SEPARATOR = ";"; - + /** Command line arguments */ + public Args args; /** * Constructor. */ public Benchmark() { this.args = new Args(); } /** * Parses benchmark command line arguments. * * @param args command line arguments */ public void parseCommandLineArgs(String[] args) throws JSAPException { SimpleJSAP jsap = new SimpleJSAP(Benchmark.class.getName(), - "Benchmark tool for Software Heritage use-cases scenarios.", - new Parameter[] { - new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - JSAP.NOT_GREEDY, "The basename of the compressed graph."), - new FlaggedOption("nbNodes", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n', - "nb-nodes", "Number of random nodes used to do the benchmark."), - new FlaggedOption("logFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'l', - "log-file", "File name to output CSV format benchmark log."), - new FlaggedOption("seed", JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', - "seed", "Random generator seed."), - }); + "Benchmark tool for Software Heritage use-cases scenarios.", + new Parameter[]{ + new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + JSAP.NOT_GREEDY, "The basename of the compressed graph."), + new FlaggedOption("nbNodes", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n', + "nb-nodes", "Number of random nodes used to do the benchmark."), + new FlaggedOption("logFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'l', + "log-file", "File name to output CSV format benchmark log."), + new FlaggedOption("seed", JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', + "seed", "Random generator seed."), + }); JSAPResult config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } this.args.graphPath = config.getString("graphPath"); this.args.nbNodes = config.getInt("nbNodes"); this.args.logFile = config.getString("logFile"); this.args.random = config.contains("seed") ? 
new Random(config.getLong("seed")) : new Random(); } /** * Creates CSV file for log output. */ public void createCSVLogFile() throws IOException { try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile))) { StringJoiner csvHeader = new StringJoiner(CSV_SEPARATOR); csvHeader.add("use case name") - .add("SWH PID") - .add("number of edges accessed") - .add("traversal timing") - .add("pid2node timing") - .add("node2pid timing"); + .add("SWH PID") + .add("number of edges accessed") + .add("traversal timing") + .add("pid2node timing") + .add("node2pid timing"); csvLog.write(csvHeader.toString() + "\n"); } } /** * Times a specific endpoint and outputs individual datapoints along with aggregated statistics. * * @param useCaseName benchmark use-case name * @param graph compressed graph used in the benchmark * @param nodeIds node ids to use as starting point for the endpoint traversal * @param operation endpoint function to benchmark * @param dstFmt destination formatted string as described in the API * @param algorithm traversal algorithm used in endpoint call (either "dfs" or "bfs") */ public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds, Function operation, String dstFmt, String algorithm) - throws IOException { + throws IOException { ArrayList timings = new ArrayList<>(); ArrayList timingsNormalized = new ArrayList<>(); ArrayList nbEdgesAccessed = new ArrayList<>(); final boolean append = true; try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile, append))) { for (long nodeId : nodeIds) { SwhPID swhPID = graph.getSwhPID(nodeId); Endpoint.Output output = (dstFmt == null) - ? operation.apply(new Endpoint.Input(swhPID)) - : operation.apply(new Endpoint.Input(swhPID, dstFmt, algorithm)); + ? operation.apply(new Endpoint.Input(swhPID)) + : operation.apply(new Endpoint.Input(swhPID, dstFmt, algorithm)); StringJoiner csvLine = new StringJoiner(CSV_SEPARATOR); csvLine.add(useCaseName) - .add(swhPID.toString()) - .add(Long.toString(output.meta.nbEdgesAccessed)) - .add(Double.toString(output.meta.timings.traversal)) - .add(Double.toString(output.meta.timings.pid2node)) - .add(Double.toString(output.meta.timings.node2pid)); + .add(swhPID.toString()) + .add(Long.toString(output.meta.nbEdgesAccessed)) + .add(Double.toString(output.meta.timings.traversal)) + .add(Double.toString(output.meta.timings.pid2node)) + .add(Double.toString(output.meta.timings.node2pid)); csvLog.write(csvLine.toString() + "\n"); timings.add(output.meta.timings.traversal); nbEdgesAccessed.add((double) output.meta.nbEdgesAccessed); if (output.meta.nbEdgesAccessed != 0) { timingsNormalized.add(output.meta.timings.traversal / output.meta.nbEdgesAccessed); } } } System.out.println("\n" + useCaseName + " use-case:"); System.out.println("timings:"); Statistics stats = new Statistics(timings); stats.printAll(); System.out.println("timings normalized:"); Statistics statsNormalized = new Statistics(timingsNormalized); statsNormalized.printAll(); System.out.println("nb edges accessed:"); Statistics statsNbEdgesAccessed = new Statistics(nbEdgesAccessed); statsNbEdgesAccessed.printAll(); } /** * Same as {@link #timeEndpoint} but without destination or algorithm specified to endpoint call. */ public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds, Function operation) throws IOException { timeEndpoint(useCaseName, graph, nodeIds, operation, null, null); } + + /** + * Input arguments. 
+ */ + public class Args { + /** Basename of the compressed graph */ + public String graphPath; + /** Number of random nodes to use for the benchmark */ + public int nbNodes; + /** File name for CSV format benchmark log */ + public String logFile; + /** Random generator */ + public Random random; + } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java index b060d55..910ca76 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java @@ -1,45 +1,44 @@ package org.softwareheritage.graph.benchmark; -import java.io.IOException; - import com.martiansoftware.jsap.JSAPException; - -import org.softwareheritage.graph.server.Endpoint; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.server.Endpoint; + +import java.io.IOException; /** * Benchmark Software Heritage browsing * use-cases scenarios. * * @author The Software Heritage developers */ public class Browsing { /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException, JSAPException { Benchmark bench = new Benchmark(); bench.parseCommandLineArgs(args); Graph graph = new Graph(bench.args.graphPath); long[] dirNodeIds = - bench.args.random.generateNodeIdsOfType(graph, bench.args.nbNodes, Node.Type.DIR); + bench.args.random.generateNodeIdsOfType(graph, bench.args.nbNodes, Node.Type.DIR); long[] revNodeIds = - bench.args.random.generateNodeIdsOfType(graph, bench.args.nbNodes, Node.Type.REV); + bench.args.random.generateNodeIdsOfType(graph, bench.args.nbNodes, Node.Type.REV); Endpoint dirEndpoint = new Endpoint(graph, "forward", "dir:cnt,dir:dir"); Endpoint revEndpoint = new Endpoint(graph, "forward", "rev:rev"); System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):"); bench.createCSVLogFile(); bench.timeEndpoint("ls", graph, dirNodeIds, dirEndpoint::neighbors); bench.timeEndpoint("ls -R", graph, dirNodeIds, dirEndpoint::visitPaths); bench.timeEndpoint("git log", graph, revNodeIds, revEndpoint::visitNodes); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/GenDistribution.java b/java/src/main/java/org/softwareheritage/graph/benchmark/GenDistribution.java index d4ccdff..d9c3846 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/GenDistribution.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/GenDistribution.java @@ -1,134 +1,136 @@ package org.softwareheritage.graph.benchmark; import com.martiansoftware.jsap.*; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.Traversal; import org.softwareheritage.graph.benchmark.utils.Timing; import java.io.IOException; import java.util.Scanner; -import java.util.concurrent.*; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; public class GenDistribution { private Graph graph; - private void load_graph(String graphBasename) throws IOException { - System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = new Graph(graphBasename); - System.err.println("Graph loaded."); - } - private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( - GenDistribution.class.getName(), 
- "", - new Parameter[] { - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 'g', "graph", "Basename of the compressed graph"), - new FlaggedOption("srcType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 's', "srctype", "Source node type"), - new FlaggedOption("dstType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 'd', "dsttype", "Destination node type"), - new FlaggedOption("edgesFmt", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 'e', "edges", "Edges constraints"), - - new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, - 't', "numthreads", "Number of threads"), - } + GenDistribution.class.getName(), + "", + new Parameter[]{ + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + new FlaggedOption("srcType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 's', "srctype", "Source node type"), + new FlaggedOption("dstType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'd', "dsttype", "Destination node type"), + new FlaggedOption("edgesFmt", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'e', "edges", "Edges constraints"), + + new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, + 't', "numthreads", "Number of threads"), + } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); Node.Type srcType = Node.Type.fromStr(config.getString("srcType")); Node.Type dstType = Node.Type.fromStr(config.getString("dstType")); String edgesFmt = config.getString("edgesFmt"); int numThreads = config.getInt("numThreads"); GenDistribution tp = new GenDistribution(); try { tp.load_graph(graphPath); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } final long END_OF_QUEUE = -1L; ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads); ExecutorService service = Executors.newFixedThreadPool(numThreads + 1); service.submit(() -> { try { Scanner input = new Scanner(System.in); while (input.hasNextLong()) { long node = input.nextLong(); if (tp.graph.getNodeType(node) == srcType) { queue.put(node); } } } catch (InterruptedException e) { e.printStackTrace(); } finally { for (int i = 0; i < numThreads; ++i) { try { queue.put(END_OF_QUEUE); } catch (InterruptedException e) { e.printStackTrace(); } } } }); for (int i = 0; i < numThreads; ++i) { service.submit(() -> { Graph thread_graph = tp.graph.copy(); long startTime; double totalTime; while (true) { Long node = null; try { node = queue.take(); } catch (InterruptedException e) { e.printStackTrace(); } if (node == null || node == END_OF_QUEUE) { return; } Traversal t = new Traversal(thread_graph, "backward", edgesFmt); - int[] count = { 0 }; + int[] count = {0}; startTime = Timing.start(); t.visitNodesVisitor(node, (curnode) -> { if (tp.graph.getNodeType(curnode) == dstType) { count[0]++; } }); totalTime = Timing.stop(startTime); System.out.format("%d %d %d %d %f\n", node, count[0], t.getNbNodesAccessed(), t.getNbEdgesAccessed(), totalTime ); } }); } service.shutdown(); } + + private void load_graph(String graphBasename) throws IOException { + System.err.println("Loading graph " + graphBasename + " ..."); + this.graph = new Graph(graphBasename); + System.err.println("Graph loaded."); + 
} } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java index 402632c..8ee8ede 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java @@ -1,93 +1,93 @@ package org.softwareheritage.graph.benchmark; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import java.io.IOException; import java.util.ArrayList; public class ListEmptyOrigins { private Graph graph; private Long emptySnapshot; - private void load_graph(String graphBasename) throws IOException { - System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = new Graph(graphBasename); - System.err.println("Graph loaded."); - this.emptySnapshot = null; - } - private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( - ListEmptyOrigins.class.getName(), - "", - new Parameter[] { - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 'g', "graph", "Basename of the compressed graph"), - } + ListEmptyOrigins.class.getName(), + "", + new Parameter[]{ + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } + public static void main(String[] args) { + JSAPResult config = parse_args(args); + String graphPath = config.getString("graphPath"); + + ListEmptyOrigins leo = new ListEmptyOrigins(); + try { + leo.load_graph(graphPath); + } catch (IOException e) { + System.out.println("Could not load graph: " + e); + System.exit(2); + } + ArrayList badlist = leo.compute(leo.graph); + for (Long bad : badlist) { + System.out.println(bad); + } + } + + private void load_graph(String graphBasename) throws IOException { + System.err.println("Loading graph " + graphBasename + " ..."); + this.graph = new Graph(graphBasename); + System.err.println("Graph loaded."); + this.emptySnapshot = null; + } + private boolean nodeIsEmptySnapshot(Long node) { System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node); if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } private ArrayList compute(ImmutableGraph graph) { final long n = graph.numNodes(); ArrayList bad = new ArrayList<>(); for (long i = 0; i < n; i++) { Node.Type nt = this.graph.getNodeType(i); if (nt != Node.Type.ORI) continue; final LazyLongIterator iterator = graph.successors(i); long succ; boolean found = false; while ((succ = iterator.nextLong()) != -1) { if (this.graph.outdegree(succ) > 0) { found = true; } } if (!found) bad.add(i); } return bad; } - - public static void main(String[] args) { - JSAPResult config = parse_args(args); - String graphPath = config.getString("graphPath"); - - ListEmptyOrigins leo = new ListEmptyOrigins(); - try { - leo.load_graph(graphPath); - } catch (IOException e) { - System.out.println("Could not load graph: " + e); - 
System.exit(2); - } - ArrayList badlist = leo.compute(leo.graph); - for (Long bad : badlist) { - System.out.println(bad); - } - } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java index f8aec35..324f38e 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java @@ -1,52 +1,51 @@ package org.softwareheritage.graph.benchmark; -import java.io.IOException; - import com.martiansoftware.jsap.JSAPException; - -import org.softwareheritage.graph.server.Endpoint; import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.server.Endpoint; + +import java.io.IOException; /** * Benchmark Software Heritage provenance * use-cases scenarios. * * @author The Software Heritage developers */ public class Provenance { /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException, JSAPException { Benchmark bench = new Benchmark(); bench.parseCommandLineArgs(args); Graph graph = new Graph(bench.args.graphPath); long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes); Endpoint commitProvenanceEndpoint = new Endpoint(graph, "backward", "dir:dir,cnt:dir,dir:rev"); Endpoint originProvenanceEndpoint = new Endpoint(graph, "backward", "*"); System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):"); bench.createCSVLogFile(); bench.timeEndpoint( - "commit provenance (dfs)", graph, nodeIds, commitProvenanceEndpoint::walk, "rev", "dfs"); + "commit provenance (dfs)", graph, nodeIds, commitProvenanceEndpoint::walk, "rev", "dfs"); bench.timeEndpoint( - "commit provenance (bfs)", graph, nodeIds, commitProvenanceEndpoint::walk, "rev", "bfs"); + "commit provenance (bfs)", graph, nodeIds, commitProvenanceEndpoint::walk, "rev", "bfs"); bench.timeEndpoint( - "complete commit provenance", graph, nodeIds, commitProvenanceEndpoint::leaves); + "complete commit provenance", graph, nodeIds, commitProvenanceEndpoint::leaves); bench.timeEndpoint( - "origin provenance (dfs)", graph, nodeIds, originProvenanceEndpoint::walk, "ori", "dfs"); + "origin provenance (dfs)", graph, nodeIds, originProvenanceEndpoint::walk, "ori", "dfs"); bench.timeEndpoint( - "origin provenance (bfs)", graph, nodeIds, originProvenanceEndpoint::walk, "ori", "bfs"); + "origin provenance (bfs)", graph, nodeIds, originProvenanceEndpoint::walk, "ori", "bfs"); bench.timeEndpoint( - "complete origin provenance", graph, nodeIds, originProvenanceEndpoint::leaves); + "complete origin provenance", graph, nodeIds, originProvenanceEndpoint::leaves); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java index c42b96a..3a6c6dc 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java @@ -1,38 +1,37 @@ package org.softwareheritage.graph.benchmark; -import java.io.IOException; - import com.martiansoftware.jsap.JSAPException; - -import org.softwareheritage.graph.server.Endpoint; import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.server.Endpoint; + +import java.io.IOException; /** * Benchmark Software Heritage vault * use-case scenario. * * @author The Software Heritage developers */ public class Vault { /** * Main entrypoint. 
* * @param args command line arguments */ public static void main(String[] args) throws IOException, JSAPException { Benchmark bench = new Benchmark(); bench.parseCommandLineArgs(args); Graph graph = new Graph(bench.args.graphPath); long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes); Endpoint endpoint = new Endpoint(graph, "forward", "*"); System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):"); bench.createCSVLogFile(); bench.timeEndpoint("git bundle", graph, nodeIds, endpoint::visitNodes); } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java index df93c96..ee4c530 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java +++ b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java @@ -1,67 +1,67 @@ package org.softwareheritage.graph.benchmark.utils; -import java.util.PrimitiveIterator; - import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; +import java.util.PrimitiveIterator; + /** * Random related utility class. * * @author The Software Heritage developers */ public class Random { /** Internal pseudorandom generator */ java.util.Random random; /** * Constructor. */ public Random() { this.random = new java.util.Random(); } /** * Constructor. * * @param seed random generator seed */ public Random(long seed) { this.random = new java.util.Random(seed); } /** * Generates random node ids. * * @param graph graph used to pick node ids * @param nbNodes number of node ids to generate * @return an array of random node ids */ public long[] generateNodeIds(Graph graph, int nbNodes) { return random.longs(nbNodes, 0, graph.numNodes()).toArray(); } /** * Generates random node ids with a specific type. 
* * @param graph graph used to pick node ids * @param nbNodes number of node ids to generate * @param expectedType specific node type to pick * @return an array of random node ids */ public long[] generateNodeIdsOfType(Graph graph, int nbNodes, Node.Type expectedType) { PrimitiveIterator.OfLong nodes = random.longs(0, graph.numNodes()).iterator(); long[] nodeIds = new long[nbNodes]; long nextId; for (int i = 0; i < nbNodes; i++) { do { nextId = nodes.nextLong(); } while (graph.getNodeType(nextId) != expectedType); nodeIds[i] = nextId; } return nodeIds; } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java index 2eebc4c..fc9667c 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java @@ -1,217 +1,219 @@ package org.softwareheritage.graph.experiments.forks; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.benchmark.BFS; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Map; +import java.util.Scanner; +import java.util.TreeMap; public class ForkCC { public Boolean includeRootDir; private Graph graph; private Long emptySnapshot; private LongArrayBitVector visited; private LongArrayBitVector whitelist; - private void load_graph(String graphBasename) throws IOException { - System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = new Graph(graphBasename).symmetrize(); - System.err.println("Graph loaded."); - this.emptySnapshot = null; - this.whitelist = null; - this.visited = null; - this.includeRootDir = null; - } - private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( - ForkCC.class.getName(), - "", - new Parameter[] { - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 'g', "graph", "Basename of the compressed graph"), - new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, - 't', "whitelist", "Whitelist of origins"), - new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, - 'w', "numthreads", "Number of threads"), - new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, - 'R', "includerootdir", "Include root directory (default: false)"), - } + ForkCC.class.getName(), + "", + new Parameter[]{ + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, + 't', "whitelist", "Whitelist of origins"), + new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, + 'w', "numthreads", "Number of threads"), + new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, + 'R', "includerootdir", "Include root directory (default: false)"), + } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } 
catch (JSAPException e) { e.printStackTrace(); } return config; } + private static void printDistribution(ArrayList> components) { + TreeMap distribution = new TreeMap<>(); + for (ArrayList component : components) { + distribution.merge((long) component.size(), 1L, Long::sum); + } + + for (Map.Entry entry : distribution.entrySet()) { + System.out.format("%d %d\n", entry.getKey(), entry.getValue()); + } + } + + private static void printLargestComponent(ArrayList> components) { + int indexLargest = 0; + for (int i = 1; i < components.size(); ++i) { + if (components.get(i).size() > components.get(indexLargest).size()) + indexLargest = i; + } + + ArrayList component = components.get(indexLargest); + for (Long node : component) { + System.out.println(node); + } + } + + public static void main(String[] args) { + JSAPResult config = parse_args(args); + + String graphPath = config.getString("graphPath"); + int numThreads = config.getInt("numThreads"); + String whitelistPath = config.getString("whitelistPath"); + boolean includeRootDir = config.getBoolean("includeRootDir"); + + ForkCC forkCc = new ForkCC(); + try { + forkCc.load_graph(graphPath); + forkCc.includeRootDir = includeRootDir; + } catch (IOException e) { + System.out.println("Could not load graph: " + e); + System.exit(2); + } + + if (whitelistPath != null) { + forkCc.parseWhitelist(whitelistPath); + } + + ProgressLogger logger = new ProgressLogger(); + try { + ArrayList> components = forkCc.compute(logger); + printDistribution(components); + // printLargestComponent(components); + } catch (IOException e) { + e.printStackTrace(); + } + logger.done(); + } + + private void load_graph(String graphBasename) throws IOException { + System.err.println("Loading graph " + graphBasename + " ..."); + this.graph = new Graph(graphBasename).symmetrize(); + System.err.println("Graph loaded."); + this.emptySnapshot = null; + this.whitelist = null; + this.visited = null; + this.includeRootDir = null; + } + private boolean nodeIsEmptySnapshot(Long node) { if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } - private Boolean shouldVisit(Long node){ + private Boolean shouldVisit(Long node) { Node.Type nt = this.graph.getNodeType(node); if (nt == Node.Type.CNT) { return false; } if (nt == Node.Type.DIR && !includeRootDir) return false; if (this.nodeIsEmptySnapshot(node)) return false; if (visited.getBoolean(node)) return false; return true; } - private ArrayList> compute(ProgressLogger pl) throws IOException { final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue - int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); + int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 visited = LongArrayBitVector.ofLength(n); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting connected components visit..."); ArrayList> components = new ArrayList<>(); for (long i = 0; i < n; i++) { if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR) continue; 
ArrayList component = new ArrayList<>(); queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); Node.Type cur_nt = this.graph.getNodeType(currentNode); if (cur_nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(currentNode))) { // TODO: add a check that the origin has >=1 non-empty snapshot component.add(currentNode); } final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (!shouldVisit(succ)) continue; if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } if (component.size() > 0) { components.add(component); } } pl.done(); queue.close(); return components; } - private static void printDistribution(ArrayList> components) { - TreeMap distribution = new TreeMap<>(); - for (ArrayList component : components) { - distribution.merge((long) component.size(), 1L, Long::sum); - } - - for (Map.Entry entry : distribution.entrySet()) { - System.out.format("%d %d\n", entry.getKey(), entry.getValue()); - } - } - - private static void printLargestComponent(ArrayList> components) { - int indexLargest = 0; - for (int i = 1; i < components.size(); ++i) { - if (components.get(i).size() > components.get(indexLargest).size()) - indexLargest = i; - } - - ArrayList component = components.get(indexLargest); - for (Long node : component) { - System.out.println(node); - } - } - private void parseWhitelist(String path) { System.err.println("Loading whitelist " + path + " ..."); this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes()); Scanner scanner = null; try { scanner = new Scanner(new File(path)); - while(scanner.hasNextLong()) { + while (scanner.hasNextLong()) { whitelist.set(scanner.nextLong()); } System.err.println("Whitelist loaded."); } catch (FileNotFoundException e) { e.printStackTrace(); } } - - public static void main(String[] args) { - JSAPResult config = parse_args(args); - - String graphPath = config.getString("graphPath"); - int numThreads = config.getInt("numThreads"); - String whitelistPath = config.getString("whitelistPath"); - boolean includeRootDir = config.getBoolean("includeRootDir"); - - ForkCC forkCc = new ForkCC(); - try { - forkCc.load_graph(graphPath); - forkCc.includeRootDir = includeRootDir; - } catch (IOException e) { - System.out.println("Could not load graph: " + e); - System.exit(2); - } - - if (whitelistPath != null) { - forkCc.parseWhitelist(whitelistPath); - } - - ProgressLogger logger = new ProgressLogger(); - try { - ArrayList> components = forkCc.compute(logger); - printDistribution(components); - // printLargestComponent(components); - } catch (IOException e) { - e.printStackTrace(); - } - logger.done(); - } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/MapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/MapBuilder.java index 4bb6506..90fcd50 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/MapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/MapBuilder.java @@ -1,215 +1,212 @@ package org.softwareheritage.graph.maps; -import java.io.*; -import java.nio.charset.StandardCharsets; -import java.util.Scanner; -import java.util.concurrent.*; - import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import 
it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.logging.ProgressLogger; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SwhPID; +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.Scanner; +import java.util.concurrent.TimeUnit; + /** * Create maps needed at runtime by the graph service, in particular: - * + *
<p>
* - SWH PID → WebGraph long node id * - WebGraph long node id → SWH PID (converse of the former) * - WebGraph long node id → SWH node type (enum) * * @author The Software Heritage developers */ public class MapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(MapBuilder.class); /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException { if (args.length != 2) { logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV"); System.exit(1); } String graphPath = args[0]; String tmpDir = args[1]; logger.info("starting maps generation..."); precomputeNodeIdMap(graphPath, tmpDir); logger.info("maps generation completed"); } /** * Computes and dumps on disk mapping files. * * @param graphPath path of the compressed graph */ // Suppress warning for Object2LongFunction cast @SuppressWarnings("unchecked") static void precomputeNodeIdMap(String graphPath, String tmpDir) - throws IOException - { + throws IOException { ProgressLogger plPid2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); ProgressLogger plNode2Pid = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plPid2Node.itemsName = "pid→node"; plNode2Pid.itemsName = "node→pid"; // avg speed for pid→node is sometime skewed due to write to the sort // pipe hanging when sort is sorting; hence also desplay local speed plPid2Node.displayLocalSpeed = true; // first half of PID->node mapping: PID -> WebGraph MPH (long) Object2LongFunction mphMap = null; try { logger.info("loading MPH function..."); mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); logger.info("MPH function loaded"); } catch (ClassNotFoundException e) { logger.error("unknown class object in .mph file: " + e); System.exit(2); } long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); plPid2Node.expectedUpdates = nbIds; plNode2Pid.expectedUpdates = nbIds; // second half of PID->node mapping: WebGraph MPH (long) -> BFS order (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); logger.info("loading BFS order file..."); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); logger.info("BFS order file loaded"); if (loaded != nbIds) { logger.error("graph contains " + nbIds + " nodes, but read " + loaded); System.exit(2); } // Create mapping SWH PID -> WebGraph node id, by sequentially reading // nodes, hashing them with MPH, and permuting according to BFS order FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, - StandardCharsets.US_ASCII)); + StandardCharsets.US_ASCII)); LineIterator swhPIDIterator = new LineIterator(buffer); // The WebGraph node id -> SWH PID mapping can be obtained from the // PID->node one by numerically sorting on node id and sequentially // writing obtained PIDs to a binary map. 
Delegates the sorting job to // /usr/bin/sort via pipes ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "--numeric-sort", "--key", "2", - "--buffer-size", SORT_BUFFER_SIZE, - "--temporary-directory", tmpDir); + "--buffer-size", SORT_BUFFER_SIZE, + "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); // for the binary format of pidToNodeMap, see Python module swh.graph.pid:PidToIntMap // for the binary format of nodeToPidMap, see Python module swh.graph.pid:IntToPidMap try (DataOutputStream pidToNodeMap = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(graphPath + Graph.PID_TO_NODE))); BufferedOutputStream nodeToPidMap = new BufferedOutputStream(new FileOutputStream(graphPath + Graph.NODE_TO_PID))) { // background handler for sort output, it will be fed PID/node // pairs while pidToNodeMap is being filled, and will itself fill // nodeToPidMap as soon as data from sort is ready SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToPidMap, plNode2Pid); outputHandler.start(); // Type map from WebGraph node ID to SWH type. Used at runtime by // pure Java graph traversals to efficiently check edge // restrictions. final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) - / Math.log(2)); + / Math.log(2)); final int nbBitsPerNodeType = log2NbTypes; LongArrayBitVector nodeTypesBitVector = - LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); + LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); plPid2Node.start("filling pid2node map"); for (long iNode = 0; iNode < nbIds && swhPIDIterator.hasNext(); iNode++) { String strSwhPID = swhPIDIterator.next().toString(); SwhPID swhPID = new SwhPID(strSwhPID); byte[] swhPIDBin = swhPID.toBytes(); long mphId = mphMap.getLong(strSwhPID); long nodeId = BigArrays.get(bfsMap, mphId); pidToNodeMap.write(swhPIDBin, 0, swhPIDBin.length); pidToNodeMap.writeLong(nodeId); sort_stdin.write((strSwhPID + "\t" + nodeId + "\n") - .getBytes(StandardCharsets.US_ASCII)); + .getBytes(StandardCharsets.US_ASCII)); nodeTypesMap.set(nodeId, swhPID.getType().ordinal()); plPid2Node.lightUpdate(); } plPid2Node.done(); sort_stdin.close(); // write type map logger.info("storing type map"); BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); logger.info("type map stored"); // wait for nodeToPidMap filling try { logger.info("waiting for node2pid map..."); int sortExitCode = sort.waitFor(); if (sortExitCode != 0) { logger.error("sort returned non-zero exit code: " + sortExitCode); System.exit(2); } outputHandler.join(); } catch (InterruptedException e) { logger.error("processing of sort output failed with: " + e); System.exit(2); } } } private static class SortOutputHandler extends Thread { private Scanner input; private OutputStream output; private ProgressLogger pl; SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) { this.input = new Scanner(input, StandardCharsets.US_ASCII); this.output = output; this.pl = pl; } public void run() { boolean sortDone = false; logger.info("node2pid: waiting for sort output..."); while (input.hasNextLine()) { - if (! 
sortDone) { + if (!sortDone) { sortDone = true; this.pl.start("filling node2pid map"); } String line = input.nextLine(); // format: SWH_PID NODE_ID SwhPID swhPID = new SwhPID(line.split("\\t")[0]); // get PID try { output.write((byte[]) swhPID.toBytes()); } catch (IOException e) { logger.error("writing to node->PID map failed with: " + e); } this.pl.lightUpdate(); } this.pl.done(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/MapFile.java b/java/src/main/java/org/softwareheritage/graph/maps/MapFile.java index 4c16130..64f577c 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/MapFile.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/MapFile.java @@ -1,62 +1,62 @@ package org.softwareheritage.graph.maps; +import it.unimi.dsi.io.ByteBufferInputStream; + import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; -import it.unimi.dsi.io.ByteBufferInputStream; - /** * Wrapper class around very big mmap()-ed file. *
<p>
* Java has a limit for mmap()-ed files because of unsupported 64-bit indexing. The dsiutils ByteBufferInputStream is used to overcome this * Java limit. * * @author The Software Heritage developers */ public class MapFile { /** Memory-mapped file buffer */ ByteBufferInputStream bufferMap; /** Fixed line length of the mmap()-ed file */ int lineLength; /** * Constructor. * * @param path file path to mmap() * @param lineLength fixed length of a line in the file */ public MapFile(String path, int lineLength) throws IOException { this.bufferMap = null; this.lineLength = lineLength; try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "r")) { FileChannel fileChannel = mapFile.getChannel(); bufferMap = ByteBufferInputStream.map(fileChannel, FileChannel.MapMode.READ_ONLY); } } /** * Returns a specific line in the file. * * @param lineIndex line number in the file * @return the line at the specified position */ public byte[] readAtLine(long lineIndex) { byte[] buffer = new byte[lineLength]; long position = lineIndex * (long) lineLength; bufferMap.position(position); bufferMap.read(buffer, 0, lineLength); return buffer; } /** * Closes the mmap()-ed file. */ public void close() throws IOException { bufferMap.close(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java index fd21f50..5ddabb3 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java @@ -1,114 +1,114 @@ package org.softwareheritage.graph.maps; -import java.io.IOException; - import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SwhPID; +import java.io.IOException; + /** * Mapping between internal long node id and external SWH PID. - * + *
<p>
* Mappings in both directions are pre-computed and dumped on disk in the * {@link MapBuilder} class, then they are loaded here using mmap(). * * @author The Software Heritage developers * @see org.softwareheritage.graph.maps.MapBuilder */ public class NodeIdMap { /** Fixed length of full SWH PID */ public static final int SWH_ID_LENGTH = 50; /** Fixed length of long node id */ public static final int NODE_ID_LENGTH = 20; /** Fixed length of binary SWH PID buffer */ public static final int SWH_ID_BIN_SIZE = 22; /** Fixed length of binary node id buffer */ public static final int NODE_ID_BIN_SIZE = 8; /** Graph path and basename */ String graphPath; /** Number of ids to map */ long nbIds; /** mmap()-ed PID_TO_NODE file */ MapFile swhToNodeMap; /** mmap()-ed NODE_TO_PID file */ MapFile nodeToSwhMap; /** * Constructor. * * @param graphPath full graph path * @param nbNodes number of nodes in the graph */ public NodeIdMap(String graphPath, long nbNodes) throws IOException { this.graphPath = graphPath; this.nbIds = nbNodes; this.swhToNodeMap = new MapFile(graphPath + Graph.PID_TO_NODE, SWH_ID_BIN_SIZE + NODE_ID_BIN_SIZE); this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_PID, SWH_ID_BIN_SIZE); } /** * Converts SWH PID to corresponding long node id. * * @param swhPID node represented as a {@link SwhPID} * @return corresponding node as a long id * @see org.softwareheritage.graph.SwhPID */ public long getNodeId(SwhPID swhPID) { // The file is sorted by swhPID, hence we can binary search on swhPID to get corresponding // nodeId long start = 0; long end = nbIds - 1; while (start <= end) { long lineNumber = (start + end) / 2L; byte[] buffer = swhToNodeMap.readAtLine(lineNumber); byte[] pidBuffer = new byte[SWH_ID_BIN_SIZE]; byte[] nodeBuffer = new byte[NODE_ID_BIN_SIZE]; System.arraycopy(buffer, 0, pidBuffer, 0, SWH_ID_BIN_SIZE); System.arraycopy(buffer, SWH_ID_BIN_SIZE, nodeBuffer, 0, NODE_ID_BIN_SIZE); String currentSwhPID = SwhPID.fromBytes(pidBuffer).getSwhPID(); long currentNodeId = java.nio.ByteBuffer.wrap(nodeBuffer).getLong(); int cmp = currentSwhPID.compareTo(swhPID.toString()); if (cmp == 0) { return currentNodeId; } else if (cmp < 0) { start = lineNumber + 1; } else { end = lineNumber - 1; } } throw new IllegalArgumentException("Unknown SWH PID: " + swhPID); } /** * Converts a node long id to corresponding SWH PID. * * @param nodeId node as a long id * @return corresponding node as a {@link SwhPID} * @see org.softwareheritage.graph.SwhPID */ public SwhPID getSwhPID(long nodeId) { // Each line in NODE_TO_PID is formatted as: swhPID // The file is ordered by nodeId, meaning node0's swhPID is at line 0, hence we can read the // nodeId-th line to get corresponding swhPID if (nodeId < 0 || nodeId >= nbIds) { throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); } return SwhPID.fromBytes(nodeToSwhMap.readAtLine(nodeId)); } /** * Closes the mapping files. 
*/ public void close() throws IOException { swhToNodeMap.close(); nodeToSwhMap.close(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java index fccfa18..c2747a0 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java @@ -1,55 +1,54 @@ package org.softwareheritage.graph.maps; -import java.io.IOException; - import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; - import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; +import java.io.IOException; + /** * Mapping between long node id and SWH node type as described in the data * model. - * + *
<p>
* The type mapping is pre-computed and dumped on disk in the {@link MapBuilder} * class, then it is loaded in-memory here using * fastutil LongBigList. To be * space-efficient, the mapping is stored as a bitmap using minimum number of * bits per {@link Node.Type}. * * @author The Software Heritage developers */ public class NodeTypesMap { /** * Array storing for each node its type */ public LongBigList nodeTypesMap; /** * Constructor. * * @param graphPath path and basename of the compressed graph */ public NodeTypesMap(String graphPath) throws IOException { try { nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + Graph.NODE_TO_TYPE); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Unknown class object: " + e); } } /** * Returns node type from a node long id. * * @param nodeId node as a long id * @return corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public Node.Type getType(long nodeId) { long type = nodeTypesMap.getLong(nodeId); return Node.Type.fromInt((int) type); } } diff --git a/java/src/main/java/org/softwareheritage/graph/server/App.java b/java/src/main/java/org/softwareheritage/graph/server/App.java index c9edc14..78836b0 100644 --- a/java/src/main/java/org/softwareheritage/graph/server/App.java +++ b/java/src/main/java/org/softwareheritage/graph/server/App.java @@ -1,193 +1,198 @@ package org.softwareheritage.graph.server; -import java.io.IOException; -import java.util.List; -import java.util.Map; - import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.PropertyNamingStrategy; -import com.martiansoftware.jsap.FlaggedOption; -import com.martiansoftware.jsap.JSAP; -import com.martiansoftware.jsap.JSAPException; -import com.martiansoftware.jsap.JSAPResult; -import com.martiansoftware.jsap.Parameter; -import com.martiansoftware.jsap.SimpleJSAP; -import com.martiansoftware.jsap.Switch; -import com.martiansoftware.jsap.UnflaggedOption; +import com.martiansoftware.jsap.*; import io.javalin.Javalin; import io.javalin.http.Context; import io.javalin.plugin.json.JavalinJackson; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Stats; import org.softwareheritage.graph.SwhPID; +import java.io.IOException; +import java.util.List; +import java.util.Map; + /** * Web framework of the swh-graph server REST API. * * @author The Software Heritage developers */ public class App { /** * Main entrypoint. 
* * @param args command line arguments */ public static void main(String[] args) throws IOException, JSAPException { SimpleJSAP jsap = new SimpleJSAP(App.class.getName(), - "Server to load and query a compressed graph representation of Software Heritage archive.", - new Parameter[] { - new FlaggedOption("port", JSAP.INTEGER_PARSER, "5009", JSAP.NOT_REQUIRED, 'p', "port", - "Binding port of the server."), - new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - JSAP.NOT_GREEDY, "The basename of the compressed graph."), - new Switch("timings", 't', "timings", "Show timings in API result metadata."), - }); + "Server to load and query a compressed graph representation of Software Heritage archive.", + new Parameter[]{ + new FlaggedOption("port", JSAP.INTEGER_PARSER, "5009", JSAP.NOT_REQUIRED, 'p', "port", + "Binding port of the server."), + new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + JSAP.NOT_GREEDY, "The basename of the compressed graph."), + new Switch("timings", 't', "timings", "Show timings in API result metadata."), + }); JSAPResult config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } String graphPath = config.getString("graphPath"); int port = config.getInt("port"); boolean showTimings = config.getBoolean("timings"); startServer(graphPath, port, showTimings); } /** * Loads compressed graph and starts the web server to query it. * * @param graphPath basename of the compressed graph * @param port binding port of the server * @param showTimings true if timings should be in results metadata, false otherwise */ private static void startServer(String graphPath, int port, boolean showTimings) - throws IOException { + throws IOException { Graph graph = new Graph(graphPath); Stats stats = new Stats(graphPath); // Clean up on exit Runtime.getRuntime().addShutdownHook(new Thread() { public void run() { try { graph.cleanUp(); } catch (IOException e) { System.out.println("Could not clean up graph on exit: " + e); } } }); // Configure Jackson JSON to use snake case naming style ObjectMapper objectMapper = JavalinJackson.getObjectMapper(); objectMapper.setPropertyNamingStrategy(PropertyNamingStrategy.SNAKE_CASE); JavalinJackson.configure(objectMapper); Javalin app = Javalin.create().start(port); - app.before("/stats/*", ctx -> { checkQueryStrings(ctx, ""); }); - app.before("/leaves/*", ctx -> { checkQueryStrings(ctx, "direction|edges"); }); - app.before("/neighbors/*", ctx -> { checkQueryStrings(ctx, "direction|edges"); }); - app.before("/visit/*", ctx -> { checkQueryStrings(ctx, "direction|edges"); }); - app.before("/walk/*", ctx -> { checkQueryStrings(ctx, "direction|edges|traversal"); }); + app.before("/stats/*", ctx -> { + checkQueryStrings(ctx, ""); + }); + app.before("/leaves/*", ctx -> { + checkQueryStrings(ctx, "direction|edges"); + }); + app.before("/neighbors/*", ctx -> { + checkQueryStrings(ctx, "direction|edges"); + }); + app.before("/visit/*", ctx -> { + checkQueryStrings(ctx, "direction|edges"); + }); + app.before("/walk/*", ctx -> { + checkQueryStrings(ctx, "direction|edges|traversal"); + }); - app.get("/stats/", ctx -> { ctx.json(stats); }); + app.get("/stats/", ctx -> { + ctx.json(stats); + }); // Graph traversal endpoints // By default the traversal is a forward DFS using all edges app.get("/leaves/:src", ctx -> { SwhPID src = new SwhPID(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Endpoint endpoint = new 
Endpoint(graph, direction, edgesFmt); Endpoint.Output output = endpoint.leaves(new Endpoint.Input(src)); ctx.json(formatEndpointOutput(output, showTimings)); }); app.get("/neighbors/:src", ctx -> { SwhPID src = new SwhPID(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); Endpoint.Output output = endpoint.neighbors(new Endpoint.Input(src)); ctx.json(formatEndpointOutput(output, showTimings)); }); app.get("/visit/nodes/:src", ctx -> { SwhPID src = new SwhPID(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); Endpoint.Output output = endpoint.visitNodes(new Endpoint.Input(src)); ctx.json(formatEndpointOutput(output, showTimings)); }); app.get("/visit/paths/:src", ctx -> { SwhPID src = new SwhPID(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); Endpoint.Output output = endpoint.visitPaths(new Endpoint.Input(src)); ctx.json(formatEndpointOutput(output, showTimings)); }); app.get("/walk/:src/:dst", ctx -> { SwhPID src = new SwhPID(ctx.pathParam("src")); String dstFmt = ctx.pathParam("dst"); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); String algorithm = ctx.queryParam("traversal", "dfs"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); Endpoint.Output output = endpoint.walk(new Endpoint.Input(src, dstFmt, algorithm)); ctx.json(formatEndpointOutput(output, showTimings)); }); app.exception(IllegalArgumentException.class, (e, ctx) -> { ctx.status(400); ctx.result(e.getMessage()); }); } /** * Checks query strings names provided to the REST API. * * @param ctx Javalin HTTP request context * @param allowedFmt a regular expression describing allowed query strings names * @throws IllegalArgumentException unknown query string provided */ private static void checkQueryStrings(Context ctx, String allowedFmt) { Map> queryParamMap = ctx.queryParamMap(); for (String key : queryParamMap.keySet()) { if (!key.matches(allowedFmt)) { throw new IllegalArgumentException("Unknown query string: " + key); } } } /** * Formats endpoint result into final JSON for the REST API. *
<p>
* Removes unwanted information if necessary, such as timings (to prevent use of side channels * attacks). * * @param output endpoint operation output which needs formatting * @param showTimings true if timings should be in results metadata, false otherwise * @return final Object with desired JSON format */ private static Object formatEndpointOutput(Endpoint.Output output, boolean showTimings) { if (showTimings) { return output; } else { Map metaNoTimings = Map.of("nb_edges_accessed", output.meta.nbEdgesAccessed); Map outputNoTimings = Map.of("result", output.result, "meta", metaNoTimings); return outputNoTimings; } } } diff --git a/java/src/main/java/org/softwareheritage/graph/server/Endpoint.java b/java/src/main/java/org/softwareheritage/graph/server/Endpoint.java index c1e3268..16c96e3 100644 --- a/java/src/main/java/org/softwareheritage/graph/server/Endpoint.java +++ b/java/src/main/java/org/softwareheritage/graph/server/Endpoint.java @@ -1,308 +1,308 @@ package org.softwareheritage.graph.server; -import java.util.ArrayList; - import org.softwareheritage.graph.*; import org.softwareheritage.graph.benchmark.utils.Timing; +import java.util.ArrayList; + /** * REST API endpoints wrapper functions. *
<p>
* Graph operations are segmented between high-level class (this one) and the low-level class * ({@link Traversal}). The {@link Endpoint} class creates wrappers for each endpoints by performing * all the input/output node ids conversions and logging timings. * * @author The Software Heritage developers * @see Traversal */ public class Endpoint { - /** - * Wrapper class to unify traversal methods input signatures. - */ - public static class Input { - /** Source node of endpoint call specified as a {@link SwhPID} */ - public SwhPID src; - /** - * Destination formatted string as described in the API - */ - public String dstFmt; - /** Traversal algorithm used in endpoint call (either "dfs" or "bfs") */ - public String algorithm; - - public Input(SwhPID src) { - this.src = src; - } - - public Input(SwhPID src, String dstFmt, String algorithm) { - this.src = src; - this.dstFmt = dstFmt; - this.algorithm = algorithm; - } - } - - /** - * Wrapper class to return both the endpoint result and metadata (such as timings). - */ - public static class Output { - /** The result content itself */ - public T result; - /** Various metadata about the result */ - public Meta meta; - - public Output() { - this.result = null; - this.meta = new Meta(); - } - - /** - * Endpoint result metadata. - */ - public class Meta { - /** Operations timings */ - public Timings timings; - /** Number of edges accessed during traversal */ - public long nbEdgesAccessed; - - public Meta() { - this.timings = new Timings(); - this.nbEdgesAccessed = 0; - } - - /** - * Wrapper class for JSON output format. - */ - public class Timings { - /** Time in seconds to do the traversal */ - public double traversal; - /** Time in seconds to convert input SWH PID to node id */ - public double pid2node; - /** Time in seconds to convert output node ids to SWH PIDs */ - public double node2pid; - } - } - } - /** Graph where traversal endpoint is performed */ Graph graph; /** Internal traversal API */ Traversal traversal; /** * Constructor. * * @param graph the graph used for traversal endpoint * @param direction a string (either "forward" or "backward") specifying edge orientation * @param edgesFmt a formatted string describing allowed edges */ public Endpoint(Graph graph, String direction, String edgesFmt) { this.graph = graph; this.traversal = new Traversal(graph, direction, edgesFmt); } /** * Converts a list of (internal) long node ids to a list of corresponding (external) SWH PIDs. * * @param nodeIds the list of long node ids * @return a list of corresponding SWH PIDs */ private ArrayList convertNodesToSwhPIDs(ArrayList nodeIds) { ArrayList swhPIDs = new ArrayList<>(); for (long nodeId : nodeIds) { swhPIDs.add(graph.getSwhPID(nodeId)); } return swhPIDs; } /** * Converts a list of (internal) long node ids to the corresponding {@link SwhPath}. * * @param nodeIds the list of long node ids * @return the corresponding {@link SwhPath} * @see org.softwareheritage.graph.SwhPath */ private SwhPath convertNodesToSwhPath(ArrayList nodeIds) { SwhPath path = new SwhPath(); for (long nodeId : nodeIds) { path.add(graph.getSwhPID(nodeId)); } return path; } /** * Converts a list of paths made of (internal) long node ids to one made of {@link SwhPath}-s. 
* * @param pathsNodeId the list of paths with long node ids * @return a list of corresponding {@link SwhPath} * @see org.softwareheritage.graph.SwhPath */ private ArrayList convertPathsToSwhPIDs(ArrayList> pathsNodeId) { ArrayList paths = new ArrayList<>(); for (ArrayList path : pathsNodeId) { paths.add(convertNodesToSwhPath(path)); } return paths; } /** * Leaves endpoint wrapper. * * @param input input parameters for the underlying endpoint call * @return the resulting list of {@link SwhPID} from endpoint call and operation metadata * @see org.softwareheritage.graph.SwhPID * @see Traversal#leaves(long) */ public Output leaves(Input input) { Output> output = new Output<>(); long startTime; startTime = Timing.start(); long srcNodeId = graph.getNodeId(input.src); output.meta.timings.pid2node = Timing.stop(startTime); startTime = Timing.start(); ArrayList nodeIds = traversal.leaves(srcNodeId); output.meta.timings.traversal = Timing.stop(startTime); output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); startTime = Timing.start(); output.result = convertNodesToSwhPIDs(nodeIds); output.meta.timings.node2pid = Timing.stop(startTime); return output; } /** * Neighbors endpoint wrapper. * * @param input input parameters for the underlying endpoint call * @return the resulting list of {@link SwhPID} from endpoint call and operation metadata * @see org.softwareheritage.graph.SwhPID * @see Traversal#neighbors(long) */ public Output neighbors(Input input) { Output> output = new Output<>(); long startTime; startTime = Timing.start(); long srcNodeId = graph.getNodeId(input.src); output.meta.timings.pid2node = Timing.stop(startTime); startTime = Timing.start(); ArrayList nodeIds = traversal.neighbors(srcNodeId); output.meta.timings.traversal = Timing.stop(startTime); output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); startTime = Timing.start(); output.result = convertNodesToSwhPIDs(nodeIds); output.meta.timings.node2pid = Timing.stop(startTime); return output; } /** * Walk endpoint wrapper. * * @param input input parameters for the underlying endpoint call * @return the resulting {@link SwhPath} from endpoint call and operation metadata * @see org.softwareheritage.graph.SwhPID * @see org.softwareheritage.graph.SwhPath * @see Traversal#walk */ public Output walk(Input input) { Output output = new Output<>(); long startTime; startTime = Timing.start(); long srcNodeId = graph.getNodeId(input.src); output.meta.timings.pid2node = Timing.stop(startTime); ArrayList nodeIds = new ArrayList(); // Destination is either a SWH PID or a node type try { SwhPID dstSwhPID = new SwhPID(input.dstFmt); long dstNodeId = graph.getNodeId(dstSwhPID); startTime = Timing.start(); nodeIds = traversal.walk(srcNodeId, dstNodeId, input.algorithm); output.meta.timings.traversal = Timing.stop(startTime); } catch (IllegalArgumentException ignored1) { try { Node.Type dstType = Node.Type.fromStr(input.dstFmt); startTime = Timing.start(); nodeIds = traversal.walk(srcNodeId, dstType, input.algorithm); output.meta.timings.traversal = Timing.stop(startTime); } catch (IllegalArgumentException ignored2) { } } output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); startTime = Timing.start(); output.result = convertNodesToSwhPath(nodeIds); output.meta.timings.node2pid = Timing.stop(startTime); return output; } /** * VisitNodes endpoint wrapper. 
* * @param input input parameters for the underlying endpoint call * @return the resulting list of {@link SwhPID} from endpoint call and operation metadata * @see org.softwareheritage.graph.SwhPID * @see Traversal#visitNodes(long) */ public Output visitNodes(Input input) { Output> output = new Output<>(); long startTime; startTime = Timing.start(); long srcNodeId = graph.getNodeId(input.src); output.meta.timings.pid2node = Timing.stop(startTime); startTime = Timing.start(); ArrayList nodeIds = traversal.visitNodes(srcNodeId); output.meta.timings.traversal = Timing.stop(startTime); output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); startTime = Timing.start(); output.result = convertNodesToSwhPIDs(nodeIds); output.meta.timings.node2pid = Timing.stop(startTime); return output; } /** * VisitPaths endpoint wrapper. * * @param input input parameters for the underlying endpoint call * @return the resulting list of {@link SwhPath} from endpoint call and operation metadata * @see org.softwareheritage.graph.SwhPID * @see org.softwareheritage.graph.SwhPath * @see Traversal#visitPaths(long) */ public Output visitPaths(Input input) { Output> output = new Output<>(); long startTime; startTime = Timing.start(); long srcNodeId = graph.getNodeId(input.src); output.meta.timings.pid2node = Timing.stop(startTime); startTime = Timing.start(); ArrayList> paths = traversal.visitPaths(srcNodeId); output.meta.timings.traversal = Timing.stop(startTime); output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); startTime = Timing.start(); output.result = convertPathsToSwhPIDs(paths); output.meta.timings.node2pid = Timing.stop(startTime); return output; } + + /** + * Wrapper class to unify traversal methods input signatures. + */ + public static class Input { + /** Source node of endpoint call specified as a {@link SwhPID} */ + public SwhPID src; + /** + * Destination formatted string as described in the API + */ + public String dstFmt; + /** Traversal algorithm used in endpoint call (either "dfs" or "bfs") */ + public String algorithm; + + public Input(SwhPID src) { + this.src = src; + } + + public Input(SwhPID src, String dstFmt, String algorithm) { + this.src = src; + this.dstFmt = dstFmt; + this.algorithm = algorithm; + } + } + + /** + * Wrapper class to return both the endpoint result and metadata (such as timings). + */ + public static class Output { + /** The result content itself */ + public T result; + /** Various metadata about the result */ + public Meta meta; + + public Output() { + this.result = null; + this.meta = new Meta(); + } + + /** + * Endpoint result metadata. + */ + public class Meta { + /** Operations timings */ + public Timings timings; + /** Number of edges accessed during traversal */ + public long nbEdgesAccessed; + + public Meta() { + this.timings = new Timings(); + this.nbEdgesAccessed = 0; + } + + /** + * Wrapper class for JSON output format. + */ + public class Timings { + /** Time in seconds to do the traversal */ + public double traversal; + /** Time in seconds to convert input SWH PID to node id */ + public double pid2node; + /** Time in seconds to convert output node ids to SWH PIDs */ + public double node2pid; + } + } + } }
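As a usage illustration of the HTTP routes registered above in App.startServer, the sketch below queries the /leaves endpoint with the direction and edges query strings that checkQueryStrings allows for it. The host, port and source PID are placeholder assumptions (5009 is only the default value of the port option), so this is a sketch of a client call, not code from this patch.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class LeavesClientSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder source PID: any "swh:1:<type>:<40 hex digits>" identifier (50 chars total).
        String src = "swh:1:dir:0000000000000000000000000000000000000000";
        URI uri = URI.create("http://localhost:5009/leaves/" + src
                + "?direction=forward&edges=dir:cnt,dir:dir");
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder(uri).GET().build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        // The body is snake_case JSON with "result" and "meta" keys,
        // as assembled by formatEndpointOutput above.
        System.out.println(response.statusCode());
        System.out.println(response.body());
    }
}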
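NodeIdMap and MapFile above rely on fixed-length records: each line of the PID_TO_NODE file holds a 22-byte binary PID followed by an 8-byte big-endian node id, sorted by PID, which is what makes the binary search in getNodeId possible, while NODE_TO_PID holds one 22-byte PID per node id. Below is a minimal standalone sketch of the same offset arithmetic, using RandomAccessFile instead of the mmap() wrapper; the file path is assumed to be whatever basename + Graph.PID_TO_NODE resolves to, passed on the command line.

import java.io.IOException;
import java.io.RandomAccessFile;

public class PidToNodeLookupSketch {
    static final int SWH_ID_BIN_SIZE = 22;   // binary PID size, as in NodeIdMap
    static final int NODE_ID_BIN_SIZE = 8;   // long node id size, as in NodeIdMap
    static final int LINE_LENGTH = SWH_ID_BIN_SIZE + NODE_ID_BIN_SIZE;

    /** Returns the node id stored in the lineIndex-th record of a PID_TO_NODE-style file. */
    static long nodeIdAtLine(RandomAccessFile map, long lineIndex) throws IOException {
        // Same arithmetic as MapFile.readAtLine: position = lineIndex * lineLength,
        // then skip over the PID bytes to land on the node id.
        map.seek(lineIndex * (long) LINE_LENGTH + SWH_ID_BIN_SIZE);
        return map.readLong(); // big-endian, matching DataOutputStream.writeLong in MapBuilder
    }

    public static void main(String[] args) throws IOException {
        // usage: java PidToNodeLookupSketch <path to the basename + Graph.PID_TO_NODE file>
        try (RandomAccessFile map = new RandomAccessFile(args[0], "r")) {
            System.out.println(nodeIdAtLine(map, 0));
        }
    }
}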
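NodeTypesMap stores node types as a bitmap using the minimum number of bits per Node.Type, and MapBuilder computes that width as ceil(log2(number of node types)). Assuming the six SWH node types (contents, directories, origins, releases, revisions, snapshots), that is 3 bits per node. The sketch below reuses the same LongArrayBitVector / LongBigList calls on a toy size to show the round trip:

import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.longs.LongBigList;

public class TypeMapSketch {
    public static void main(String[] args) {
        int nbTypes = 6; // assumption: CNT, DIR, ORI, REL, REV, SNP
        int nbBitsPerNodeType = (int) Math.ceil(Math.log(nbTypes) / Math.log(2)); // = 3
        long nbNodes = 10;
        LongArrayBitVector vector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbNodes);
        LongBigList typeMap = vector.asLongBigList(nbBitsPerNodeType);
        typeMap.set(7, 4);                       // store type ordinal 4 for node id 7
        System.out.println(typeMap.getLong(7));  // prints 4
    }
}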
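ForkCC.compute keeps its BFS frontier on disk through ByteDiskQueue, serializing node ids to 8-byte arrays with Guava's Longs helpers. A self-contained round trip using the same calls (the temp-file prefix and buffer size are arbitrary):

import com.google.common.primitives.Longs;
import it.unimi.dsi.io.ByteDiskQueue;
import java.io.File;
import java.io.IOException;

public class DiskQueueSketch {
    public static void main(String[] args) throws IOException {
        File queueFile = File.createTempFile("disk-queue-sketch", "queue");
        ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, 1024, true);
        byte[] byteBuf = new byte[Long.BYTES];

        queue.enqueue(Longs.toByteArray(42L));   // push a node id onto the frontier
        queue.dequeue(byteBuf);                  // pop it back into the buffer
        System.out.println(Longs.fromByteArray(byteBuf)); // prints 42

        queue.close();
        queueFile.delete();                      // clean up the temporary queue file
    }
}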