diff --git a/api/server/pom.xml b/api/server/pom.xml index 12e6484..99ff2b1 100644 --- a/api/server/pom.xml +++ b/api/server/pom.xml @@ -1,132 +1,141 @@ 4.0.0 org.softwareheritage.graph swh-graph 1.0 swh-graph https://www.softwareheritage.org/ UTF-8 11 junit junit 4.11 test org.hamcrest hamcrest 2.1 test io.javalin javalin 3.0.0 org.slf4j slf4j-simple 1.7.26 com.fasterxml.jackson.core jackson-databind 2.9.8 it.unimi.dsi webgraph-big 3.5.1 it.unimi.dsi fastutil 8.2.2 maven-clean-plugin 3.1.0 maven-resources-plugin 3.0.2 maven-compiler-plugin 3.8.0 -verbose -Xlint:all maven-surefire-plugin 2.22.1 maven-jar-plugin 3.0.2 maven-install-plugin 2.5.2 maven-deploy-plugin 2.8.2 maven-site-plugin 3.7.1 maven-project-info-reports-plugin 3.0.0 maven-assembly-plugin org.softwareheritage.graph.App jar-with-dependencies make-assembly package single + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.1.1 + + + diff --git a/api/server/src/main/java/org/softwareheritage/graph/AllowedEdges.java b/api/server/src/main/java/org/softwareheritage/graph/AllowedEdges.java index 784207d..8c74758 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/AllowedEdges.java +++ b/api/server/src/main/java/org/softwareheritage/graph/AllowedEdges.java @@ -1,58 +1,90 @@ package org.softwareheritage.graph; import java.util.ArrayList; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; +/** + * Edge restriction based on node types, used when visiting the graph. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class AllowedEdges { + /** Graph on which edge restriction is performed */ Graph graph; - // First dimension is source node type, second dimension is destination node type + /** + * 2D boolean matrix storing access rights for all combination of src/dst node types (first + * dimension is source, second dimension is destination) + */ boolean[][] allowed; + /** + * Constructor. + * + * @param graph the graph on which to perform edge restriction + * @param edgesFmt a formatted string describing allowed edges (TODO: link API doc) + */ public AllowedEdges(Graph graph, String edgesFmt) { this.graph = graph; int nbNodeTypes = Node.Type.values().length; this.allowed = new boolean[nbNodeTypes][nbNodeTypes]; // Special values (null, empty, "*") if (edgesFmt == null || edgesFmt.isEmpty()) { return; } if (edgesFmt.equals("*")) { for (int i = 0; i < nbNodeTypes; i++) { for (int j = 0; j < nbNodeTypes; j++) { allowed[i][j] = true; } } return; } // Format: "src1:dst1,src2:dst2,[...]" + // TODO: link API doc String[] edgeTypes = edgesFmt.split(","); for (String edgeType : edgeTypes) { String[] nodeTypes = edgeType.split(":"); if (nodeTypes.length != 2) { throw new IllegalArgumentException("Cannot parse edge type: " + edgeType); } ArrayList srcTypes = Node.Type.parse(nodeTypes[0]); ArrayList dstTypes = Node.Type.parse(nodeTypes[1]); for (Node.Type srcType : srcTypes) { for (Node.Type dstType : dstTypes) { allowed[srcType.ordinal()][dstType.ordinal()] = true; } } } } + /** + * Checks if a given edge can be followed during graph traversal. + * + * @param srcNodeId edge source node + * @param dstNodeId edge destination node + * @return true if allowed and false otherwise + */ public boolean isAllowed(long srcNodeId, long dstNodeId) { Node.Type srcType = graph.getNodeType(srcNodeId); Node.Type dstType = graph.getNodeType(dstNodeId); return isAllowed(srcType, dstType); } + /** + * Works like {@link AllowedEdges#isAllowed(long, long)} but with node types directly instead of + * node ids. + * + * @see AllowedEdges#isAllowed(long, long) + */ public boolean isAllowed(Node.Type srcType, Node.Type dstType) { return allowed[srcType.ordinal()][dstType.ordinal()]; } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/App.java b/api/server/src/main/java/org/softwareheritage/graph/App.java index e02a6a5..841e37c 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/App.java +++ b/api/server/src/main/java/org/softwareheritage/graph/App.java @@ -1,124 +1,132 @@ package org.softwareheritage.graph; import java.io.IOException; import java.util.List; import java.util.Map; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.PropertyNamingStrategy; import io.javalin.Javalin; import io.javalin.http.Context; import io.javalin.plugin.json.JavalinJackson; import org.softwareheritage.graph.Endpoint; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SwhId; import org.softwareheritage.graph.algo.Stats; +/** + * Entrypoint of the swh-graph server REST API. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class App { public static void main(String[] args) throws IOException { String path = args[0]; Graph graph = new Graph(path); Stats stats = new Stats(path); // Clean up on exit Runtime.getRuntime().addShutdownHook(new Thread() { public void run() { try { graph.cleanUp(); } catch (IOException e) { System.out.println("Could not clean up graph on exit: " + e); } } }); // Configure Jackson JSON to use snake case naming style ObjectMapper objectMapper = JavalinJackson.getObjectMapper(); objectMapper.setPropertyNamingStrategy(PropertyNamingStrategy.SNAKE_CASE); JavalinJackson.configure(objectMapper); Javalin app = Javalin.create().start(5009); app.before("/stats/*", ctx -> { checkQueryStrings(ctx, ""); }); app.before("/leaves/*", ctx -> { checkQueryStrings(ctx, "direction|edges"); }); app.before("/neighbors/*", ctx -> { checkQueryStrings(ctx, "direction|edges"); }); app.before("/visit/*", ctx -> { checkQueryStrings(ctx, "direction|edges"); }); app.before("/walk/*", ctx -> { checkQueryStrings(ctx, "direction|edges|traversal"); }); app.get("/stats/", ctx -> { ctx.json(stats); }); // Graph traversal endpoints // By default the traversal is a forward DFS using all edges app.get("/leaves/:src", ctx -> { SwhId src = new SwhId(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); ctx.json(endpoint.leaves(src)); }); app.get("/neighbors/:src", ctx -> { SwhId src = new SwhId(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); ctx.json(endpoint.neighbors(src)); }); // TODO: anonymous class to return both nodes/paths? (waiting on node types map merged/refactor) /*app.get("/visit/:src", ctx -> { SwhId src = new SwhId(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Visit visit = new Visit(graph, src, edgesFmt, direction, Visit.OutputFmt.NODES_AND_PATHS); ctx.json(visit); });*/ app.get("/visit/nodes/:src", ctx -> { SwhId src = new SwhId(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); ctx.json(endpoint.visitNodes(src)); }); app.get("/visit/paths/:src", ctx -> { SwhId src = new SwhId(ctx.pathParam("src")); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); ctx.json(endpoint.visitPaths(src)); }); app.get("/walk/:src/:dst", ctx -> { SwhId src = new SwhId(ctx.pathParam("src")); String dstFmt = ctx.pathParam("dst"); String direction = ctx.queryParam("direction", "forward"); String edgesFmt = ctx.queryParam("edges", "*"); String algorithm = ctx.queryParam("traversal", "dfs"); Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); ctx.json(endpoint.walk(src, dstFmt, algorithm)); }); app.exception(IllegalArgumentException.class, (e, ctx) -> { ctx.status(400); ctx.result(e.getMessage()); }); } private static void checkQueryStrings(Context ctx, String allowedFmt) { Map> queryParamMap = ctx.queryParamMap(); for (String key : queryParamMap.keySet()) { if (!key.matches(allowedFmt)) { throw new IllegalArgumentException("Unknown query string: " + key); } } } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/Endpoint.java b/api/server/src/main/java/org/softwareheritage/graph/Endpoint.java index 5742cad..a851291 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/Endpoint.java +++ b/api/server/src/main/java/org/softwareheritage/graph/Endpoint.java @@ -1,85 +1,166 @@ package org.softwareheritage.graph; import java.util.ArrayList; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SwhId; import org.softwareheritage.graph.SwhPath; import org.softwareheritage.graph.algo.Traversal; +/** + * REST API endpoints wrapper functions. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class Endpoint { + /** Graph where traversal endpoint is performed */ Graph graph; + /** Internal traversal API */ Traversal traversal; + /** + * Constructor. + * + * @param graph the graph used for traversal endpoint + * @param direction a string (either "forward" or "backward") specifying edge orientation + * @param edgesFmt a formatted string describing allowed edges (TODO: link API doc) + */ public Endpoint(Graph graph, String direction, String edgesFmt) { this.graph = graph; this.traversal = new Traversal(graph, direction, edgesFmt); } + /** + * Converts a list of (internal) long node ids to a list of corresponding (external) SWH PIDs. + * + * @param nodeIds the list of long node ids + * @return a list of corresponding SWH PIDs + */ private ArrayList convertNodesToSwhIds(ArrayList nodeIds) { ArrayList swhIds = new ArrayList<>(); for (long nodeId : nodeIds) { swhIds.add(graph.getSwhId(nodeId)); } return swhIds; } + /** + * Converts a list of (internal) long node ids to the corresponding {@link SwhPath}. + * + * @param nodeIds the list of long node ids + * @return the corresponding {@link SwhPath} + * @see org.softwareheritage.graph.SwhPath + */ private SwhPath convertNodesToSwhPath(ArrayList nodeIds) { SwhPath path = new SwhPath(); for (long nodeId : nodeIds) { path.add(graph.getSwhId(nodeId)); } return path; } + /** + * Converts a list of paths made of (internal) long node ids to one made of {@link SwhPath}-s. + * + * @param pathsNodeId the list of paths with long node ids + * @return a list of corresponding {@link SwhPath} + * @see org.softwareheritage.graph.SwhPath + */ private ArrayList convertPathsToSwhIds(ArrayList> pathsNodeId) { ArrayList paths = new ArrayList<>(); for (ArrayList path : pathsNodeId) { paths.add(convertNodesToSwhPath(path)); } return paths; } + /** + * Leaves endpoint wrapper (performs input/output node ids conversions). + * + * @param src source node of endpoint call specified as a SwhId + * @return the resulting list of SwhId from endpoint call + * @see org.softwareheritage.graph.SwhId + * @see org.softwareheritage.graph.algo.Traversal#leaves(long) + */ public ArrayList leaves(SwhId src) { long srcNodeId = graph.getNodeId(src); ArrayList nodeIds = traversal.leaves(srcNodeId); return convertNodesToSwhIds(nodeIds); } + /** + * Neighbors endpoint wrapper (performs input/output node ids conversions). + * + * @param src source node of endpoint call specified as a SwhId + * @return the resulting list of SwhId from endpoint call + * @see org.softwareheritage.graph.SwhId + * @see org.softwareheritage.graph.algo.Traversal#neighbors(long) + */ public ArrayList neighbors(SwhId src) { long srcNodeId = graph.getNodeId(src); ArrayList nodeIds = traversal.neighbors(srcNodeId); return convertNodesToSwhIds(nodeIds); } + /** + * Walk endpoint wrapper (performs input/output node ids conversions). + * + * @param src source node of endpoint call specified as a SwhId + * @param dstFmt destination used in endpoint call (TODO: link to API doc) + * @param algorithm traversal algorithm used in endpoint call (either "dfs" or "bfs") + * @return the resulting {@link SwhPath} from endpoint call + * @see org.softwareheritage.graph.SwhId + * @see org.softwareheritage.graph.SwhPath + * @see org.softwareheritage.graph.algo.Traversal#walk + */ public SwhPath walk(SwhId src, String dstFmt, String algorithm) { long srcNodeId = graph.getNodeId(src); ArrayList nodeIds = null; // Destination is either a SWH ID or a node type try { SwhId dstSwhId = new SwhId(dstFmt); long dstNodeId = graph.getNodeId(dstSwhId); nodeIds = traversal.walk(srcNodeId, dstNodeId, algorithm); } catch (IllegalArgumentException ignored1) { try { Node.Type dstType = Node.Type.fromStr(dstFmt); nodeIds = traversal.walk(srcNodeId, dstType, algorithm); } catch (IllegalArgumentException ignored2) { } } return convertNodesToSwhPath(nodeIds); } + /** + * VisitNodes endpoint wrapper (performs input/output node ids conversions). + * + * @param src source node of endpoint call specified as a SwhId + * @return the resulting list of SwhId from endpoint call + * @see org.softwareheritage.graph.SwhId + * @see org.softwareheritage.graph.algo.Traversal#visitNodes(long) + */ public ArrayList visitNodes(SwhId src) { long srcNodeId = graph.getNodeId(src); ArrayList nodeIds = traversal.visitNodes(srcNodeId); return convertNodesToSwhIds(nodeIds); } + /** + * VisitPaths endpoint wrapper (performs input/output node ids conversions). + * + * @param src source node of endpoint call specified as a SwhId + * @return the resulting list of {@link SwhPath} from endpoint call + * @see org.softwareheritage.graph.SwhId + * @see org.softwareheritage.graph.SwhPath + * @see org.softwareheritage.graph.algo.Traversal#visitPaths(long) + */ public ArrayList visitPaths(SwhId src) { long srcNodeId = graph.getNodeId(src); ArrayList> paths = traversal.visitPaths(srcNodeId); return convertPathsToSwhIds(paths); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/Graph.java b/api/server/src/main/java/org/softwareheritage/graph/Graph.java index e265f67..dc90a40 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/Graph.java +++ b/api/server/src/main/java/org/softwareheritage/graph/Graph.java @@ -1,82 +1,183 @@ package org.softwareheritage.graph; import java.io.IOException; import it.unimi.dsi.big.webgraph.BVGraph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SwhId; import org.softwareheritage.graph.backend.NodeIdMap; import org.softwareheritage.graph.backend.NodeTypesMap; +/** + * Main class storing the compressed graph and node id mappings. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class Graph { + /** File extension for the SWH PID to long node id map */ public static final String PID_TO_NODE = ".pid2node.csv"; + /** File extension for the long node id to SWH PID map */ public static final String NODE_TO_PID = ".node2pid.csv"; + /** File extension for the long node id to node typ map */ public static final String NODE_TO_TYPE = ".node2type.map"; + /** Compressed graph stored as a {@link it.unimi.dsi.big.webgraph.BVGraph} */ BVGraph graph; + /** Transposed compressed graph (used for backward traversals) */ BVGraph graphTransposed; + /** Path and basename of the compressed graph */ String path; + /** Mapping long id ↔ SWH PIDs */ NodeIdMap nodeIdMap; + /** Mapping long id → node types */ NodeTypesMap nodeTypesMap; + /** + * Constructor. + * + * @param path path and basename of the compressed graph to load + */ public Graph(String path) throws IOException { this.graph = BVGraph.load(path); this.graphTransposed = BVGraph.load(path + "-transposed"); this.path = path; this.nodeIdMap = new NodeIdMap(path, getNbNodes()); this.nodeTypesMap = new NodeTypesMap(path); } + /** + * Cleans up graph resources after use. + */ public void cleanUp() throws IOException { nodeIdMap.close(); } + /** + * Returns the graph full path. + * + * @return graph full path + */ public String getPath() { return path; } + /** + * Converts {@link SwhId} node to long. + * + * @param swhId node specified as a {@link SwhId} + * @return internal long node id + * @see org.softwareheritage.graph.SwhId + */ public long getNodeId(SwhId swhId) { return nodeIdMap.getNodeId(swhId); } + /** + * Converts long id node to {@link SwhId}. + * + * @param nodeId node specified as a long id + * @return external SWH PID + * @see org.softwareheritage.graph.SwhId + */ public SwhId getSwhId(long nodeId) { return nodeIdMap.getSwhId(nodeId); } + /** + * Returns node type. + * + * @param nodeId node specified as a long id + * @return corresponding node type + * @see org.softwareheritage.graph.Node.Type + */ public Node.Type getNodeType(long nodeId) { return nodeTypesMap.getType(nodeId); } + /** + * Returns number of nodes in the graph. + * + * @return number of nodes in the graph + */ public long getNbNodes() { return graph.numNodes(); } + /** + * Returns number of edges in the graph. + * + * @return number of edges in the graph + */ public long getNbEdges() { return graph.numArcs(); } + /** + * Returns list of successors of a node. + * + * @param nodeId node specified as a long id + * @return list of successors of the node, specified as a {@link LongBigArrays} + * @see it.unimi.dsi.fastutil.longs.LongBigArrays + */ public long[][] successors(long nodeId) { return graph.successorBigArray(nodeId); } + /** + * Returns the outdegree of a node. + * + * @param nodeId node specified as a long id + * @return outdegree of a node + */ public long outdegree(long nodeId) { return graph.outdegree(nodeId); } + /** + * Returns list of predecessors of a node. + * + * @param nodeId node specified as a long id + * @return list of predecessors of the node, specified as a {@link LongBigArrays} + * @see it.unimi.dsi.fastutil.longs.LongBigArrays + */ public long[][] predecessors(long nodeId) { return graphTransposed.successorBigArray(nodeId); } + /** + * Returns the indegree of a node. + * + * @param nodeId node specified as a long id + * @return indegree of a node + */ public long indegree(long nodeId) { return graphTransposed.outdegree(nodeId); } + /** + * Returns the degree of a node, depending on graph orientation. + * + * @param nodeId node specified as a long id + * @param useTransposed boolean value to use transposed graph + * @return degree of a node + */ public long degree(long nodeId, boolean useTransposed) { return (useTransposed) ? indegree(nodeId) : outdegree(nodeId); } + /** + * Returns the neighbors of a node, depending on graph orientation. + * + * @param nodeId node specified as a long id + * @param useTransposed boolean value to use transposed graph + * @return list of neighbors of the node, specified as a {@link LongBigArrays} + * @see it.unimi.dsi.fastutil.longs.LongBigArrays + */ public long[][] neighbors(long nodeId, boolean useTransposed) { return (useTransposed) ? predecessors(nodeId) : successors(nodeId); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/Neighbors.java b/api/server/src/main/java/org/softwareheritage/graph/Neighbors.java index c31f430..f2d08e3 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/Neighbors.java +++ b/api/server/src/main/java/org/softwareheritage/graph/Neighbors.java @@ -1,56 +1,84 @@ package org.softwareheritage.graph; import java.util.Iterator; import it.unimi.dsi.fastutil.longs.LongBigArrays; import org.softwareheritage.graph.AllowedEdges; import org.softwareheritage.graph.Graph; +/** + * Iterator class to go over a node neighbors in the graph. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class Neighbors implements Iterable { + /** Graph used to explore neighbors */ Graph graph; + /** Boolean to specify the use of the transposed graph */ boolean useTransposed; + /** Graph edge restriction */ AllowedEdges edges; + /** Source node from which neighbors will be listed */ long srcNodeId; + /** + * Constructor. + * + * @param graph graph used to explore neighbors + * @param useTransposed boolean value to use transposed graph + * @param edges edges allowed to be used in the graph + * @param srcNodeId source node from where to list neighbors + */ public Neighbors(Graph graph, boolean useTransposed, AllowedEdges edges, long srcNodeId) { this.graph = graph; this.useTransposed = useTransposed; this.edges = edges; this.srcNodeId = srcNodeId; } @Override public Iterator iterator() { return new NeighborsIterator(); } + /** + * Inner class for {@link Neighbors} iterator. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class NeighborsIterator implements Iterator { long nextNeighborIdx; long nbNeighbors; long[][] neighbors; public NeighborsIterator() { this.nextNeighborIdx = -1; this.nbNeighbors = graph.degree(srcNodeId, useTransposed); this.neighbors = graph.neighbors(srcNodeId, useTransposed); } // Look ahead because with edge restriction not all neighbors are considered public boolean hasNext() { for (long lookAheadIdx = nextNeighborIdx + 1; lookAheadIdx < nbNeighbors; lookAheadIdx++) { long nextNodeId = LongBigArrays.get(neighbors, lookAheadIdx); if (edges.isAllowed(srcNodeId, nextNodeId)) { nextNeighborIdx = lookAheadIdx; return true; } } return false; } public Long next() { long nextNodeId = LongBigArrays.get(neighbors, nextNeighborIdx); return nextNodeId; } } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/Node.java b/api/server/src/main/java/org/softwareheritage/graph/Node.java index 4fe9823..a883737 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/Node.java +++ b/api/server/src/main/java/org/softwareheritage/graph/Node.java @@ -1,48 +1,86 @@ package org.softwareheritage.graph; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +/** + * A node in the Software Heritage graph. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class Node { + /** + * Software Heritage graph node types, as described in the + * data model. + */ public enum Type { + /** Content node */ CNT, + /** Directory node */ DIR, + /** Release node */ REL, + /** Revision node */ REV, + /** Snapshot node */ SNP; + /** + * Converts integer to corresponding SWH node type. + * + * @param intType node type represented as an integer + * @return the corresponding {@link Node.Type} value + * @see org.softwareheritage.graph.Node.Type + */ public static Node.Type fromInt(int intType) { switch (intType) { case 0: return CNT; case 1: return DIR; case 2: return REL; case 3: return REV; case 4: return SNP; } return null; } + /** + * Parses SWH node type from string. + * + * @param strType node type represented as a string + * @return the corresponding {@link Node.Type} value + * @see org.softwareheritage.graph.Node.Type + */ public static Node.Type fromStr(String strType) { return Node.Type.valueOf(strType.toUpperCase()); } - public static ArrayList parse(String strType) { + /** + * Parses SWH node type possible values from formatted string (TODO: link API doc). + * + * @param strFmtType node types represented as a formatted string (TODO: link API doc) + * @return a list containing the {@link Node.Type} values + * @see org.softwareheritage.graph.Node.Type + */ + public static ArrayList parse(String strFmtType) { ArrayList types = new ArrayList<>(); - if (strType.equals("*")) { + if (strFmtType.equals("*")) { List nodeTypes = Arrays.asList(Node.Type.values()); types.addAll(nodeTypes); } else { - types.add(Node.Type.fromStr(strType)); + types.add(Node.Type.fromStr(strFmtType)); } return types; } } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/SwhId.java b/api/server/src/main/java/org/softwareheritage/graph/SwhId.java index a5ad7c1..5e70e1b 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/SwhId.java +++ b/api/server/src/main/java/org/softwareheritage/graph/SwhId.java @@ -1,67 +1,101 @@ package org.softwareheritage.graph; import com.fasterxml.jackson.annotation.JsonValue; import org.softwareheritage.graph.Node; +/** + * A Software Heritage PID, see persistent + * identifier documentation. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class SwhId { + /** Fixed hash length of the PID */ public static final int HASH_LENGTH = 40; + /** Full PID as a string */ String swhId; + /** PID node type */ Node.Type type; + /** PID hex-encoded SHA1 hash */ String hash; - // SWH ID format: 'swh:1:type:hash' - // https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html + /** + * Constructor. + * + * @param swhId full PID as a string + */ public SwhId(String swhId) { this.swhId = swhId; + // PID format: 'swh:1:type:hash' String[] parts = swhId.split(":"); if (parts.length != 4 || !parts[0].equals("swh") || !parts[1].equals("1")) { throw new IllegalArgumentException("Expected SWH ID format to be 'swh:1:type:hash', got: " + swhId); } String type = parts[2]; if (!type.matches("cnt|dir|rel|rev|snp")) { throw new IllegalArgumentException("Unknown SWH ID type in: " + swhId); } this.type = Node.Type.fromStr(type); this.hash = parts[3]; if (!hash.matches("[0-9a-f]{" + HASH_LENGTH + "}")) { throw new IllegalArgumentException("Wrong SWH ID hash format in: " + swhId); } } @Override public boolean equals(Object otherObj) { if (otherObj == this) return true; if (!(otherObj instanceof SwhId)) return false; SwhId other = (SwhId) otherObj; return swhId.equals(other.getSwhId()); } @Override public int hashCode() { return swhId.hashCode(); } @Override public String toString() { return swhId; } + /** + * Returns full PID as a string. + * + * @return full PID string + */ @JsonValue public String getSwhId() { return swhId; } + /** + * Returns PID node type. + * + * @return PID corresponding {@link Node.Type} + * @see org.softwareheritage.graph.Node.Type + */ public Node.Type getType() { return type; } + /** + * Returns PID hex-encoded SHA1 hash. + * + * @return PID string hash + */ public String getHash() { return hash; } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/SwhPath.java b/api/server/src/main/java/org/softwareheritage/graph/SwhPath.java index be9c80b..4d45849 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/SwhPath.java +++ b/api/server/src/main/java/org/softwareheritage/graph/SwhPath.java @@ -1,76 +1,123 @@ package org.softwareheritage.graph; import java.util.ArrayList; import com.fasterxml.jackson.annotation.JsonValue; import org.softwareheritage.graph.SwhId; +/** + * Wrapper class to store a list of {@link SwhId}. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + * @see org.softwareheritage.graph.SwhId + */ + public class SwhPath { ArrayList path; + /** + * Constructor. + */ public SwhPath() { this.path = new ArrayList(); } + /** + * Constructor. + * + * @param swhIds variable number of string PIDs to initialize this path with + */ public SwhPath(String ...swhIds) { this(); for (String swhId : swhIds) { add(new SwhId(swhId)); } } + /** + * Constructor. + * + * @param swhIds variable number of @{link SwhId} to initialize this path with + * @see org.softwareheritage.graph.SwhId + */ public SwhPath(SwhId ...swhIds) { this(); for (SwhId swhId : swhIds) { add(swhId); } } + /** + * Returns this path as a list of {@link SwhId}. + * + * @return list of {@link SwhId} constituting the path + * @see org.softwareheritage.graph.SwhId + */ @JsonValue public ArrayList getPath() { return path; } + /** + * Adds a {@link SwhId} to this path. + * + * @param {@link SwhId} to add to this path + * @see org.softwareheritage.graph.SwhId + */ public void add(SwhId swhId) { path.add(swhId); } + /** + * Returns the {@link SwhId} at the specified position in this path. + * + * @param index position of the {@link SwhId} to return + * @return {@link SwhId} at the specified position + * @see org.softwareheritage.graph.SwhId + */ public SwhId get(int index) { return path.get(index); } + /** + * Returns the number of elements in this path. + * + * @return number of elements in this path + */ public int size() { return path.size(); } @Override public boolean equals(Object otherObj) { if (otherObj == this) return true; if (!(otherObj instanceof SwhPath)) return false; SwhPath other = (SwhPath) otherObj; if (size() != other.size()) { return false; } for (int i = 0; i < size(); i++) { SwhId thisSwhId = get(i); SwhId otherSwhId = other.get(i); if (!thisSwhId.equals(otherSwhId)) { return false; } } return true; } @Override public String toString() { String str = new String(); for (SwhId swhId : path) { str += swhId + "/"; } return str; } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/algo/Stats.java b/api/server/src/main/java/org/softwareheritage/graph/algo/Stats.java index e5304f7..67683c9 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/algo/Stats.java +++ b/api/server/src/main/java/org/softwareheritage/graph/algo/Stats.java @@ -1,54 +1,67 @@ package org.softwareheritage.graph.algo; import java.io.FileInputStream; import java.io.IOException; import java.util.Properties; +/** + * Statistics on the compressed graph. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class Stats { public class Counts { public long nodes; public long edges; } public class Ratios { public double compression; public double bitsPerNode; public double bitsPerEdge; public double avgLocality; } public class Degree { public long min; public long max; public double avg; } public Counts counts; public Ratios ratios; public Degree indegree; public Degree outdegree; + /** + * Constructor. + * + * @param graphPath full path of compressed graph + */ public Stats(String graphPath) throws IOException { Properties properties = new Properties(); properties.load(new FileInputStream(graphPath + ".properties")); properties.load(new FileInputStream(graphPath + ".stats")); this.counts = new Counts(); this.ratios = new Ratios(); this.indegree = new Degree(); this.outdegree = new Degree(); this.counts.nodes = Long.parseLong(properties.getProperty("nodes")); this.counts.edges = Long.parseLong(properties.getProperty("arcs")); this.ratios.compression = Double.parseDouble(properties.getProperty("compratio")); this.ratios.bitsPerNode = Double.parseDouble(properties.getProperty("bitspernode")); this.ratios.bitsPerEdge = Double.parseDouble(properties.getProperty("bitsperlink")); this.ratios.avgLocality = Double.parseDouble(properties.getProperty("avglocality")); this.indegree.min = Long.parseLong(properties.getProperty("minindegree")); this.indegree.max = Long.parseLong(properties.getProperty("maxindegree")); this.indegree.avg = Double.parseDouble(properties.getProperty("avgindegree")); this.outdegree.min = Long.parseLong(properties.getProperty("minoutdegree")); this.outdegree.max = Long.parseLong(properties.getProperty("maxoutdegree")); this.outdegree.avg = Double.parseDouble(properties.getProperty("avgoutdegree")); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/algo/Traversal.java b/api/server/src/main/java/org/softwareheritage/graph/algo/Traversal.java index c9c2e04..966710a 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/algo/Traversal.java +++ b/api/server/src/main/java/org/softwareheritage/graph/algo/Traversal.java @@ -1,219 +1,309 @@ package org.softwareheritage.graph.algo; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.Queue; import java.util.Stack; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.longs.LongBigArrays; import org.softwareheritage.graph.AllowedEdges; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Neighbors; import org.softwareheritage.graph.Node; +/** + * Traversal algorithms on the compressed graph. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class Traversal { + /** Graph used in the traversal */ Graph graph; + /** Boolean to specify the use of the transposed graph */ boolean useTransposed; + /** Graph edge restriction */ AllowedEdges edges; - // Big array storing if we have visited a node + /** + * Bit array storing if we have visited a node + * @see it.unimi.dsi.bits.LongArrayBitVector + */ LongArrayBitVector visited; - // Big array storing parents to retrieve the path when backtracking + /** + * Array storing parent node id for each nodes during a traversal + * @see it.unimi.dsi.fastutil.longs.LongBigArrays + */ long[][] nodeParent; + /** + * Constructor. + * + * @param graph graph used in the traversal + * @param direction a string (either "forward" or "backward") specifying edge orientation + * @param edgesFmt a formatted string describing allowed edges (TODO: link API doc) + */ public Traversal(Graph graph, String direction, String edgesFmt) { if (!direction.matches("forward|backward")) { throw new IllegalArgumentException("Unknown traversal direction: " + direction); } this.graph = graph; this.useTransposed = (direction.equals("backward")); this.edges = new AllowedEdges(graph, edgesFmt); long nbNodes = graph.getNbNodes(); this.visited = LongArrayBitVector.ofLength(nbNodes); this.nodeParent = LongBigArrays.newBigArray(nbNodes); } + /** + * Returns the leaves of a subgraph rooted at the specified source node. + * + * @param srcNodeId source node + * @return list of node ids corresponding to the leaves + */ public ArrayList leaves(long srcNodeId) { ArrayList nodeIds = new ArrayList(); Stack stack = new Stack(); this.visited.fill(false); stack.push(srcNodeId); visited.set(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); long neighborsCnt = 0; for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { neighborsCnt++; if (!visited.getBoolean(neighborNodeId)) { stack.push(neighborNodeId); visited.set(neighborNodeId); } } if (neighborsCnt == 0) { nodeIds.add(currentNodeId); } } return nodeIds; } + /** + * Returns node direct neighbors (linked with exactly one edge). + * + * @param srcNodeId source node + * @return list of node ids corresponding to the neighbors + */ public ArrayList neighbors(long srcNodeId) { ArrayList nodeIds = new ArrayList(); for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, srcNodeId)) { nodeIds.add(neighborNodeId); } return nodeIds; } + /** + * Performs a graph traversal and returns explored nodes. + * + * @param srcNodeId source node + * @return list of explored node ids + */ public ArrayList visitNodes(long srcNodeId) { ArrayList nodeIds = new ArrayList(); Stack stack = new Stack(); this.visited.fill(false); stack.push(srcNodeId); visited.set(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); nodeIds.add(currentNodeId); for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { if (!visited.getBoolean(neighborNodeId)) { stack.push(neighborNodeId); visited.set(neighborNodeId); } } } return nodeIds; } + /** + * Performs a graph traversal and returns explored paths. + * + * @param srcNodeId source node + * @return list of explored paths (represented as a list of node ids) + */ public ArrayList> visitPaths(long srcNodeId) { ArrayList> paths = new ArrayList<>(); Stack currentPath = new Stack(); visitPathsInternal(srcNodeId, paths, currentPath); return paths; } + /** + * Internal recursive function of {@link #visitPaths}. + * + * @param currentNodeId current node + * @param paths list of currently stored paths + * @param currentPath current path as node ids + */ private void visitPathsInternal( long currentNodeId, ArrayList> paths, Stack currentPath) { currentPath.push(currentNodeId); long visitedNeighbors = 0; for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { visitPathsInternal(neighborNodeId, paths, currentPath); visitedNeighbors++; } if (visitedNeighbors == 0) { ArrayList path = new ArrayList(); for (long nodeId : currentPath) { path.add(nodeId); } paths.add(path); } currentPath.pop(); } + /** + * Performs a graph traversal and returns the first found path from source to destination. + * + * @param srcNodeId source node + * @param dst destination (either a node or a node type) + * @return found path as a list of node ids + */ public ArrayList walk(long srcNodeId, T dst, String algorithm) { long dstNodeId = -1; if (algorithm.equals("dfs")) { dstNodeId = walkInternalDfs(srcNodeId, dst); } else if (algorithm.equals("bfs")) { dstNodeId = walkInternalBfs(srcNodeId, dst); } else { throw new IllegalArgumentException("Unknown traversal algorithm: " + algorithm); } if (dstNodeId == -1) { throw new IllegalArgumentException("Unable to find destination point: " + dst); } ArrayList nodeIds = backtracking(srcNodeId, dstNodeId); return nodeIds; } + /** + * Internal DFS function of {@link #walk}. + * + * @param srcNodeId source node + * @param dst destination (either a node or a node type) + * @return final destination node or -1 if no path found + */ private long walkInternalDfs(long srcNodeId, T dst) { Stack stack = new Stack(); this.visited.fill(false); stack.push(srcNodeId); visited.set(srcNodeId); while (!stack.isEmpty()) { long currentNodeId = stack.pop(); if (isDstNode(currentNodeId, dst)) { return currentNodeId; } for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { if (!visited.getBoolean(neighborNodeId)) { stack.push(neighborNodeId); visited.set(neighborNodeId); LongBigArrays.set(nodeParent, neighborNodeId, currentNodeId); } } } return -1; } + /** + * Internal BFS function of {@link #walk}. + * + * @param srcNodeId source node + * @param dst destination (either a node or a node type) + * @return final destination node or -1 if no path found + */ private long walkInternalBfs(long srcNodeId, T dst) { Queue queue = new LinkedList(); this.visited.fill(false); queue.add(srcNodeId); visited.set(srcNodeId); while (!queue.isEmpty()) { long currentNodeId = queue.poll(); if (isDstNode(currentNodeId, dst)) { return currentNodeId; } for (long neighborNodeId : new Neighbors(graph, useTransposed, edges, currentNodeId)) { if (!visited.getBoolean(neighborNodeId)) { queue.add(neighborNodeId); visited.set(neighborNodeId); LongBigArrays.set(nodeParent, neighborNodeId, currentNodeId); } } } return -1; } + /** + * Internal function of {@link #walk} to check if a node corresponds to the destination. + * + * @param nodeId current node + * @param dst destination (either a node or a node type) + * @return true if the node is a destination, or false otherwise + */ private boolean isDstNode(long nodeId, T dst) { if (dst instanceof Long) { long dstNodeId = (Long) dst; return nodeId == dstNodeId; } else if (dst instanceof Node.Type) { Node.Type dstType = (Node.Type) dst; return graph.getNodeType(nodeId) == dstType; } else { return false; } } + /** + * Internal backtracking function of {@link #walk}. + * + * @param srcNodeId source node + * @param dstNodeId destination node + * @return the found path, as a list of node ids + */ private ArrayList backtracking(long srcNodeId, long dstNodeId) { ArrayList path = new ArrayList(); long currentNodeId = dstNodeId; while (currentNodeId != srcNodeId) { path.add(currentNodeId); currentNodeId = LongBigArrays.get(nodeParent, currentNodeId); } path.add(srcNodeId); Collections.reverse(path); return path; } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/MapFile.java b/api/server/src/main/java/org/softwareheritage/graph/backend/MapFile.java index de3f8ad..46549ff 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/MapFile.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/MapFile.java @@ -1,36 +1,64 @@ package org.softwareheritage.graph.backend; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import it.unimi.dsi.io.ByteBufferInputStream; +/** + * Wrapper class around very big mmap()-ed file. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class MapFile { + /** + * Memory-mapped file buffer + * @see it.unimi.dsi.io.ByteBufferInputStream + */ ByteBufferInputStream bufferMap; + /** Fixed line length of the mmap()-ed file */ int lineLength; + /** + * Constructor. + * + * @param path file path to mmap() + * @param lineLength fixed length of a line in the file + */ public MapFile(String path, int lineLength) throws IOException { this.bufferMap = null; this.lineLength = lineLength; try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "r")) { FileChannel fileChannel = mapFile.getChannel(); bufferMap = ByteBufferInputStream.map(fileChannel, FileChannel.MapMode.READ_ONLY); } } + /** + * Returns a specific line in the file. + * + * @param lineIndex line number in the file + * @return the line at the specified position + */ public String readAtLine(long lineIndex) { byte[] buffer = new byte[lineLength]; long position = lineIndex * (long) lineLength; bufferMap.position(position); bufferMap.read(buffer, 0, lineLength); String line = new String(buffer); return line.trim(); } + /** + * Closes the mmap()-ed file. + */ public void close() throws IOException { bufferMap.close(); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java index b510281..73c0d8f 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java @@ -1,76 +1,111 @@ package org.softwareheritage.graph.backend; import java.io.IOException; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SwhId; import org.softwareheritage.graph.backend.MapFile; +/** + * Mapping between internal long node id and external SWH PID. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class NodeIdMap { + /** Fixed length of full SWH PID */ public static final int SWH_ID_LENGTH = 50; + /** Fixed length of long node id */ public static final int NODE_ID_LENGTH = 20; + /** Full graph path */ String graphPath; + /** Number of ids to map */ long nbIds; + /** mmap()-ed PID_TO_NODE file */ MapFile swhToNodeMap; + /** mmap()-ed NODE_TO_PID file */ MapFile nodeToSwhMap; + /** + * Constructor. + * + * @param graphPath full graph path + * @param nbNodes number of nodes in the graph + */ public NodeIdMap(String graphPath, long nbNodes) throws IOException { this.graphPath = graphPath; this.nbIds = nbNodes; // +1 are for spaces and end of lines int swhToNodeLineLength = SWH_ID_LENGTH + 1 + NODE_ID_LENGTH + 1; int nodeToSwhLineLength = SWH_ID_LENGTH + 1; this.swhToNodeMap = new MapFile(graphPath + Graph.PID_TO_NODE, swhToNodeLineLength); this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_PID, nodeToSwhLineLength); } - // SWH id (string) -> WebGraph node id (long) - // Each line in PID_TO_NODE is formatted as: swhId nodeId - // The file is sorted by swhId, hence we can binary search on swhId to get corresponding nodeId + /** + * Converts SWH PID to corresponding long node id. + * + * @param swhId node represented as a {@link SwhId} + * @return corresponding node as a long id + * @see org.softwareheritage.graph.SwhId + */ public long getNodeId(SwhId swhId) { + // Each line in PID_TO_NODE is formatted as: swhId nodeId + // The file is sorted by swhId, hence we can binary search on swhId to get corresponding nodeId long start = 0; long end = nbIds - 1; while (start <= end) { long lineNumber = (start + end) / 2L; String[] parts = swhToNodeMap.readAtLine(lineNumber).split(" "); if (parts.length != 2) { break; } String currentSwhId = parts[0]; long currentNodeId = Long.parseLong(parts[1]); int cmp = currentSwhId.compareTo(swhId.toString()); if (cmp == 0) { return currentNodeId; } else if (cmp < 0) { start = lineNumber + 1; } else { end = lineNumber - 1; } } throw new IllegalArgumentException("Unknown SWH id: " + swhId); } - // WebGraph node id (long) -> SWH id (string) - // Each line in NODE_TO_PID is formatted as: swhId - // The file is ordered by nodeId, meaning node0's swhId is at line 0, hence we can read the - // nodeId-th line to get corresponding swhId + /** + * Converts a node long id to corresponding SWH PID. + * + * @param nodeId node as a long id + * @return corresponding node as a {@link SwhId} + * @see org.softwareheritage.graph.SwhId + */ public SwhId getSwhId(long nodeId) { + // Each line in NODE_TO_PID is formatted as: swhId + // The file is ordered by nodeId, meaning node0's swhId is at line 0, hence we can read the + // nodeId-th line to get corresponding swhId if (nodeId < 0 || nodeId >= nbIds) { throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); } String swhId = nodeToSwhMap.readAtLine(nodeId); return new SwhId(swhId); } + /** + * Closes the mapping files. + */ public void close() throws IOException { swhToNodeMap.close(); nodeToSwhMap.close(); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java index 143fa65..2a80a67 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java @@ -1,26 +1,51 @@ package org.softwareheritage.graph.backend; import java.io.IOException; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; +/** + * Mapping between long node id and SWH node type as described in the data model. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class NodeTypesMap { + /** + * Array storing for each node its type + * @see it.unimi.dsi.fastutil.longs.LongBigList + */ LongBigList nodeTypesMap; + /** + * Constructor. + * + * @param graphPath full path of the compressed graph + */ public NodeTypesMap(String graphPath) throws IOException { try { nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + Graph.NODE_TO_TYPE); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Unknown class object: " + e); } } + /** + * Returns node type from a node long id. + * + * @param nodeId node as a long id + * @return corresponding {@link Node.Type} value + * @see org.softwareheritage.graph.Node.Type + */ public Node.Type getType(long nodeId) { long type = nodeTypesMap.getLong(nodeId); return Node.Type.fromInt((int) type); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java index 3ef285f..72ada80 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java @@ -1,106 +1,120 @@ package org.softwareheritage.graph.backend; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Writer; import java.util.zip.GZIPInputStream; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectBigArrays; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SwhId; import org.softwareheritage.graph.backend.NodeTypesMap; +/** + * Pre-processing steps (such as dumping mapping files on disk) before running the graph service. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class Setup { public static void main(String[] args) throws IOException { if (args.length != 2) { System.err.println("Expected parameters: "); System.exit(1); } String nodesPath = args[0]; String graphPath = args[1]; System.out.println("Pre-computing node id maps..."); long startTime = System.nanoTime(); precomputeNodeIdMap(nodesPath, graphPath); long endTime = System.nanoTime(); double duration = (double) (endTime - startTime) / 1_000_000_000; System.out.println("Done in: " + duration + " seconds"); } + /** + * Computes and dumps on disk mapping files. + * + * @param nodesPath path of the compressed csv nodes file + * @param graphPath path of the compressed graph + */ // Suppress warning for Object2LongFunction cast @SuppressWarnings("unchecked") static void precomputeNodeIdMap(String nodesPath, String graphPath) throws IOException { // First internal mapping: SWH id (string) -> WebGraph MPH (long) Object2LongFunction mphMap = null; try { mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); } long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); // Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); if (loaded != nbIds) { throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); } // Dump complete mapping for all nodes: SWH id (string) <=> WebGraph node id (long) InputStream nodesStream = new GZIPInputStream(new FileInputStream(nodesPath)); FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(nodesStream, "UTF-8")); LineIterator swhIdIterator = new LineIterator(buffer); try (Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + Graph.PID_TO_NODE)); Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + Graph.NODE_TO_PID))) { // nodeToSwhMap needs to write SWH id in order of node id, so use a temporary array Object[][] nodeToSwhId = ObjectBigArrays.newBigArray(nbIds); // To effectively run edge restriction during graph traversals, we store node id (long) -> SWH // type map. This is represented as a bitmap using minimum number of bits per Node.Type. final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); final int nbBitsPerNodeType = log2NbTypes; LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); for (long iNode = 0; iNode < nbIds && swhIdIterator.hasNext(); iNode++) { String strSwhId = swhIdIterator.next().toString(); long mphId = mphMap.getLong(strSwhId); long nodeId = LongBigArrays.get(bfsMap, mphId); String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); String line = strSwhId + " " + paddedNodeId + "\n"; swhToNodeMap.write(line); ObjectBigArrays.set(nodeToSwhId, nodeId, strSwhId); SwhId swhId = new SwhId(strSwhId); nodeTypesMap.set(nodeId, swhId.getType().ordinal()); } BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); for (long iNode = 0; iNode < nbIds; iNode++) { String line = ObjectBigArrays.get(nodeToSwhId, iNode).toString() + "\n"; nodeToSwhMap.write(line); } } } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/benchmark/LinuxLog.java b/api/server/src/main/java/org/softwareheritage/graph/benchmark/LinuxLog.java index f6e9eae..88faec9 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/benchmark/LinuxLog.java +++ b/api/server/src/main/java/org/softwareheritage/graph/benchmark/LinuxLog.java @@ -1,31 +1,39 @@ package org.softwareheritage.graph.benchmark; import java.io.IOException; import org.softwareheritage.graph.Endpoint; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SwhId; +/** + * Linux git log experiment to benchmark graph traversal. + * + * @author Thibault Allançon + * @version 1.0 + * @since 1.0 + */ + public class LinuxLog { public static void main(String[] args) throws IOException { String path = args[0]; Graph graph = new Graph(path); // A linux kernel commit on Sun Dec 31 final SwhId commit = new SwhId("swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35"); final long expectedCount = 723640; System.out.println("git log " + commit); System.out.println("Expecting " + expectedCount + " commits"); long startTime = System.nanoTime(); Endpoint endpoint = new Endpoint(graph, "forward", "rev:rev"); long count = endpoint.visitNodes(commit).size(); if (count != expectedCount) { throw new IllegalArgumentException("Counted " + count + " commits"); } long endTime = System.nanoTime(); double duration = (double) (endTime - startTime) / 1_000_000_000; System.out.println("Visit operation done in: " + duration + " seconds"); } }