diff --git a/api/server/src/main/java/org/softwareheritage/graph/App.java b/api/server/src/main/java/org/softwareheritage/graph/App.java index da332a0..ef070aa 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/App.java +++ b/api/server/src/main/java/org/softwareheritage/graph/App.java @@ -1,51 +1,57 @@ package org.softwareheritage.graph; import java.io.IOException; import java.util.Optional; import io.javalin.Javalin; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SwhId; import org.softwareheritage.graph.algo.Stats; import org.softwareheritage.graph.algo.Visit; public class App { public static void main(String[] args) throws IOException, Exception { String path = args[0]; Graph graph = new Graph(path); + Runtime.getRuntime().addShutdownHook(new Thread() { + public void run() { + graph.cleanUp(); + } + }); + Javalin app = Javalin.create().start(5010); app.get("/stats/:src_type/:dst_type", ctx -> { try { String srcType = ctx.pathParam("src_type"); String dstType = ctx.pathParam("dst_type"); ctx.json(new Stats(srcType, dstType)); } catch (IllegalArgumentException e) { ctx.status(404); } catch (Exception e) { ctx.status(400); ctx.result(e.toString()); } }); app.get("/visit/:swh_id", ctx -> { try { SwhId start = new SwhId(ctx.pathParam("swh_id")); // By default, traversal is a forward DFS using all edges String algorithm = Optional.ofNullable(ctx.queryParam("traversal")).orElse("dfs"); String direction = Optional.ofNullable(ctx.queryParam("direction")).orElse("forward"); String edges = Optional.ofNullable(ctx.queryParam("edges")).orElse("cnt:dir:rel:rev:snp"); ctx.json(new Visit(graph, start, edges, algorithm, direction)); } catch (IllegalArgumentException e) { ctx.status(400); ctx.result(e.toString()); } }); app.error(404, ctx -> { ctx.result("Not found"); }); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/Graph.java b/api/server/src/main/java/org/softwareheritage/graph/Graph.java index bc4527e..350c5be 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/Graph.java +++ b/api/server/src/main/java/org/softwareheritage/graph/Graph.java @@ -1,73 +1,77 @@ package org.softwareheritage.graph; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.NodeIdMap; import org.softwareheritage.graph.SwhId; public class Graph { BVGraph graph; BVGraph graphTransposed; String path; NodeIdMap nodeIdMap; public Graph(String graphPath) throws Exception { this.graph = BVGraph.load(graphPath); this.graphTransposed = BVGraph.load(graphPath + "-transposed"); this.path = graphPath; - this.nodeIdMap = new NodeIdMap(graphPath); + this.nodeIdMap = new NodeIdMap(graphPath, getNbNodes()); + } + + public void cleanUp() { + nodeIdMap.close(); } public String getPath() { return path; } public long getNode(SwhId swhId) { return nodeIdMap.getNode(swhId); } public SwhId getSwhId(long node) { return nodeIdMap.getSwhId(node); } public long getNbNodes() { return graph.numNodes(); } public long getNbEdges() { return graph.numArcs(); } public LazyLongIterator successors(long node) { return graph.successors(node); } public long outdegree(long node) { return graph.outdegree(node); } public LazyLongIterator predecessors(long node) { return graphTransposed.successors(node); } public long indegree(long node) { return graphTransposed.outdegree(node); } public long degree(long node, boolean isTransposed) { if (isTransposed) { return indegree(node); } else { return outdegree(node); } } public LazyLongIterator neighbors(long node, boolean isTransposed) { if (isTransposed) { return predecessors(node); } else { return successors(node); } } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/NodeIdMap.java b/api/server/src/main/java/org/softwareheritage/graph/NodeIdMap.java index d222a99..cc65ebb 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/NodeIdMap.java +++ b/api/server/src/main/java/org/softwareheritage/graph/NodeIdMap.java @@ -1,23 +1,135 @@ package org.softwareheritage.graph; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.zip.GZIPInputStream; + +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.ObjectBigArrays; +import it.unimi.dsi.io.FastBufferedReader; +import it.unimi.dsi.io.LineIterator; + import org.softwareheritage.graph.SwhId; +import org.softwareheritage.graph.utils.MMapInputFile; +import org.softwareheritage.graph.utils.MMapOutputFile; + +// TODO: +// - Add option to dump or not the mapping +// - Add error handling when node/swh ids not found -// TODO: decide on how to do the disk-based node id map public class NodeIdMap { + private static final int SWH_ID_SIZE = 50; + private static final int NODE_ID_SIZE = 20; + // +1 are for spaces and end of lines + private static final int SWH_TO_NODE_LINE_LENGTH = SWH_ID_SIZE + 1 + NODE_ID_SIZE + 1; + private static final int NODE_TO_SWH_LINE_LENGTH = SWH_ID_SIZE + 1; + String graphPath; + long nbIds; + MMapInputFile swhToNodeMap; + MMapInputFile nodeToSwhMap; - public NodeIdMap(String graphPath) { + public NodeIdMap(String graphPath, long nbNodes) { this.graphPath = graphPath; + this.nbIds = nbNodes; + + try { + dump(); + } catch (Exception e) { + System.out.println("Could not dump mapping: " + e); + } + + this.swhToNodeMap = new MMapInputFile(graphPath + ".swhToNodeMap.csv", SWH_TO_NODE_LINE_LENGTH); + this.nodeToSwhMap = new MMapInputFile(graphPath + ".nodeToSwhMap.csv", NODE_TO_SWH_LINE_LENGTH); } + // SWH id (string) -> WebGraph node id (long) + // Each line in .swhToNode.csv is formatted as: swhId nodeId + // The file is sorted by swhId, hence we can binary search on swhId to get corresponding nodeId public long getNode(SwhId swhId) { - return 42; + long start = 0; + long end = nbIds; + + while (start <= end) { + long lineNumber = (start + end) / 2L; + String[] parts = swhToNodeMap.readLine(lineNumber).split(" "); + String currentSwhId = parts[0]; + long currentNodeId = Long.parseLong(parts[1]); + + int cmp = currentSwhId.compareTo(swhId.toString()); + if (cmp == 0) { + return currentNodeId; + } else if (cmp < 0) { + start = lineNumber + 1; + } else { + end = lineNumber - 1; + } + } + + return -1; } + // WebGraph node id (long) -> SWH id (string) + // Each line in .nodeToSwh.csv is formatted as: swhId + // The file is ordered by nodeId, meaning node0's swhId is at line 0, hence we can read the + // nodeId-th line to get corresponding swhId public SwhId getSwhId(long node) { - return null; + String swhId = nodeToSwhMap.readLine(node); + return new SwhId(swhId); + } + + void dump() throws ClassNotFoundException, IOException { + // First internal mapping: SWH id (string) -> WebGraph MPH (long) + @SuppressWarnings("unchecked") + Object2LongFunction mphMap = + (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); + + // Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) + long[][] bfsMap = LongBigArrays.newBigArray(nbIds); + long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); + if (loaded != nbIds) { + throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); + } + + // Dump complete mapping for all nodes: SWH id (string) <=> WebGraph node id (long) + MMapOutputFile swhToNodeMapOut = + new MMapOutputFile(graphPath + ".swhToNodeMap.csv", SWH_TO_NODE_LINE_LENGTH, nbIds); + MMapOutputFile nodeToSwhMapOut = + new MMapOutputFile(graphPath + ".nodeToSwhMap.csv", NODE_TO_SWH_LINE_LENGTH, nbIds); + + InputStream nodeFile = new GZIPInputStream(new FileInputStream(graphPath + ".nodes.csv.gz")); + FastBufferedReader fileBuffer = new FastBufferedReader(new InputStreamReader(nodeFile, "UTF-8")); + LineIterator lineIterator = new LineIterator(fileBuffer); + + for (long iNode = 0; iNode < nbIds && lineIterator.hasNext(); iNode++) { + String swhId = lineIterator.next().toString(); + long mphId = mphMap.getLong(swhId); + long nodeId = LongBigArrays.get(bfsMap, mphId); + + { + String paddedNodeId = String.format("%0" + NODE_ID_SIZE + "d", nodeId); + String line = swhId + " " + paddedNodeId + "\n"; + long lineIndex = iNode; + swhToNodeMapOut.writeLine(line, lineIndex); + } + + { + String line = swhId + "\n"; + long lineIndex = nodeId; + nodeToSwhMapOut.writeLine(line, lineIndex); + } + } + + swhToNodeMapOut.close(); + nodeToSwhMapOut.close(); } - public void dump() { + public void close() { + swhToNodeMap.close(); + nodeToSwhMap.close(); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/utils/MMapInputFile.java b/api/server/src/main/java/org/softwareheritage/graph/utils/MMapInputFile.java new file mode 100644 index 0000000..428f33c --- /dev/null +++ b/api/server/src/main/java/org/softwareheritage/graph/utils/MMapInputFile.java @@ -0,0 +1,42 @@ +package org.softwareheritage.graph.utils; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.channels.FileChannel; + +import it.unimi.dsi.io.ByteBufferInputStream; + +public class MMapInputFile { + ByteBufferInputStream bufferMap; + int lineLength; + + public MMapInputFile(String path, int lineLength) { + this.bufferMap = null; + this.lineLength = lineLength; + + try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "r")) { + FileChannel fileChannel = mapFile.getChannel(); + bufferMap = ByteBufferInputStream.map(fileChannel, FileChannel.MapMode.READ_ONLY); + } catch (IOException e) { + System.out.println("Could not load MMapInputFile " + path + ": " + e); + } + } + + public String readLine(long lineIndex) { + byte[] buffer = new byte[lineLength]; + long position = lineIndex * (long) lineLength; + bufferMap.position(position); + bufferMap.read(buffer, 0, lineLength); + String line = new String(buffer); + return line.trim(); + } + + public void close() { + try { + bufferMap.close(); + } catch (IOException e) { + System.out.println("Could not close MMapInputFile: " + e); + } + } +} diff --git a/api/server/src/main/java/org/softwareheritage/graph/utils/MMapOutputFile.java b/api/server/src/main/java/org/softwareheritage/graph/utils/MMapOutputFile.java new file mode 100644 index 0000000..a220aa9 --- /dev/null +++ b/api/server/src/main/java/org/softwareheritage/graph/utils/MMapOutputFile.java @@ -0,0 +1,41 @@ +package org.softwareheritage.graph.utils; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.channels.FileChannel; + +import it.unimi.dsi.io.ByteBufferOutputStream; + +public class MMapOutputFile { + ByteBufferOutputStream bufferMap; + int lineLength; + + public MMapOutputFile(String path, int lineLength, long nbLines) { + this.bufferMap = null; + this.lineLength = lineLength; + long totalSize = nbLines * lineLength; + + try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "rw")) { + FileChannel fileChannel = mapFile.getChannel(); + bufferMap = + ByteBufferOutputStream.map(fileChannel, totalSize, FileChannel.MapMode.READ_WRITE); + } catch (IOException e) { + System.out.println("Could not load MMapOutputFile " + path + ": " + e); + } + } + + public void writeLine(String line, long lineIndex) { + long position = lineIndex * (long) lineLength; + bufferMap.position(position); + bufferMap.write(line.getBytes(), 0, lineLength); + } + + public void close() { + try { + bufferMap.close(); + } catch (IOException e) { + System.out.println("Could not close MMapOutputFile: " + e); + } + } +}