diff --git a/api/server/src/main/java/org/softwareheritage/graph/Graph.java b/api/server/src/main/java/org/softwareheritage/graph/Graph.java index 18c4100..8143c9b 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/Graph.java +++ b/api/server/src/main/java/org/softwareheritage/graph/Graph.java @@ -1,71 +1,71 @@ package org.softwareheritage.graph; import java.io.IOException; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; -import org.softwareheritage.graph.NodeIdMap; import org.softwareheritage.graph.SwhId; +import org.softwareheritage.graph.backend.NodeIdMap; public class Graph { BVGraph graph; BVGraph graphTransposed; String path; NodeIdMap nodeIdMap; public Graph(String path) throws IOException { this.graph = BVGraph.load(path); this.graphTransposed = BVGraph.load(path + "-transposed"); this.path = path; this.nodeIdMap = new NodeIdMap(path, getNbNodes()); } public void cleanUp() throws IOException { nodeIdMap.close(); } public String getPath() { return path; } public long getNodeId(SwhId swhId) { return nodeIdMap.getNodeId(swhId); } public SwhId getSwhId(long nodeId) { return nodeIdMap.getSwhId(nodeId); } public long getNbNodes() { return graph.numNodes(); } public long getNbEdges() { return graph.numArcs(); } public LazyLongIterator successors(long nodeId) { return graph.successors(nodeId); } public long outdegree(long nodeId) { return graph.outdegree(nodeId); } public LazyLongIterator predecessors(long nodeId) { return graphTransposed.successors(nodeId); } public long indegree(long nodeId) { return graphTransposed.outdegree(nodeId); } public long degree(long nodeId, boolean useTransposed) { return (useTransposed) ? indegree(nodeId) : outdegree(nodeId); } public LazyLongIterator neighbors(long nodeId, boolean useTransposed) { return (useTransposed) ? predecessors(nodeId) : successors(nodeId); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/NodeIdMap.java b/api/server/src/main/java/org/softwareheritage/graph/NodeIdMap.java deleted file mode 100644 index b347513..0000000 --- a/api/server/src/main/java/org/softwareheritage/graph/NodeIdMap.java +++ /dev/null @@ -1,142 +0,0 @@ -package org.softwareheritage.graph; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.zip.GZIPInputStream; - -import it.unimi.dsi.fastutil.io.BinIO; -import it.unimi.dsi.fastutil.longs.LongBigArrays; -import it.unimi.dsi.fastutil.objects.Object2LongFunction; -import it.unimi.dsi.fastutil.objects.ObjectBigArrays; -import it.unimi.dsi.io.FastBufferedReader; -import it.unimi.dsi.io.LineIterator; - -import org.softwareheritage.graph.SwhId; -import org.softwareheritage.graph.utils.MMapInputFile; -import org.softwareheritage.graph.utils.MMapOutputFile; - -// TODO: -// - Add option to dump or not the mapping - -public class NodeIdMap { - private static final int SWH_ID_SIZE = 50; - private static final int NODE_ID_SIZE = 20; - // +1 are for spaces and end of lines - private static final int SWH_TO_NODE_LINE_LENGTH = SWH_ID_SIZE + 1 + NODE_ID_SIZE + 1; - private static final int NODE_TO_SWH_LINE_LENGTH = SWH_ID_SIZE + 1; - - String graphPath; - long nbIds; - MMapInputFile swhToNodeMap; - MMapInputFile nodeToSwhMap; - - public NodeIdMap(String graphPath, long nbNodes) throws IOException { - this.graphPath = graphPath; - this.nbIds = nbNodes; - - dump(); - - this.swhToNodeMap = new MMapInputFile(graphPath + ".swhToNodeMap.csv", SWH_TO_NODE_LINE_LENGTH); - this.nodeToSwhMap = new MMapInputFile(graphPath + ".nodeToSwhMap.csv", NODE_TO_SWH_LINE_LENGTH); - } - - // SWH id (string) -> WebGraph node id (long) - // Each line in .swhToNode.csv is formatted as: swhId nodeId - // The file is sorted by swhId, hence we can binary search on swhId to get corresponding nodeId - public long getNodeId(SwhId swhId) { - long start = 0; - long end = nbIds - 1; - - while (start <= end) { - long lineNumber = (start + end) / 2L; - String[] parts = swhToNodeMap.readAtLine(lineNumber).split(" "); - if (parts.length != 2) { - break; - } - - String currentSwhId = parts[0]; - long currentNodeId = Long.parseLong(parts[1]); - - int cmp = currentSwhId.compareTo(swhId.toString()); - if (cmp == 0) { - return currentNodeId; - } else if (cmp < 0) { - start = lineNumber + 1; - } else { - end = lineNumber - 1; - } - } - - throw new IllegalArgumentException("Unknown SWH id: " + swhId); - } - - // WebGraph node id (long) -> SWH id (string) - // Each line in .nodeToSwh.csv is formatted as: swhId - // The file is ordered by nodeId, meaning node0's swhId is at line 0, hence we can read the - // nodeId-th line to get corresponding swhId - public SwhId getSwhId(long nodeId) { - if (nodeId < 0 || nodeId >= nbIds) { - throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); - } - - String swhId = nodeToSwhMap.readAtLine(nodeId); - return new SwhId(swhId); - } - - @SuppressWarnings("unchecked") - void dump() throws IOException { - // First internal mapping: SWH id (string) -> WebGraph MPH (long) - Object2LongFunction mphMap = null; - try { - mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); - } catch (ClassNotFoundException e) { - throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); - } - - // Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) - long[][] bfsMap = LongBigArrays.newBigArray(nbIds); - long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); - if (loaded != nbIds) { - throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); - } - - // Dump complete mapping for all nodes: SWH id (string) <=> WebGraph node id (long) - MMapOutputFile swhToNodeMapOut = - new MMapOutputFile(graphPath + ".swhToNodeMap.csv", SWH_TO_NODE_LINE_LENGTH, nbIds); - MMapOutputFile nodeToSwhMapOut = - new MMapOutputFile(graphPath + ".nodeToSwhMap.csv", NODE_TO_SWH_LINE_LENGTH, nbIds); - - InputStream nodeFile = new GZIPInputStream(new FileInputStream(graphPath + ".nodes.csv.gz")); - FastBufferedReader fileBuffer = new FastBufferedReader(new InputStreamReader(nodeFile, "UTF-8")); - LineIterator lineIterator = new LineIterator(fileBuffer); - - for (long iNode = 0; iNode < nbIds && lineIterator.hasNext(); iNode++) { - String swhId = lineIterator.next().toString(); - long mphId = mphMap.getLong(swhId); - long nodeId = LongBigArrays.get(bfsMap, mphId); - - { - String paddedNodeId = String.format("%0" + NODE_ID_SIZE + "d", nodeId); - String line = swhId + " " + paddedNodeId + "\n"; - long lineIndex = iNode; - swhToNodeMapOut.writeAtLine(line, lineIndex); - } - - { - String line = swhId + "\n"; - long lineIndex = nodeId; - nodeToSwhMapOut.writeAtLine(line, lineIndex); - } - } - - swhToNodeMapOut.close(); - nodeToSwhMapOut.close(); - } - - public void close() throws IOException { - swhToNodeMap.close(); - nodeToSwhMap.close(); - } -} diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java new file mode 100644 index 0000000..fe887d7 --- /dev/null +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java @@ -0,0 +1,74 @@ +package org.softwareheritage.graph.backend; + +import java.io.IOException; + +import org.softwareheritage.graph.SwhId; +import org.softwareheritage.graph.backend.utils.MMapInputFile; + +public class NodeIdMap { + public static final int SWH_ID_LENGTH = 40; + public static final int NODE_ID_LENGTH = 20; + // +1 are for spaces and end of lines + public static final int SWH_TO_NODE_LINE_LENGTH = SWH_ID_LENGTH + 1 + NODE_ID_LENGTH + 1; + public static final int NODE_TO_SWH_LINE_LENGTH = SWH_ID_LENGTH + 1; + + String graphPath; + long nbIds; + MMapInputFile swhToNodeMap; + MMapInputFile nodeToSwhMap; + + public NodeIdMap(String graphPath, long nbNodes) throws IOException { + this.graphPath = graphPath; + this.nbIds = nbNodes; + this.swhToNodeMap = new MMapInputFile(graphPath + ".swhToNodeMap.csv", SWH_TO_NODE_LINE_LENGTH); + this.nodeToSwhMap = new MMapInputFile(graphPath + ".nodeToSwhMap.csv", NODE_TO_SWH_LINE_LENGTH); + } + + // SWH id (string) -> WebGraph node id (long) + // Each line in .swhToNode.csv is formatted as: swhId nodeId + // The file is sorted by swhId, hence we can binary search on swhId to get corresponding nodeId + public long getNodeId(SwhId swhId) { + long start = 0; + long end = nbIds - 1; + + while (start <= end) { + long lineNumber = (start + end) / 2L; + String[] parts = swhToNodeMap.readAtLine(lineNumber).split(" "); + if (parts.length != 2) { + break; + } + + String currentSwhId = parts[0]; + long currentNodeId = Long.parseLong(parts[1]); + + int cmp = currentSwhId.compareTo(swhId.toString()); + if (cmp == 0) { + return currentNodeId; + } else if (cmp < 0) { + start = lineNumber + 1; + } else { + end = lineNumber - 1; + } + } + + throw new IllegalArgumentException("Unknown SWH id: " + swhId); + } + + // WebGraph node id (long) -> SWH id (string) + // Each line in .nodeToSwh.csv is formatted as: swhId + // The file is ordered by nodeId, meaning node0's swhId is at line 0, hence we can read the + // nodeId-th line to get corresponding swhId + public SwhId getSwhId(long nodeId) { + if (nodeId < 0 || nodeId >= nbIds) { + throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); + } + + String swhId = nodeToSwhMap.readAtLine(nodeId); + return new SwhId(swhId); + } + + public void close() throws IOException { + swhToNodeMap.close(); + nodeToSwhMap.close(); + } +} diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java new file mode 100644 index 0000000..2c81622 --- /dev/null +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java @@ -0,0 +1,82 @@ +package org.softwareheritage.graph.backend; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.zip.GZIPInputStream; + +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.io.FastBufferedReader; +import it.unimi.dsi.io.LineIterator; + +import org.softwareheritage.graph.backend.NodeIdMap; +import org.softwareheritage.graph.backend.utils.MMapOutputFile; + +public class Setup { + public static void main(String[] args) throws IOException { + String graphPath = args[0]; + + System.out.println("Pre-computing node id maps..."); + long startTime = System.nanoTime(); + precomputeNodeIdMap(graphPath); + long endTime = System.nanoTime(); + double duration = (double) (endTime - startTime) / 1_000_000_000; + System.out.println("Done in: " + duration + " seconds"); + } + + // Suppress warning for Object2LongFunction cast + @SuppressWarnings("unchecked") + static void precomputeNodeIdMap(String graphPath) throws IOException { + // First internal mapping: SWH id (string) -> WebGraph MPH (long) + Object2LongFunction mphMap = null; + try { + mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); + } + long nbIds = mphMap.size(); + + // Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) + long[][] bfsMap = LongBigArrays.newBigArray(nbIds); + long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); + if (loaded != nbIds) { + throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); + } + + // Dump complete mapping for all nodes: SWH id (string) <=> WebGraph node id (long) + MMapOutputFile swhToNodeMap = new MMapOutputFile( + graphPath + ".swhToNodeMap.csv", NodeIdMap.SWH_TO_NODE_LINE_LENGTH, nbIds); + MMapOutputFile nodeToSwhMap = new MMapOutputFile( + graphPath + ".nodeToSwhMap.csv", NodeIdMap.NODE_TO_SWH_LINE_LENGTH, nbIds); + + InputStream nodeFile = new GZIPInputStream(new FileInputStream(graphPath + ".nodes.csv.gz")); + FastBufferedReader fileBuffer = + new FastBufferedReader(new InputStreamReader(nodeFile, "UTF-8")); + LineIterator lineIterator = new LineIterator(fileBuffer); + + for (long iNode = 0; iNode < nbIds && lineIterator.hasNext(); iNode++) { + String swhId = lineIterator.next().toString(); + long mphId = mphMap.getLong(swhId); + long nodeId = LongBigArrays.get(bfsMap, mphId); + + { + String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); + String line = swhId + " " + paddedNodeId + "\n"; + long lineIndex = iNode; + swhToNodeMap.writeAtLine(line, lineIndex); + } + + { + String line = swhId + "\n"; + long lineIndex = nodeId; + nodeToSwhMap.writeAtLine(line, lineIndex); + } + } + + swhToNodeMap.close(); + nodeToSwhMap.close(); + } +} diff --git a/api/server/src/main/java/org/softwareheritage/graph/utils/MMapInputFile.java b/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapInputFile.java similarity index 95% rename from api/server/src/main/java/org/softwareheritage/graph/utils/MMapInputFile.java rename to api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapInputFile.java index 154c69b..d7e26c4 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/utils/MMapInputFile.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapInputFile.java @@ -1,36 +1,36 @@ -package org.softwareheritage.graph.utils; +package org.softwareheritage.graph.backend.utils; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import it.unimi.dsi.io.ByteBufferInputStream; public class MMapInputFile { ByteBufferInputStream bufferMap; int lineLength; public MMapInputFile(String path, int lineLength) throws IOException { this.bufferMap = null; this.lineLength = lineLength; try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "r")) { FileChannel fileChannel = mapFile.getChannel(); bufferMap = ByteBufferInputStream.map(fileChannel, FileChannel.MapMode.READ_ONLY); } } public String readAtLine(long lineIndex) { byte[] buffer = new byte[lineLength]; long position = lineIndex * (long) lineLength; bufferMap.position(position); bufferMap.read(buffer, 0, lineLength); String line = new String(buffer); return line.trim(); } public void close() throws IOException { bufferMap.close(); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/utils/MMapOutputFile.java b/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapOutputFile.java similarity index 95% rename from api/server/src/main/java/org/softwareheritage/graph/utils/MMapOutputFile.java rename to api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapOutputFile.java index 38828aa..369c176 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/utils/MMapOutputFile.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapOutputFile.java @@ -1,35 +1,35 @@ -package org.softwareheritage.graph.utils; +package org.softwareheritage.graph.backend.utils; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import it.unimi.dsi.io.ByteBufferOutputStream; public class MMapOutputFile { ByteBufferOutputStream bufferMap; int lineLength; public MMapOutputFile(String path, int lineLength, long nbLines) throws IOException { this.bufferMap = null; this.lineLength = lineLength; long totalSize = nbLines * lineLength; try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "rw")) { FileChannel fileChannel = mapFile.getChannel(); bufferMap = ByteBufferOutputStream.map(fileChannel, totalSize, FileChannel.MapMode.READ_WRITE); } } public void writeAtLine(String line, long lineIndex) { long position = lineIndex * (long) lineLength; bufferMap.position(position); bufferMap.write(line.getBytes(), 0, lineLength); } public void close() throws IOException { bufferMap.close(); } }