diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapInputFile.java b/api/server/src/main/java/org/softwareheritage/graph/backend/MapFile.java similarity index 85% rename from api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapInputFile.java rename to api/server/src/main/java/org/softwareheritage/graph/backend/MapFile.java index d7e26c4..de3f8ad 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapInputFile.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/MapFile.java @@ -1,36 +1,36 @@ -package org.softwareheritage.graph.backend.utils; +package org.softwareheritage.graph.backend; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import it.unimi.dsi.io.ByteBufferInputStream; -public class MMapInputFile { +public class MapFile { ByteBufferInputStream bufferMap; int lineLength; - public MMapInputFile(String path, int lineLength) throws IOException { + public MapFile(String path, int lineLength) throws IOException { this.bufferMap = null; this.lineLength = lineLength; try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "r")) { FileChannel fileChannel = mapFile.getChannel(); bufferMap = ByteBufferInputStream.map(fileChannel, FileChannel.MapMode.READ_ONLY); } } public String readAtLine(long lineIndex) { byte[] buffer = new byte[lineLength]; long position = lineIndex * (long) lineLength; bufferMap.position(position); bufferMap.read(buffer, 0, lineLength); String line = new String(buffer); return line.trim(); } public void close() throws IOException { bufferMap.close(); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java index 3beacd3..fcca1ee 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java @@ -1,74 +1,75 @@ package org.softwareheritage.graph.backend; import java.io.IOException; import org.softwareheritage.graph.SwhId; -import org.softwareheritage.graph.backend.utils.MMapInputFile; +import org.softwareheritage.graph.backend.MapFile; public class NodeIdMap { public static final int SWH_ID_LENGTH = 50; public static final int NODE_ID_LENGTH = 20; - // +1 are for spaces and end of lines - public static final int SWH_TO_NODE_LINE_LENGTH = SWH_ID_LENGTH + 1 + NODE_ID_LENGTH + 1; - public static final int NODE_TO_SWH_LINE_LENGTH = SWH_ID_LENGTH + 1; String graphPath; long nbIds; - MMapInputFile swhToNodeMap; - MMapInputFile nodeToSwhMap; + MapFile swhToNodeMap; + MapFile nodeToSwhMap; public NodeIdMap(String graphPath, long nbNodes) throws IOException { this.graphPath = graphPath; this.nbIds = nbNodes; - this.swhToNodeMap = new MMapInputFile(graphPath + ".swhToNodeMap.csv", SWH_TO_NODE_LINE_LENGTH); - this.nodeToSwhMap = new MMapInputFile(graphPath + ".nodeToSwhMap.csv", NODE_TO_SWH_LINE_LENGTH); + + // +1 are for spaces and end of lines + int swhToNodeLineLength = SWH_ID_LENGTH + 1 + NODE_ID_LENGTH + 1; + int nodeToSwhLineLength = SWH_ID_LENGTH + 1; + this.swhToNodeMap = new MapFile(graphPath + ".swhToNodeMap.csv", swhToNodeLineLength); + this.nodeToSwhMap = new MapFile(graphPath + ".nodeToSwhMap.csv", nodeToSwhLineLength); } // SWH id (string) -> WebGraph node id (long) // Each line in .swhToNode.csv is formatted as: swhId nodeId // The file is sorted by swhId, hence we can binary search on swhId to get corresponding nodeId public long getNodeId(SwhId swhId) { long start = 0; long end = nbIds - 1; while (start <= end) { long lineNumber = (start + end) / 2L; String[] parts = swhToNodeMap.readAtLine(lineNumber).split(" "); if (parts.length != 2) { break; } String currentSwhId = parts[0]; long currentNodeId = Long.parseLong(parts[1]); int cmp = currentSwhId.compareTo(swhId.toString()); if (cmp == 0) { return currentNodeId; } else if (cmp < 0) { start = lineNumber + 1; } else { end = lineNumber - 1; } } throw new IllegalArgumentException("Unknown SWH id: " + swhId); } // WebGraph node id (long) -> SWH id (string) // Each line in .nodeToSwh.csv is formatted as: swhId // The file is ordered by nodeId, meaning node0's swhId is at line 0, hence we can read the // nodeId-th line to get corresponding swhId public SwhId getSwhId(long nodeId) { if (nodeId < 0 || nodeId >= nbIds) { throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); } String swhId = nodeToSwhMap.readAtLine(nodeId); return new SwhId(swhId); } public void close() throws IOException { swhToNodeMap.close(); nodeToSwhMap.close(); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java index 2c81622..42ab809 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java @@ -1,82 +1,82 @@ package org.softwareheritage.graph.backend; +import java.io.BufferedWriter; import java.io.FileInputStream; +import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.Writer; import java.util.zip.GZIPInputStream; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.ObjectBigArrays; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import org.softwareheritage.graph.backend.NodeIdMap; -import org.softwareheritage.graph.backend.utils.MMapOutputFile; public class Setup { public static void main(String[] args) throws IOException { String graphPath = args[0]; System.out.println("Pre-computing node id maps..."); long startTime = System.nanoTime(); precomputeNodeIdMap(graphPath); long endTime = System.nanoTime(); double duration = (double) (endTime - startTime) / 1_000_000_000; System.out.println("Done in: " + duration + " seconds"); } // Suppress warning for Object2LongFunction cast @SuppressWarnings("unchecked") static void precomputeNodeIdMap(String graphPath) throws IOException { // First internal mapping: SWH id (string) -> WebGraph MPH (long) Object2LongFunction mphMap = null; try { mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); } long nbIds = mphMap.size(); // Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); if (loaded != nbIds) { throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); } // Dump complete mapping for all nodes: SWH id (string) <=> WebGraph node id (long) - MMapOutputFile swhToNodeMap = new MMapOutputFile( - graphPath + ".swhToNodeMap.csv", NodeIdMap.SWH_TO_NODE_LINE_LENGTH, nbIds); - MMapOutputFile nodeToSwhMap = new MMapOutputFile( - graphPath + ".nodeToSwhMap.csv", NodeIdMap.NODE_TO_SWH_LINE_LENGTH, nbIds); InputStream nodeFile = new GZIPInputStream(new FileInputStream(graphPath + ".nodes.csv.gz")); - FastBufferedReader fileBuffer = - new FastBufferedReader(new InputStreamReader(nodeFile, "UTF-8")); - LineIterator lineIterator = new LineIterator(fileBuffer); + FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(nodeFile, "UTF-8")); + LineIterator swhIdIterator = new LineIterator(buffer); - for (long iNode = 0; iNode < nbIds && lineIterator.hasNext(); iNode++) { - String swhId = lineIterator.next().toString(); - long mphId = mphMap.getLong(swhId); - long nodeId = LongBigArrays.get(bfsMap, mphId); + try ( + Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + ".swhToNodeMap.csv")); + Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + ".nodeToSwhMap.csv"))) { + // nodeToSwhMap needs to write SWH id in order of node id, so use a temporary array + Object[][] nodeToSwhId = ObjectBigArrays.newBigArray(nbIds); + + for (long iNode = 0; iNode < nbIds && swhIdIterator.hasNext(); iNode++) { + String swhId = swhIdIterator.next().toString(); + long mphId = mphMap.getLong(swhId); + long nodeId = LongBigArrays.get(bfsMap, mphId); - { String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); String line = swhId + " " + paddedNodeId + "\n"; - long lineIndex = iNode; - swhToNodeMap.writeAtLine(line, lineIndex); + swhToNodeMap.write(line); + + ObjectBigArrays.set(nodeToSwhId, nodeId, swhId); } - { - String line = swhId + "\n"; - long lineIndex = nodeId; - nodeToSwhMap.writeAtLine(line, lineIndex); + for (long iNode = 0; iNode < nbIds; iNode++) { + String line = ObjectBigArrays.get(nodeToSwhId, iNode).toString() + "\n"; + nodeToSwhMap.write(line); } } - - swhToNodeMap.close(); - nodeToSwhMap.close(); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapOutputFile.java b/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapOutputFile.java deleted file mode 100644 index 369c176..0000000 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/utils/MMapOutputFile.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.softwareheritage.graph.backend.utils; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; - -import it.unimi.dsi.io.ByteBufferOutputStream; - -public class MMapOutputFile { - ByteBufferOutputStream bufferMap; - int lineLength; - - public MMapOutputFile(String path, int lineLength, long nbLines) throws IOException { - this.bufferMap = null; - this.lineLength = lineLength; - long totalSize = nbLines * lineLength; - - try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "rw")) { - FileChannel fileChannel = mapFile.getChannel(); - bufferMap = - ByteBufferOutputStream.map(fileChannel, totalSize, FileChannel.MapMode.READ_WRITE); - } - } - - public void writeAtLine(String line, long lineIndex) { - long position = lineIndex * (long) lineLength; - bufferMap.position(position); - bufferMap.write(line.getBytes(), 0, lineLength); - } - - public void close() throws IOException { - bufferMap.close(); - } -}