diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java @@ -1,40 +1,48 @@ package org.softwareheritage.graph.maps; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongBigList; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.util.ByteBufferLongBigList; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SWHID; +import java.io.FileInputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; /** * Mapping between internal long node id and external SWHID. *

- * Mappings in both directions are pre-computed and dumped on disk in the {@link NodeMapBuilder} - * class, then they are loaded here using mmap(). + * The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using + * an mmap()-ed .order file containing the graph permutation. + * + * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the + * {@link NodeMapBuilder} class, then it is loaded here using mmap(). * * @author The Software Heritage developers * @see NodeMapBuilder */ public class NodeIdMap { - /** Fixed length of full SWHID */ - public static final int SWHID_LENGTH = 50; - /** Fixed length of long node id */ - public static final int NODE_ID_LENGTH = 20; - /** Fixed length of binary SWHID buffer */ public static final int SWHID_BIN_SIZE = 22; - /** Fixed length of binary node id buffer */ - public static final int NODE_ID_BIN_SIZE = 8; /** Graph path and basename */ String graphPath; /** Number of ids to map */ long nbIds; - /** mmap()-ed SWHID_TO_NODE file */ - MapFile swhToNodeMap; + /** mmap()-ed NODE_TO_SWHID file */ MapFile nodeToSwhMap; + /** Minimal perfect hash (MPH) function SWHID -> initial order */ + Object2LongFunction mph; + /** mmap()-ed long list with the permutation initial order -> graph order */ + LongBigList orderMap; + /** FileInputStream containing the permutation */ + FileInputStream orderInputStream; + /** * Constructor. * @@ -45,46 +53,70 @@ this.graphPath = graphPath; this.nbIds = nbNodes; - this.swhToNodeMap = new MapFile(graphPath + Graph.SWHID_TO_NODE, SWHID_BIN_SIZE + NODE_ID_BIN_SIZE); + // node -> SWHID this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_SWHID, SWHID_BIN_SIZE); + + // SWHID -> node + this.mph = loadMph(graphPath + ".mph"); + this.orderInputStream = new FileInputStream(graphPath + ".order"); + this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel()); + } + + @SuppressWarnings("unchecked") + public static Object2LongFunction loadMph(String path) throws IOException { + Object obj; + try { + obj = BinIO.loadObject(path); + } catch (ClassNotFoundException e) { + throw new IOException(e.getMessage()); + } + + Object2LongFunction res = (Object2LongFunction) obj; + + // Backward-compatibility for old maps parametrized with . + // New maps should be parametrized with , which is faster. + try { + // Try to call it with bytes, will fail if it's a O2LF. + res.getLong("42".getBytes(StandardCharsets.UTF_8)); + } catch (ClassCastException e) { + Object2LongFunction mphLegacy = (Object2LongFunction) obj; + res = (o -> { + byte[] bi = (byte[]) o; + return mphLegacy.getLong(new String(bi, StandardCharsets.UTF_8)); + }); + } + // End of backward-compatibility block + + return res; } /** * Converts SWHID to corresponding long node id. * * @param swhid node represented as a {@link SWHID} + * @param checkExists if true, error if the SWHID is not present in the graph, if false the check + * will be skipped and invalid data will be returned for non-existing SWHIDs. * @return corresponding node as a long id * @see SWHID */ - public long getNodeId(SWHID swhid) { - /* - * The file is sorted by swhid, hence we can binary search on swhid to get corresponding nodeId - */ - long start = 0; - long end = nbIds - 1; - - while (start <= end) { - long lineNumber = (start + end) / 2L; - byte[] buffer = swhToNodeMap.readAtLine(lineNumber); - byte[] swhidBuffer = new byte[SWHID_BIN_SIZE]; - byte[] nodeBuffer = new byte[NODE_ID_BIN_SIZE]; - System.arraycopy(buffer, 0, swhidBuffer, 0, SWHID_BIN_SIZE); - System.arraycopy(buffer, SWHID_BIN_SIZE, nodeBuffer, 0, NODE_ID_BIN_SIZE); - - String currentSWHID = SWHID.fromBytes(swhidBuffer).getSWHID(); - long currentNodeId = java.nio.ByteBuffer.wrap(nodeBuffer).getLong(); - - int cmp = currentSWHID.compareTo(swhid.toString()); - if (cmp == 0) { - return currentNodeId; - } else if (cmp < 0) { - start = lineNumber + 1; - } else { - end = lineNumber - 1; - } + public long getNodeId(SWHID swhid, boolean checkExists) { + // 1. Hash the SWHID with the MPH to get its original ID + long origNodeId = mph.getLong(swhid.toString().getBytes(StandardCharsets.US_ASCII)); + + // 2. Use the order permutation to get the position in the permuted graph + long nodeId = this.orderMap.getLong(origNodeId); + + // 3. Check that the position effectively corresponds to a real node using the reverse map. + // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid. + if (!checkExists || getSWHID(nodeId).equals(swhid)) { + return nodeId; + } else { + throw new IllegalArgumentException("Unknown SWHID: " + swhid); } + } - throw new IllegalArgumentException("Unknown SWHID: " + swhid); + public long getNodeId(SWHID swhid) { + return getNodeId(swhid, true); } /** @@ -110,7 +142,7 @@ * Closes the mapping files. */ public void close() throws IOException { - swhToNodeMap.close(); + orderInputStream.close(); nodeToSwhMap.close(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java --- a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java @@ -4,7 +4,6 @@ import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; -import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.io.FastBufferedReader; @@ -12,6 +11,7 @@ import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.experiments.topology.ConnectedComponents; +import org.softwareheritage.graph.maps.NodeIdMap; import java.io.File; import java.io.IOException; @@ -19,16 +19,11 @@ import java.nio.charset.StandardCharsets; public class ExportSubdataset { - @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast - static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { - return (Object2LongFunction) BinIO.loadObject(mphPath); - } - public static void main(String[] args) throws IOException, ClassNotFoundException { System.err.print("Loading everything..."); String graphPath = args[0]; Graph graph = new Graph(graphPath); - Object2LongFunction mphMap = loadMPH(graphPath + ".mph"); + Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph"); System.err.println(" done."); final long n = graph.numNodes(); diff --git a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java --- a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java @@ -1,10 +1,10 @@ package org.softwareheritage.graph.utils; import com.martiansoftware.jsap.*; -import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; +import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; import java.io.InputStreamReader; @@ -28,16 +28,11 @@ return config; } - @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast - static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { - return (Object2LongFunction) BinIO.loadObject(mphPath); - } - public static void main(String[] args) throws IOException, ClassNotFoundException { JSAPResult config = parse_args(args); String mphPath = config.getString("function"); - Object2LongFunction mphMap = loadMPH(mphPath); + Object2LongFunction mphMap = NodeIdMap.loadMph(mphPath); // TODO: wasteful to convert to/from bytes FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); diff --git a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java --- a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java @@ -7,22 +7,18 @@ import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; +import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class WriteRevisionTimestamps { - @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast - static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { - return (Object2LongFunction) BinIO.loadObject(mphPath); - } - public static void main(String[] args) throws IOException, ClassNotFoundException { System.err.print("Loading everything..."); String graphPath = args[0]; String outputFile = args[1]; - Object2LongFunction mphMap = loadMPH(graphPath + ".mph"); + Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph"); long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); long[][] nodePerm = BinIO.loadLongsBig(graphPath + ".order"); // NodeIdMap nodeIdMap = new NodeIdMap(graphPath, nbIds);