diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java @@ -1,40 +1,49 @@ package org.softwareheritage.graph.maps; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongBigList; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.util.ByteBufferLongBigList; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SWHID; +import java.io.FileInputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; /** * Mapping between internal long node id and external SWHID. *

- * Mappings in both directions are pre-computed and dumped on disk in the {@link NodeMapBuilder} - * class, then they are loaded here using mmap(). + * The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, + * then permuting it using an mmap()-ed .order file containing the graph + * permutation. + * + * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the + * {@link NodeMapBuilder} class, then it is loaded here using mmap(). * * @author The Software Heritage developers * @see NodeMapBuilder */ public class NodeIdMap { - /** Fixed length of full SWHID */ - public static final int SWHID_LENGTH = 50; - /** Fixed length of long node id */ - public static final int NODE_ID_LENGTH = 20; - /** Fixed length of binary SWHID buffer */ public static final int SWHID_BIN_SIZE = 22; - /** Fixed length of binary node id buffer */ - public static final int NODE_ID_BIN_SIZE = 8; /** Graph path and basename */ String graphPath; /** Number of ids to map */ long nbIds; - /** mmap()-ed SWHID_TO_NODE file */ - MapFile swhToNodeMap; + /** mmap()-ed NODE_TO_SWHID file */ MapFile nodeToSwhMap; + /** Minimal perfect hash (MPH) function SWHID -> initial order */ + Object2LongFunction mph; + /** mmap()-ed long list with the permutation initial order -> graph order */ + LongBigList orderMap; + /** FileInputStream containing the permutation */ + FileInputStream orderInputStream; + /** * Constructor. * @@ -45,46 +54,51 @@ this.graphPath = graphPath; this.nbIds = nbNodes; - this.swhToNodeMap = new MapFile(graphPath + Graph.SWHID_TO_NODE, SWHID_BIN_SIZE + NODE_ID_BIN_SIZE); + // node -> SWHID this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_SWHID, SWHID_BIN_SIZE); + + // SWHID -> node + this.mph = loadMph(graphPath + ".mph"); + this.orderInputStream = new FileInputStream(graphPath + ".order"); + this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel()); + } + + @SuppressWarnings("unchecked") + private Object2LongFunction loadMph(String path) throws IOException { + try { + return (Object2LongFunction) BinIO.loadObject(path); + } catch (ClassNotFoundException e) { + throw new IOException(e.getMessage()); + } } /** * Converts SWHID to corresponding long node id. * * @param swhid node represented as a {@link SWHID} + * @param checkExists if true, error if the SWHID is not present in the graph, if false the check + * will be skipped and invalid data will be returned for non-existing SWHIDs. * @return corresponding node as a long id * @see SWHID */ - public long getNodeId(SWHID swhid) { - /* - * The file is sorted by swhid, hence we can binary search on swhid to get corresponding nodeId - */ - long start = 0; - long end = nbIds - 1; - - while (start <= end) { - long lineNumber = (start + end) / 2L; - byte[] buffer = swhToNodeMap.readAtLine(lineNumber); - byte[] swhidBuffer = new byte[SWHID_BIN_SIZE]; - byte[] nodeBuffer = new byte[NODE_ID_BIN_SIZE]; - System.arraycopy(buffer, 0, swhidBuffer, 0, SWHID_BIN_SIZE); - System.arraycopy(buffer, SWHID_BIN_SIZE, nodeBuffer, 0, NODE_ID_BIN_SIZE); - - String currentSWHID = SWHID.fromBytes(swhidBuffer).getSWHID(); - long currentNodeId = java.nio.ByteBuffer.wrap(nodeBuffer).getLong(); - - int cmp = currentSWHID.compareTo(swhid.toString()); - if (cmp == 0) { - return currentNodeId; - } else if (cmp < 0) { - start = lineNumber + 1; - } else { - end = lineNumber - 1; - } + public long getNodeId(SWHID swhid, boolean checkExists) { + // 1. Hash the SWHID with the MPH to get its original ID + long origNodeId = mph.getLong(swhid.toString().getBytes(StandardCharsets.US_ASCII)); + + // 2. Use the order permutation to get the position in the permuted graph + long nodeId = this.orderMap.getLong(origNodeId); + + // 3. Check that the position effectively corresponds to a real node using the reverse map. + // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid. + if (!checkExists || getSWHID(nodeId).equals(swhid)) { + return nodeId; + } else { + throw new IllegalArgumentException("Unknown SWHID: " + swhid); } + } - throw new IllegalArgumentException("Unknown SWHID: " + swhid); + public long getNodeId(SWHID swhid) { + return getNodeId(swhid, true); } /** @@ -110,7 +124,7 @@ * Closes the mapping files. */ public void close() throws IOException { - swhToNodeMap.close(); + orderInputStream.close(); nodeToSwhMap.close(); } }