Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9749568
D5427.id19580.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
D5427.id19580.diff
View Options
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
--- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
@@ -1,40 +1,48 @@
package org.softwareheritage.graph.maps;
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.fastutil.longs.LongBigList;
+import it.unimi.dsi.fastutil.objects.Object2LongFunction;
+import it.unimi.dsi.util.ByteBufferLongBigList;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.SWHID;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
/**
* Mapping between internal long node id and external SWHID.
* <p>
- * Mappings in both directions are pre-computed and dumped on disk in the {@link NodeMapBuilder}
- * class, then they are loaded here using mmap().
+ * The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using
+ * an mmap()-ed .order file containing the graph permutation.
+ *
+ * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the
+ * {@link NodeMapBuilder} class, then it is loaded here using mmap().
*
* @author The Software Heritage developers
* @see NodeMapBuilder
*/
public class NodeIdMap {
- /** Fixed length of full SWHID */
- public static final int SWHID_LENGTH = 50;
- /** Fixed length of long node id */
- public static final int NODE_ID_LENGTH = 20;
-
/** Fixed length of binary SWHID buffer */
public static final int SWHID_BIN_SIZE = 22;
- /** Fixed length of binary node id buffer */
- public static final int NODE_ID_BIN_SIZE = 8;
/** Graph path and basename */
String graphPath;
/** Number of ids to map */
long nbIds;
- /** mmap()-ed SWHID_TO_NODE file */
- MapFile swhToNodeMap;
+
/** mmap()-ed NODE_TO_SWHID file */
MapFile nodeToSwhMap;
+ /** Minimal perfect hash (MPH) function SWHID -> initial order */
+ Object2LongFunction<byte[]> mph;
+ /** mmap()-ed long list with the permutation initial order -> graph order */
+ LongBigList orderMap;
+ /** FileInputStream containing the permutation */
+ FileInputStream orderInputStream;
+
/**
* Constructor.
*
@@ -45,46 +53,70 @@
this.graphPath = graphPath;
this.nbIds = nbNodes;
- this.swhToNodeMap = new MapFile(graphPath + Graph.SWHID_TO_NODE, SWHID_BIN_SIZE + NODE_ID_BIN_SIZE);
+ // node -> SWHID
this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_SWHID, SWHID_BIN_SIZE);
+
+ // SWHID -> node
+ this.mph = loadMph(graphPath + ".mph");
+ this.orderInputStream = new FileInputStream(graphPath + ".order");
+ this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel());
+ }
+
+ @SuppressWarnings("unchecked")
+ public static Object2LongFunction<byte[]> loadMph(String path) throws IOException {
+ Object obj;
+ try {
+ obj = BinIO.loadObject(path);
+ } catch (ClassNotFoundException e) {
+ throw new IOException(e.getMessage());
+ }
+
+ Object2LongFunction<byte[]> res = (Object2LongFunction<byte[]>) obj;
+
+ // Backward-compatibility for old maps parametrized with <String>.
+ // New maps should be parametrized with <byte[]>, which is faster.
+ try {
+ // Try to call it with bytes, will fail if it's a O2LF<String>.
+ res.getLong("42".getBytes(StandardCharsets.UTF_8));
+ } catch (ClassCastException e) {
+ Object2LongFunction<String> mphLegacy = (Object2LongFunction<String>) obj;
+ res = (o -> {
+ byte[] bi = (byte[]) o;
+ return mphLegacy.getLong(new String(bi, StandardCharsets.UTF_8));
+ });
+ }
+ // End of backward-compatibility block
+
+ return res;
}
/**
* Converts SWHID to corresponding long node id.
*
* @param swhid node represented as a {@link SWHID}
+ * @param checkExists if true, error if the SWHID is not present in the graph, if false the check
+ * will be skipped and invalid data will be returned for non-existing SWHIDs.
* @return corresponding node as a long id
* @see SWHID
*/
- public long getNodeId(SWHID swhid) {
- /*
- * The file is sorted by swhid, hence we can binary search on swhid to get corresponding nodeId
- */
- long start = 0;
- long end = nbIds - 1;
-
- while (start <= end) {
- long lineNumber = (start + end) / 2L;
- byte[] buffer = swhToNodeMap.readAtLine(lineNumber);
- byte[] swhidBuffer = new byte[SWHID_BIN_SIZE];
- byte[] nodeBuffer = new byte[NODE_ID_BIN_SIZE];
- System.arraycopy(buffer, 0, swhidBuffer, 0, SWHID_BIN_SIZE);
- System.arraycopy(buffer, SWHID_BIN_SIZE, nodeBuffer, 0, NODE_ID_BIN_SIZE);
-
- String currentSWHID = SWHID.fromBytes(swhidBuffer).getSWHID();
- long currentNodeId = java.nio.ByteBuffer.wrap(nodeBuffer).getLong();
-
- int cmp = currentSWHID.compareTo(swhid.toString());
- if (cmp == 0) {
- return currentNodeId;
- } else if (cmp < 0) {
- start = lineNumber + 1;
- } else {
- end = lineNumber - 1;
- }
+ public long getNodeId(SWHID swhid, boolean checkExists) {
+ // 1. Hash the SWHID with the MPH to get its original ID
+ long origNodeId = mph.getLong(swhid.toString().getBytes(StandardCharsets.US_ASCII));
+
+ // 2. Use the order permutation to get the position in the permuted graph
+ long nodeId = this.orderMap.getLong(origNodeId);
+
+ // 3. Check that the position effectively corresponds to a real node using the reverse map.
+ // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid.
+ if (!checkExists || getSWHID(nodeId).equals(swhid)) {
+ return nodeId;
+ } else {
+ throw new IllegalArgumentException("Unknown SWHID: " + swhid);
}
+ }
- throw new IllegalArgumentException("Unknown SWHID: " + swhid);
+ public long getNodeId(SWHID swhid) {
+ return getNodeId(swhid, true);
}
/**
@@ -110,7 +142,7 @@
* Closes the mapping files.
*/
public void close() throws IOException {
- swhToNodeMap.close();
+ orderInputStream.close();
nodeToSwhMap.close();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
--- a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
@@ -4,7 +4,6 @@
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
-import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.io.FastBufferedReader;
@@ -12,6 +11,7 @@
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.SWHID;
import org.softwareheritage.graph.experiments.topology.ConnectedComponents;
+import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.File;
import java.io.IOException;
@@ -19,16 +19,11 @@
import java.nio.charset.StandardCharsets;
public class ExportSubdataset {
- @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast
- static Object2LongFunction<byte[]> loadMPH(String mphPath) throws IOException, ClassNotFoundException {
- return (Object2LongFunction<byte[]>) BinIO.loadObject(mphPath);
- }
-
public static void main(String[] args) throws IOException, ClassNotFoundException {
System.err.print("Loading everything...");
String graphPath = args[0];
Graph graph = new Graph(graphPath);
- Object2LongFunction<byte[]> mphMap = loadMPH(graphPath + ".mph");
+ Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(graphPath + ".mph");
System.err.println(" done.");
final long n = graph.numNodes();
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
--- a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
@@ -1,10 +1,10 @@
package org.softwareheritage.graph.utils;
import com.martiansoftware.jsap.*;
-import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
+import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.IOException;
import java.io.InputStreamReader;
@@ -28,16 +28,11 @@
return config;
}
- @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast
- static Object2LongFunction<byte[]> loadMPH(String mphPath) throws IOException, ClassNotFoundException {
- return (Object2LongFunction<byte[]>) BinIO.loadObject(mphPath);
- }
-
public static void main(String[] args) throws IOException, ClassNotFoundException {
JSAPResult config = parse_args(args);
String mphPath = config.getString("function");
- Object2LongFunction<byte[]> mphMap = loadMPH(mphPath);
+ Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(mphPath);
// TODO: wasteful to convert to/from bytes
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
--- a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
@@ -7,22 +7,18 @@
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
+import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
public class WriteRevisionTimestamps {
- @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast
- static Object2LongFunction<byte[]> loadMPH(String mphPath) throws IOException, ClassNotFoundException {
- return (Object2LongFunction<byte[]>) BinIO.loadObject(mphPath);
- }
-
public static void main(String[] args) throws IOException, ClassNotFoundException {
System.err.print("Loading everything...");
String graphPath = args[0];
String outputFile = args[1];
- Object2LongFunction<byte[]> mphMap = loadMPH(graphPath + ".mph");
+ Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(graphPath + ".mph");
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
long[][] nodePerm = BinIO.loadLongsBig(graphPath + ".order");
// NodeIdMap nodeIdMap = new NodeIdMap(graphPath, nbIds);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 24, 5:58 PM (6 d, 3 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3229768
Attached To
D5427: NodeIdMap: use the MPH + mmapped .order to translate SWHID -> node ID
Event Timeline
Log In to Comment