diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
index 492ece2..ddee3cd 100644
--- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
@@ -1,130 +1,148 @@
package org.softwareheritage.graph.maps;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.util.ByteBufferLongBigList;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.SWHID;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
* Mapping between internal long node id and external SWHID.
*
- * The SWHID -> node mapping is obtained from hashing the SWHID with a MPH,
- * then permuting it using an mmap()-ed .order file containing the graph
- * permutation.
+ * The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using
+ * an mmap()-ed .order file containing the graph permutation.
*
* The node -> SWHID reverse mapping is pre-computed and dumped on disk in the
* {@link NodeMapBuilder} class, then it is loaded here using mmap().
*
* @author The Software Heritage developers
* @see NodeMapBuilder
*/
public class NodeIdMap {
/** Fixed length of binary SWHID buffer */
public static final int SWHID_BIN_SIZE = 22;
/** Graph path and basename */
String graphPath;
/** Number of ids to map */
long nbIds;
/** mmap()-ed NODE_TO_SWHID file */
MapFile nodeToSwhMap;
/** Minimal perfect hash (MPH) function SWHID -> initial order */
Object2LongFunction mph;
/** mmap()-ed long list with the permutation initial order -> graph order */
LongBigList orderMap;
/** FileInputStream containing the permutation */
FileInputStream orderInputStream;
/**
* Constructor.
*
* @param graphPath full graph path
* @param nbNodes number of nodes in the graph
*/
public NodeIdMap(String graphPath, long nbNodes) throws IOException {
this.graphPath = graphPath;
this.nbIds = nbNodes;
// node -> SWHID
this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_SWHID, SWHID_BIN_SIZE);
// SWHID -> node
this.mph = loadMph(graphPath + ".mph");
this.orderInputStream = new FileInputStream(graphPath + ".order");
this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel());
}
@SuppressWarnings("unchecked")
- private Object2LongFunction loadMph(String path) throws IOException {
+ public static Object2LongFunction loadMph(String path) throws IOException {
+ Object obj;
try {
- return (Object2LongFunction) BinIO.loadObject(path);
+ obj = BinIO.loadObject(path);
} catch (ClassNotFoundException e) {
throw new IOException(e.getMessage());
}
+
+ Object2LongFunction res = (Object2LongFunction) obj;
+
+ // Backward-compatibility for old maps parametrized with .
+ // New maps should be parametrized with , which is faster.
+ try {
+ // Try to call it with bytes, will fail if it's a O2LF.
+ res.getLong("42".getBytes(StandardCharsets.UTF_8));
+ } catch (ClassCastException e) {
+ Object2LongFunction mphLegacy = (Object2LongFunction) obj;
+ res = (o -> {
+ byte[] bi = (byte[]) o;
+ return mphLegacy.getLong(new String(bi, StandardCharsets.UTF_8));
+ });
+ }
+ // End of backward-compatibility block
+
+ return res;
}
/**
* Converts SWHID to corresponding long node id.
*
* @param swhid node represented as a {@link SWHID}
* @param checkExists if true, error if the SWHID is not present in the graph, if false the check
* will be skipped and invalid data will be returned for non-existing SWHIDs.
* @return corresponding node as a long id
* @see SWHID
*/
public long getNodeId(SWHID swhid, boolean checkExists) {
// 1. Hash the SWHID with the MPH to get its original ID
long origNodeId = mph.getLong(swhid.toString().getBytes(StandardCharsets.US_ASCII));
// 2. Use the order permutation to get the position in the permuted graph
long nodeId = this.orderMap.getLong(origNodeId);
// 3. Check that the position effectively corresponds to a real node using the reverse map.
// This is necessary because the MPH makes no guarantees on whether the input SWHID is valid.
if (!checkExists || getSWHID(nodeId).equals(swhid)) {
return nodeId;
} else {
throw new IllegalArgumentException("Unknown SWHID: " + swhid);
}
}
public long getNodeId(SWHID swhid) {
return getNodeId(swhid, true);
}
/**
* Converts a node long id to corresponding SWHID.
*
* @param nodeId node as a long id
* @return corresponding node as a {@link SWHID}
* @see SWHID
*/
public SWHID getSWHID(long nodeId) {
/*
* Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's
* swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid
*/
if (nodeId < 0 || nodeId >= nbIds) {
throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds);
}
return SWHID.fromBytes(nodeToSwhMap.readAtLine(nodeId));
}
/**
* Closes the mapping files.
*/
public void close() throws IOException {
orderInputStream.close();
nodeToSwhMap.close();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
index 1938355..3471ed5 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
@@ -1,81 +1,76 @@
package org.softwareheritage.graph.utils;
import com.google.common.primitives.Longs;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
-import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.SWHID;
import org.softwareheritage.graph.experiments.topology.ConnectedComponents;
+import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
public class ExportSubdataset {
- @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast
- static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException {
- return (Object2LongFunction) BinIO.loadObject(mphPath);
- }
-
public static void main(String[] args) throws IOException, ClassNotFoundException {
System.err.print("Loading everything...");
String graphPath = args[0];
Graph graph = new Graph(graphPath);
- Object2LongFunction mphMap = loadMPH(graphPath + ".mph");
+ Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph");
System.err.println(" done.");
final long n = graph.numNodes();
// Allow enough memory to behave like in-memory queue
int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
// Use a disk based queue to store BFS frontier
final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue");
final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
final byte[] byteBuf = new byte[Long.BYTES];
// WARNING: no 64-bit version of this data-structure, but it can support
// indices up to 2^37
LongArrayBitVector visited = LongArrayBitVector.ofLength(n);
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
LineIterator lineIterator = new LineIterator(buffer);
while (lineIterator.hasNext()) {
String line = lineIterator.next().toString();
long i;
try {
// i = mphMap.getLong(line.getBytes(StandardCharsets.UTF_8));
i = graph.getNodeId(new SWHID(line));
} catch (IllegalArgumentException e) {
continue;
}
queue.enqueue(Longs.toByteArray(i));
visited.set(i);
while (!queue.isEmpty()) {
queue.dequeue(byteBuf);
final long currentNode = Longs.fromByteArray(byteBuf);
SWHID currentNodeSWHID = graph.getSWHID(currentNode);
final LazyLongIterator iterator = graph.successors(currentNode);
long succ;
while ((succ = iterator.nextLong()) != -1) {
System.out.format("%s %s\n", currentNodeSWHID, graph.getSWHID(succ));
if (visited.getBoolean(succ))
continue;
visited.set(succ);
queue.enqueue(Longs.toByteArray(succ));
}
}
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
index 3be8ae0..0d672e2 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
@@ -1,51 +1,46 @@
package org.softwareheritage.graph.utils;
import com.martiansoftware.jsap.*;
-import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
+import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
public class MPHTranslate {
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(MPHTranslate.class.getName(), "",
new Parameter[]{new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.REQUIRED,
"Filename of the serialized MPH"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
- @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast
- static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException {
- return (Object2LongFunction) BinIO.loadObject(mphPath);
- }
-
public static void main(String[] args) throws IOException, ClassNotFoundException {
JSAPResult config = parse_args(args);
String mphPath = config.getString("function");
- Object2LongFunction mphMap = loadMPH(mphPath);
+ Object2LongFunction mphMap = NodeIdMap.loadMph(mphPath);
// TODO: wasteful to convert to/from bytes
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
LineIterator lineIterator = new LineIterator(buffer);
while (lineIterator.hasNext()) {
String line = lineIterator.next().toString();
System.out.println(mphMap.getLong(line.getBytes(StandardCharsets.US_ASCII)));
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
index 52491b2..7d35574 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java
@@ -1,57 +1,53 @@
package org.softwareheritage.graph.utils;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
+import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
public class WriteRevisionTimestamps {
- @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast
- static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException {
- return (Object2LongFunction) BinIO.loadObject(mphPath);
- }
-
public static void main(String[] args) throws IOException, ClassNotFoundException {
System.err.print("Loading everything...");
String graphPath = args[0];
String outputFile = args[1];
- Object2LongFunction mphMap = loadMPH(graphPath + ".mph");
+ Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph");
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
long[][] nodePerm = BinIO.loadLongsBig(graphPath + ".order");
// NodeIdMap nodeIdMap = new NodeIdMap(graphPath, nbIds);
long[][] timestampArray = LongBigArrays.newBigArray(nbIds);
BigArrays.fill(timestampArray, Long.MIN_VALUE);
System.err.println(" done.");
// TODO: wasteful to convert to/from bytes
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
LineIterator lineIterator = new LineIterator(buffer);
while (lineIterator.hasNext()) {
String line = lineIterator.next().toString();
String[] line_elements = line.split("[ \\t]");
// SWHID currentRev = new SWHID(line_elements[0].strip());
long revId = -1;
long timestamp = -1;
try {
// revId = nodeIdMap.getNodeId(currentRev);
long revHash = mphMap.getLong(line_elements[0].strip().getBytes(StandardCharsets.US_ASCII));
revId = BigArrays.get(nodePerm, revHash);
timestamp = Long.parseLong(line_elements[1].strip());
} catch (IllegalArgumentException e) {
continue;
}
BigArrays.set(timestampArray, revId, timestamp);
// System.err.println(revId + " " + timestamp);
}
BinIO.storeLongs(timestampArray, outputFile);
}
}