diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java index 492ece2..ddee3cd 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java @@ -1,130 +1,148 @@ package org.softwareheritage.graph.maps; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.util.ByteBufferLongBigList; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SWHID; import java.io.FileInputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; /** * Mapping between internal long node id and external SWHID. *

- * The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, - * then permuting it using an mmap()-ed .order file containing the graph - * permutation. + * The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using + * an mmap()-ed .order file containing the graph permutation. * * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the * {@link NodeMapBuilder} class, then it is loaded here using mmap(). * * @author The Software Heritage developers * @see NodeMapBuilder */ public class NodeIdMap { /** Fixed length of binary SWHID buffer */ public static final int SWHID_BIN_SIZE = 22; /** Graph path and basename */ String graphPath; /** Number of ids to map */ long nbIds; /** mmap()-ed NODE_TO_SWHID file */ MapFile nodeToSwhMap; /** Minimal perfect hash (MPH) function SWHID -> initial order */ Object2LongFunction mph; /** mmap()-ed long list with the permutation initial order -> graph order */ LongBigList orderMap; /** FileInputStream containing the permutation */ FileInputStream orderInputStream; /** * Constructor. * * @param graphPath full graph path * @param nbNodes number of nodes in the graph */ public NodeIdMap(String graphPath, long nbNodes) throws IOException { this.graphPath = graphPath; this.nbIds = nbNodes; // node -> SWHID this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_SWHID, SWHID_BIN_SIZE); // SWHID -> node this.mph = loadMph(graphPath + ".mph"); this.orderInputStream = new FileInputStream(graphPath + ".order"); this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel()); } @SuppressWarnings("unchecked") - private Object2LongFunction loadMph(String path) throws IOException { + public static Object2LongFunction loadMph(String path) throws IOException { + Object obj; try { - return (Object2LongFunction) BinIO.loadObject(path); + obj = BinIO.loadObject(path); } catch (ClassNotFoundException e) { throw new IOException(e.getMessage()); } + + Object2LongFunction res = (Object2LongFunction) obj; + + // Backward-compatibility for old maps parametrized with . + // New maps should be parametrized with , which is faster. + try { + // Try to call it with bytes, will fail if it's a O2LF. + res.getLong("42".getBytes(StandardCharsets.UTF_8)); + } catch (ClassCastException e) { + Object2LongFunction mphLegacy = (Object2LongFunction) obj; + res = (o -> { + byte[] bi = (byte[]) o; + return mphLegacy.getLong(new String(bi, StandardCharsets.UTF_8)); + }); + } + // End of backward-compatibility block + + return res; } /** * Converts SWHID to corresponding long node id. * * @param swhid node represented as a {@link SWHID} * @param checkExists if true, error if the SWHID is not present in the graph, if false the check * will be skipped and invalid data will be returned for non-existing SWHIDs. * @return corresponding node as a long id * @see SWHID */ public long getNodeId(SWHID swhid, boolean checkExists) { // 1. Hash the SWHID with the MPH to get its original ID long origNodeId = mph.getLong(swhid.toString().getBytes(StandardCharsets.US_ASCII)); // 2. Use the order permutation to get the position in the permuted graph long nodeId = this.orderMap.getLong(origNodeId); // 3. Check that the position effectively corresponds to a real node using the reverse map. // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid. if (!checkExists || getSWHID(nodeId).equals(swhid)) { return nodeId; } else { throw new IllegalArgumentException("Unknown SWHID: " + swhid); } } public long getNodeId(SWHID swhid) { return getNodeId(swhid, true); } /** * Converts a node long id to corresponding SWHID. * * @param nodeId node as a long id * @return corresponding node as a {@link SWHID} * @see SWHID */ public SWHID getSWHID(long nodeId) { /* * Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's * swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid */ if (nodeId < 0 || nodeId >= nbIds) { throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); } return SWHID.fromBytes(nodeToSwhMap.readAtLine(nodeId)); } /** * Closes the mapping files. */ public void close() throws IOException { orderInputStream.close(); nodeToSwhMap.close(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java index 1938355..3471ed5 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java @@ -1,81 +1,76 @@ package org.softwareheritage.graph.utils; import com.google.common.primitives.Longs; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; -import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.experiments.topology.ConnectedComponents; +import org.softwareheritage.graph.maps.NodeIdMap; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class ExportSubdataset { - @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast - static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { - return (Object2LongFunction) BinIO.loadObject(mphPath); - } - public static void main(String[] args) throws IOException, ClassNotFoundException { System.err.print("Loading everything..."); String graphPath = args[0]; Graph graph = new Graph(graphPath); - Object2LongFunction mphMap = loadMPH(graphPath + ".mph"); + Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph"); System.err.println(" done."); final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 LongArrayBitVector visited = LongArrayBitVector.ofLength(n); FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator lineIterator = new LineIterator(buffer); while (lineIterator.hasNext()) { String line = lineIterator.next().toString(); long i; try { // i = mphMap.getLong(line.getBytes(StandardCharsets.UTF_8)); i = graph.getNodeId(new SWHID(line)); } catch (IllegalArgumentException e) { continue; } queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); SWHID currentNodeSWHID = graph.getSWHID(currentNode); final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { System.out.format("%s %s\n", currentNodeSWHID, graph.getSWHID(succ)); if (visited.getBoolean(succ)) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java index 3be8ae0..0d672e2 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java @@ -1,51 +1,46 @@ package org.softwareheritage.graph.utils; import com.martiansoftware.jsap.*; -import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; +import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class MPHTranslate { private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(MPHTranslate.class.getName(), "", new Parameter[]{new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.REQUIRED, "Filename of the serialized MPH"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } - @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast - static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { - return (Object2LongFunction) BinIO.loadObject(mphPath); - } - public static void main(String[] args) throws IOException, ClassNotFoundException { JSAPResult config = parse_args(args); String mphPath = config.getString("function"); - Object2LongFunction mphMap = loadMPH(mphPath); + Object2LongFunction mphMap = NodeIdMap.loadMph(mphPath); // TODO: wasteful to convert to/from bytes FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator lineIterator = new LineIterator(buffer); while (lineIterator.hasNext()) { String line = lineIterator.next().toString(); System.out.println(mphMap.getLong(line.getBytes(StandardCharsets.US_ASCII))); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java index 52491b2..7d35574 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java @@ -1,57 +1,53 @@ package org.softwareheritage.graph.utils; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; +import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class WriteRevisionTimestamps { - @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast - static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { - return (Object2LongFunction) BinIO.loadObject(mphPath); - } - public static void main(String[] args) throws IOException, ClassNotFoundException { System.err.print("Loading everything..."); String graphPath = args[0]; String outputFile = args[1]; - Object2LongFunction mphMap = loadMPH(graphPath + ".mph"); + Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph"); long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); long[][] nodePerm = BinIO.loadLongsBig(graphPath + ".order"); // NodeIdMap nodeIdMap = new NodeIdMap(graphPath, nbIds); long[][] timestampArray = LongBigArrays.newBigArray(nbIds); BigArrays.fill(timestampArray, Long.MIN_VALUE); System.err.println(" done."); // TODO: wasteful to convert to/from bytes FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator lineIterator = new LineIterator(buffer); while (lineIterator.hasNext()) { String line = lineIterator.next().toString(); String[] line_elements = line.split("[ \\t]"); // SWHID currentRev = new SWHID(line_elements[0].strip()); long revId = -1; long timestamp = -1; try { // revId = nodeIdMap.getNodeId(currentRev); long revHash = mphMap.getLong(line_elements[0].strip().getBytes(StandardCharsets.US_ASCII)); revId = BigArrays.get(nodePerm, revHash); timestamp = Long.parseLong(line_elements[1].strip()); } catch (IllegalArgumentException e) { continue; } BigArrays.set(timestampArray, revId, timestamp); // System.err.println(revId + " " + timestamp); } BinIO.storeLongs(timestampArray, outputFile); } }