diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
index 5d7641a..2a2c50f 100644
--- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
@@ -1,192 +1,186 @@
package org.softwareheritage.graph.maps;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.util.ByteBufferLongBigList;
import org.softwareheritage.graph.SWHID;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
* Mapping between internal long node id and external SWHID.
*
* The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using
* an mmap()-ed .order file containing the graph permutation.
*
* The node -> SWHID reverse mapping is pre-computed and dumped on disk in the
* {@link NodeMapBuilder} class, then it is loaded here using mmap().
*
* @author The Software Heritage developers
* @see NodeMapBuilder
*/
public class NodeIdMap {
/** Fixed length of binary SWHID buffer */
public static final int SWHID_BIN_SIZE = 22;
- /** File extension for the SWHID to long node id map */
- public static final String SWHID_TO_NODE = ".swhid2node.bin";
/** File extension for the long node id to SWHID map */
public static final String NODE_TO_SWHID = ".node2swhid.bin";
/** Graph path and basename */
String graphPath;
/** Number of ids to map */
long nbIds;
/** mmap()-ed NODE_TO_SWHID file */
MapFile nodeToSwhMap;
/** Minimal perfect hash (MPH) function SWHID -> initial order */
Object2LongFunction mph;
/** mmap()-ed long list with the permutation initial order -> graph order */
LongBigList orderMap;
/** FileInputStream containing the permutation */
FileInputStream orderInputStream;
/**
* Constructor.
*
* @param graphPath full graph path
* @param nbNodes number of nodes in the graph
*/
public NodeIdMap(String graphPath, long nbNodes) throws IOException {
this.graphPath = graphPath;
this.nbIds = nbNodes;
// node -> SWHID
this.nodeToSwhMap = new MapFile(graphPath + NODE_TO_SWHID, SWHID_BIN_SIZE);
// SWHID -> node
this.mph = loadMph(graphPath + ".mph");
this.orderInputStream = new FileInputStream(graphPath + ".order");
this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel());
}
@SuppressWarnings("unchecked")
public static Object2LongFunction loadMph(String path) throws IOException {
Object obj;
try {
obj = BinIO.loadObject(path);
} catch (ClassNotFoundException e) {
throw new IOException(e.getMessage());
}
Object2LongFunction res = (Object2LongFunction) obj;
// Backward-compatibility for old maps parametrized with .
// New maps should be parametrized with , which is faster.
try {
// Try to call it with bytes, will fail if it's a O2LF.
res.getLong("42".getBytes(StandardCharsets.UTF_8));
} catch (ClassCastException e) {
class StringCompatibleByteFunction implements Object2LongFunction, Size64 {
private final Object2LongFunction legacyFunction;
public StringCompatibleByteFunction(Object2LongFunction legacyFunction) {
this.legacyFunction = legacyFunction;
}
@Override
public long getLong(Object o) {
byte[] bi = (byte[]) o;
return legacyFunction.getLong(new String(bi, StandardCharsets.UTF_8));
}
@Override
public int size() {
return legacyFunction.size();
}
@Override
public long size64() {
return (legacyFunction instanceof Size64)
? ((Size64) legacyFunction).size64()
: legacyFunction.size();
}
}
Object2LongFunction mphLegacy = (Object2LongFunction) obj;
return new StringCompatibleByteFunction(mphLegacy);
- /*
- * res = (o -> { byte[] bi = (byte[]) o; return mphLegacy.getLong(new String(bi,
- * StandardCharsets.UTF_8)); });
- */
}
// End of backward-compatibility block
return res;
}
/**
* Converts byte-form SWHID to corresponding long node id. Low-level function, does not check if the
* SWHID is valid.
*
* @param swhid node represented as bytes
* @return corresponding node as a long id
*/
public long getNodeId(byte[] swhid) {
// 1. Hash the SWHID with the MPH to get its original ID
long origNodeId = mph.getLong(swhid);
// 2. Use the order permutation to get the position in the permuted graph
return this.orderMap.getLong(origNodeId);
}
/**
* Converts SWHID to corresponding long node id.
*
* @param swhid node represented as a {@link SWHID}
* @param checkExists if true, error if the SWHID is not present in the graph, if false the check
* will be skipped and invalid data will be returned for non-existing SWHIDs.
* @return corresponding node as a long id
* @see SWHID
*/
public long getNodeId(SWHID swhid, boolean checkExists) {
// Convert the SWHID to bytes and call getNodeId()
long nodeId = getNodeId(swhid.toString().getBytes(StandardCharsets.US_ASCII));
// Check that the position effectively corresponds to a real node using the reverse map.
// This is necessary because the MPH makes no guarantees on whether the input SWHID is valid.
if (!checkExists || getSWHID(nodeId).equals(swhid)) {
return nodeId;
} else {
throw new IllegalArgumentException("Unknown SWHID: " + swhid);
}
}
public long getNodeId(SWHID swhid) {
return getNodeId(swhid, true);
}
/**
* Converts a node long id to corresponding SWHID.
*
* @param nodeId node as a long id
* @return corresponding node as a {@link SWHID}
* @see SWHID
*/
public SWHID getSWHID(long nodeId) {
/*
* Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's
* swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid
*/
if (nodeId < 0 || nodeId >= nbIds) {
throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds);
}
return SWHID.fromBytes(nodeToSwhMap.readAtLine(nodeId));
}
/**
* Closes the mapping files.
*/
public void close() throws IOException {
orderInputStream.close();
nodeToSwhMap.close();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
index 2050bba..626c747 100644
--- a/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
@@ -1,216 +1,191 @@
package org.softwareheritage.graph.maps;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SWHID;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;
import java.util.concurrent.TimeUnit;
/**
* Create maps needed at runtime by the graph service, in particular:
*
*
* - SWHID → WebGraph long node id
* - WebGraph long node id → SWHID (converse of the former)
* - WebGraph long node id → SWH node type (enum)
*
*
* @author The Software Heritage developers
*/
public class NodeMapBuilder {
final static String SORT_BUFFER_SIZE = "40%";
final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class);
/**
* Main entrypoint.
*
* @param args command line arguments
*/
public static void main(String[] args) throws IOException {
if (args.length != 2) {
logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV");
System.exit(1);
}
String graphPath = args[0];
String tmpDir = args[1];
logger.info("starting maps generation...");
precomputeNodeIdMap(graphPath, tmpDir);
logger.info("maps generation completed");
}
/**
* Computes and dumps on disk mapping files.
*
* @param graphPath path of the compressed graph
*/
- // Suppress warning for Object2LongFunction cast
- @SuppressWarnings("unchecked")
static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException {
ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
- plSWHID2Node.itemsName = "swhid→node";
- plNode2SWHID.itemsName = "node→swhid";
-
- /*
- * avg speed for swhid→node is sometime skewed due to write to the sort pipe hanging when sort is
- * sorting; hence also desplay local speed
- */
- plSWHID2Node.displayLocalSpeed = true;
+ plSWHID2Node.itemsName = "Hashing swhid→node";
+ plNode2SWHID.itemsName = "Building map node→swhid";
// first half of SWHID->node mapping: SWHID -> WebGraph MPH (long)
- Object2LongFunction mphMap = null;
- try {
- logger.info("loading MPH function...");
- mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph");
- logger.info("MPH function loaded");
- } catch (ClassNotFoundException e) {
- logger.error("unknown class object in .mph file: " + e);
- System.exit(2);
- }
+ Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph");
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
plSWHID2Node.expectedUpdates = nbIds;
plNode2SWHID.expectedUpdates = nbIds;
// second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long)
long[][] bfsMap = LongBigArrays.newBigArray(nbIds);
logger.info("loading BFS order file...");
long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap);
logger.info("BFS order file loaded");
if (loaded != nbIds) {
logger.error("graph contains " + nbIds + " nodes, but read " + loaded);
System.exit(2);
}
/*
- * Create mapping SWHID -> WebGraph node id, by sequentially reading nodes, hashing them with MPH,
- * and permuting according to BFS order
+ * Read on stdin a list of SWHIDs, hash them with MPH, then permute them according to the .order
+ * file
*/
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
LineIterator swhidIterator = new LineIterator(buffer);
/*
* The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically
* sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the
* sorting job to /usr/bin/sort via pipes
*/
ProcessBuilder processBuilder = new ProcessBuilder();
processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE,
"--temporary-directory", tmpDir);
Process sort = processBuilder.start();
BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream());
BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream());
- // for the binary format of swhidToNodeMap, see Python module swh.graph.swhid:SwhidToIntMap
// for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap
- try (DataOutputStream swhidToNodeMap = new DataOutputStream(
- new BufferedOutputStream(new FileOutputStream(graphPath + NodeIdMap.SWHID_TO_NODE)));
- BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream(
- new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) {
+ try (BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream(
+ new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) {
/*
- * background handler for sort output, it will be fed SWHID/node pairs while swhidToNodeMap is being
- * filled, and will itself fill nodeToSwhidMap as soon as data from sort is ready
+ * background handler for sort output, it will be fed SWHID/node pairs, and will itself fill
+ * nodeToSwhidMap as soon as data from sort is ready.
*/
SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID);
outputHandler.start();
/*
* Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to
* efficiently check edge restrictions.
*/
- final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2));
- final int nbBitsPerNodeType = log2NbTypes;
+ final int nbBitsPerNodeType = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2));
LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds);
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType);
- plSWHID2Node.start("filling swhid2node map");
+ plSWHID2Node.start("Hashing SWHIDs to fill sort input");
for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) {
String swhidStr = swhidIterator.next().toString();
SWHID swhid = new SWHID(swhidStr);
- byte[] swhidBin = swhid.toBytes();
long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII));
long nodeId = BigArrays.get(bfsMap, mphId);
-
- swhidToNodeMap.write(swhidBin, 0, swhidBin.length);
- swhidToNodeMap.writeLong(nodeId);
sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII));
nodeTypesMap.set(nodeId, swhid.getType().ordinal());
plSWHID2Node.lightUpdate();
}
plSWHID2Node.done();
sort_stdin.close();
// write type map
logger.info("storing type map");
BinIO.storeObject(nodeTypesMap, graphPath + NodeTypesMap.NODE_TO_TYPE);
logger.info("type map stored");
// wait for nodeToSwhidMap filling
try {
logger.info("waiting for node2swhid map...");
int sortExitCode = sort.waitFor();
if (sortExitCode != 0) {
logger.error("sort returned non-zero exit code: " + sortExitCode);
System.exit(2);
}
outputHandler.join();
} catch (InterruptedException e) {
logger.error("processing of sort output failed with: " + e);
System.exit(2);
}
}
-
}
private static class SortOutputHandler extends Thread {
- private Scanner input;
- private OutputStream output;
- private ProgressLogger pl;
+ private final Scanner input;
+ private final OutputStream output;
+ private final ProgressLogger pl;
SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) {
this.input = new Scanner(input, StandardCharsets.US_ASCII);
this.output = output;
this.pl = pl;
}
public void run() {
boolean sortDone = false;
logger.info("node2swhid: waiting for sort output...");
while (input.hasNextLine()) {
if (!sortDone) {
sortDone = true;
this.pl.start("filling node2swhid map");
}
String line = input.nextLine(); // format: SWHID NODE_ID
SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID
try {
- output.write((byte[]) swhid.toBytes());
+ output.write(swhid.toBytes());
} catch (IOException e) {
logger.error("writing to node->SWHID map failed with: " + e);
}
this.pl.lightUpdate();
}
this.pl.done();
}
}
}
diff --git a/swh/graph/tests/dataset/output/example.swhid2node.bin b/swh/graph/tests/dataset/output/example.swhid2node.bin
deleted file mode 100644
index b8df2fa..0000000
Binary files a/swh/graph/tests/dataset/output/example.swhid2node.bin and /dev/null differ