Differential D4028 Diff 14692 java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
Changeset View
Changeset View
Standalone View
Standalone View
java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
Show All 18 Lines | |||||
import java.io.*; | import java.io.*; | ||||
import java.nio.charset.StandardCharsets; | import java.nio.charset.StandardCharsets; | ||||
import java.util.Scanner; | import java.util.Scanner; | ||||
import java.util.concurrent.TimeUnit; | import java.util.concurrent.TimeUnit; | ||||
/** | /** | ||||
* Create maps needed at runtime by the graph service, in particular: | * Create maps needed at runtime by the graph service, in particular: | ||||
* <p> | * <p> | ||||
* - SWHID → WebGraph long node id | * <ul> | ||||
* - WebGraph long node id → SWHID (converse of the former) | * <li>SWHID → WebGraph long node id</li> | ||||
* - WebGraph long node id → SWH node type (enum) | * <li>WebGraph long node id → SWHID (converse of the former)</li> | ||||
* <li>WebGraph long node id → SWH node type (enum)</li> | |||||
* </ul> | |||||
* | * | ||||
haltode: Fix list formatting | |||||
* @author The Software Heritage developers | * @author The Software Heritage developers | ||||
*/ | */ | ||||
public class NodeMapBuilder { | public class NodeMapBuilder { | ||||
final static String SORT_BUFFER_SIZE = "40%"; | final static String SORT_BUFFER_SIZE = "40%"; | ||||
final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class); | final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class); | ||||
Show All 17 Lines | public class NodeMapBuilder { | ||||
/** | /** | ||||
* Computes and dumps on disk mapping files. | * Computes and dumps on disk mapping files. | ||||
* | * | ||||
* @param graphPath path of the compressed graph | * @param graphPath path of the compressed graph | ||||
*/ | */ | ||||
// Suppress warning for Object2LongFunction cast | // Suppress warning for Object2LongFunction cast | ||||
@SuppressWarnings("unchecked") | @SuppressWarnings("unchecked") | ||||
static void precomputeNodeIdMap(String graphPath, String tmpDir) | static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { | ||||
throws IOException { | |||||
ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
plSWHID2Node.itemsName = "swhid→node"; | plSWHID2Node.itemsName = "swhid→node"; | ||||
plNode2SWHID.itemsName = "node→swhid"; | plNode2SWHID.itemsName = "node→swhid"; | ||||
// avg speed for swhid→node is sometime skewed due to write to the sort | /* | ||||
// pipe hanging when sort is sorting; hence also desplay local speed | * avg speed for swhid→node is sometime skewed due to write to the sort pipe hanging when sort is | ||||
* sorting; hence also desplay local speed | |||||
*/ | |||||
plSWHID2Node.displayLocalSpeed = true; | plSWHID2Node.displayLocalSpeed = true; | ||||
// first half of SWHID->node mapping: SWHID -> WebGraph MPH (long) | // first half of SWHID->node mapping: SWHID -> WebGraph MPH (long) | ||||
Object2LongFunction<String> mphMap = null; | Object2LongFunction<String> mphMap = null; | ||||
try { | try { | ||||
logger.info("loading MPH function..."); | logger.info("loading MPH function..."); | ||||
mphMap = (Object2LongFunction<String>) BinIO.loadObject(graphPath + ".mph"); | mphMap = (Object2LongFunction<String>) BinIO.loadObject(graphPath + ".mph"); | ||||
logger.info("MPH function loaded"); | logger.info("MPH function loaded"); | ||||
Show All 10 Lines | static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { | ||||
logger.info("loading BFS order file..."); | logger.info("loading BFS order file..."); | ||||
long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); | long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); | ||||
logger.info("BFS order file loaded"); | logger.info("BFS order file loaded"); | ||||
if (loaded != nbIds) { | if (loaded != nbIds) { | ||||
logger.error("graph contains " + nbIds + " nodes, but read " + loaded); | logger.error("graph contains " + nbIds + " nodes, but read " + loaded); | ||||
System.exit(2); | System.exit(2); | ||||
} | } | ||||
// Create mapping SWHID -> WebGraph node id, by sequentially reading | /* | ||||
// nodes, hashing them with MPH, and permuting according to BFS order | * Create mapping SWHID -> WebGraph node id, by sequentially reading nodes, hashing them with MPH, | ||||
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, | * and permuting according to BFS order | ||||
StandardCharsets.US_ASCII)); | */ | ||||
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); | |||||
LineIterator swhidIterator = new LineIterator(buffer); | LineIterator swhidIterator = new LineIterator(buffer); | ||||
// The WebGraph node id -> SWHID mapping can be obtained from the | /* | ||||
// SWHID->node one by numerically sorting on node id and sequentially | * The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically | ||||
// writing obtained SWHIDs to a binary map. Delegates the sorting job to | * sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the | ||||
// /usr/bin/sort via pipes | * sorting job to /usr/bin/sort via pipes | ||||
*/ | |||||
ProcessBuilder processBuilder = new ProcessBuilder(); | ProcessBuilder processBuilder = new ProcessBuilder(); | ||||
processBuilder.command("sort", "--numeric-sort", "--key", "2", | processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE, | ||||
"--buffer-size", SORT_BUFFER_SIZE, | |||||
"--temporary-directory", tmpDir); | "--temporary-directory", tmpDir); | ||||
Process sort = processBuilder.start(); | Process sort = processBuilder.start(); | ||||
BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | ||||
BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | ||||
// for the binary format of swhidToNodeMap, see Python module swh.graph.swhid:SwhidToIntMap | // for the binary format of swhidToNodeMap, see Python module swh.graph.swhid:SwhidToIntMap | ||||
// for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap | // for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap | ||||
try (DataOutputStream swhidToNodeMap = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(graphPath + Graph.SWHID_TO_NODE))); | try (DataOutputStream swhidToNodeMap = new DataOutputStream( | ||||
Done Inline ActionsFix inline comments. haltode: Fix inline comments. | |||||
BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream(new FileOutputStream(graphPath + Graph.NODE_TO_SWHID))) { | new BufferedOutputStream(new FileOutputStream(graphPath + Graph.SWHID_TO_NODE))); | ||||
BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream( | |||||
// background handler for sort output, it will be fed SWHID/node | new FileOutputStream(graphPath + Graph.NODE_TO_SWHID))) { | ||||
// pairs while swhidToNodeMap is being filled, and will itself fill | |||||
// nodeToSwhidMap as soon as data from sort is ready | /* | ||||
* background handler for sort output, it will be fed SWHID/node pairs while swhidToNodeMap is being | |||||
* filled, and will itself fill nodeToSwhidMap as soon as data from sort is ready | |||||
*/ | |||||
SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID); | SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID); | ||||
outputHandler.start(); | outputHandler.start(); | ||||
// Type map from WebGraph node ID to SWH type. Used at runtime by | /* | ||||
// pure Java graph traversals to efficiently check edge | * Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to | ||||
// restrictions. | * efficiently check edge restrictions. | ||||
final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) | */ | ||||
/ Math.log(2)); | final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); | ||||
final int nbBitsPerNodeType = log2NbTypes; | final int nbBitsPerNodeType = log2NbTypes; | ||||
LongArrayBitVector nodeTypesBitVector = | LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); | ||||
LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); | |||||
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); | LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); | ||||
plSWHID2Node.start("filling swhid2node map"); | plSWHID2Node.start("filling swhid2node map"); | ||||
for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) { | for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) { | ||||
String swhidStr = swhidIterator.next().toString(); | String swhidStr = swhidIterator.next().toString(); | ||||
SWHID swhid = new SWHID(swhidStr); | SWHID swhid = new SWHID(swhidStr); | ||||
byte[] swhidBin = swhid.toBytes(); | byte[] swhidBin = swhid.toBytes(); | ||||
long mphId = mphMap.getLong(swhidStr); | long mphId = mphMap.getLong(swhidStr); | ||||
long nodeId = BigArrays.get(bfsMap, mphId); | long nodeId = BigArrays.get(bfsMap, mphId); | ||||
swhidToNodeMap.write(swhidBin, 0, swhidBin.length); | swhidToNodeMap.write(swhidBin, 0, swhidBin.length); | ||||
swhidToNodeMap.writeLong(nodeId); | swhidToNodeMap.writeLong(nodeId); | ||||
sort_stdin.write((swhidStr + "\t" + nodeId + "\n") | sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII)); | ||||
.getBytes(StandardCharsets.US_ASCII)); | |||||
nodeTypesMap.set(nodeId, swhid.getType().ordinal()); | nodeTypesMap.set(nodeId, swhid.getType().ordinal()); | ||||
plSWHID2Node.lightUpdate(); | plSWHID2Node.lightUpdate(); | ||||
} | } | ||||
plSWHID2Node.done(); | plSWHID2Node.done(); | ||||
sort_stdin.close(); | sort_stdin.close(); | ||||
// write type map | // write type map | ||||
Show All 32 Lines | private static class SortOutputHandler extends Thread { | ||||
public void run() { | public void run() { | ||||
boolean sortDone = false; | boolean sortDone = false; | ||||
logger.info("node2swhid: waiting for sort output..."); | logger.info("node2swhid: waiting for sort output..."); | ||||
while (input.hasNextLine()) { | while (input.hasNextLine()) { | ||||
if (!sortDone) { | if (!sortDone) { | ||||
sortDone = true; | sortDone = true; | ||||
this.pl.start("filling node2swhid map"); | this.pl.start("filling node2swhid map"); | ||||
} | } | ||||
String line = input.nextLine(); // format: SWHID <TAB> NODE_ID | String line = input.nextLine(); // format: SWHID <TAB> NODE_ID | ||||
SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID | SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID | ||||
try { | try { | ||||
output.write((byte[]) swhid.toBytes()); | output.write((byte[]) swhid.toBytes()); | ||||
} catch (IOException e) { | } catch (IOException e) { | ||||
logger.error("writing to node->SWHID map failed with: " + e); | logger.error("writing to node->SWHID map failed with: " + e); | ||||
} | } | ||||
this.pl.lightUpdate(); | this.pl.lightUpdate(); | ||||
} | } | ||||
this.pl.done(); | this.pl.done(); | ||||
} | } | ||||
} | } | ||||
} | } |
Fix list formatting