Differential D6699 Diff 24489 java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
Changeset View
Changeset View
Standalone View
Standalone View
java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | public static void main(String[] args) throws IOException { | ||||
logger.info("maps generation completed"); | logger.info("maps generation completed"); | ||||
} | } | ||||
/** | /** | ||||
* Computes and dumps on disk mapping files. | * Computes and dumps on disk mapping files. | ||||
* | * | ||||
* @param graphPath path of the compressed graph | * @param graphPath path of the compressed graph | ||||
*/ | */ | ||||
// Suppress warning for Object2LongFunction cast | |||||
@SuppressWarnings("unchecked") | |||||
static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { | static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { | ||||
ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
plSWHID2Node.itemsName = "swhid→node"; | plSWHID2Node.itemsName = "Hashing swhid→node"; | ||||
plNode2SWHID.itemsName = "node→swhid"; | plNode2SWHID.itemsName = "Building map node→swhid"; | ||||
/* | |||||
* avg speed for swhid→node is sometime skewed due to write to the sort pipe hanging when sort is | |||||
* sorting; hence also desplay local speed | |||||
*/ | |||||
plSWHID2Node.displayLocalSpeed = true; | |||||
// first half of SWHID->node mapping: SWHID -> WebGraph MPH (long) | // first half of SWHID->node mapping: SWHID -> WebGraph MPH (long) | ||||
Object2LongFunction<byte[]> mphMap = null; | Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(graphPath + ".mph"); | ||||
try { | |||||
logger.info("loading MPH function..."); | |||||
mphMap = (Object2LongFunction<byte[]>) BinIO.loadObject(graphPath + ".mph"); | |||||
logger.info("MPH function loaded"); | |||||
} catch (ClassNotFoundException e) { | |||||
logger.error("unknown class object in .mph file: " + e); | |||||
System.exit(2); | |||||
} | |||||
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); | long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); | ||||
plSWHID2Node.expectedUpdates = nbIds; | plSWHID2Node.expectedUpdates = nbIds; | ||||
plNode2SWHID.expectedUpdates = nbIds; | plNode2SWHID.expectedUpdates = nbIds; | ||||
// second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long) | // second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long) | ||||
long[][] bfsMap = LongBigArrays.newBigArray(nbIds); | long[][] bfsMap = LongBigArrays.newBigArray(nbIds); | ||||
logger.info("loading BFS order file..."); | logger.info("loading BFS order file..."); | ||||
long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); | long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); | ||||
logger.info("BFS order file loaded"); | logger.info("BFS order file loaded"); | ||||
if (loaded != nbIds) { | if (loaded != nbIds) { | ||||
logger.error("graph contains " + nbIds + " nodes, but read " + loaded); | logger.error("graph contains " + nbIds + " nodes, but read " + loaded); | ||||
System.exit(2); | System.exit(2); | ||||
} | } | ||||
/* | /* | ||||
* Create mapping SWHID -> WebGraph node id, by sequentially reading nodes, hashing them with MPH, | * Read on stdin a list of SWHIDs, hash them with MPH, then permute them according to the .order | ||||
* and permuting according to BFS order | * file | ||||
*/ | */ | ||||
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); | FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); | ||||
LineIterator swhidIterator = new LineIterator(buffer); | LineIterator swhidIterator = new LineIterator(buffer); | ||||
/* | /* | ||||
* The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically | * The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically | ||||
* sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the | * sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the | ||||
* sorting job to /usr/bin/sort via pipes | * sorting job to /usr/bin/sort via pipes | ||||
*/ | */ | ||||
ProcessBuilder processBuilder = new ProcessBuilder(); | ProcessBuilder processBuilder = new ProcessBuilder(); | ||||
processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE, | processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE, | ||||
"--temporary-directory", tmpDir); | "--temporary-directory", tmpDir); | ||||
Process sort = processBuilder.start(); | Process sort = processBuilder.start(); | ||||
BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | ||||
BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | ||||
// for the binary format of swhidToNodeMap, see Python module swh.graph.swhid:SwhidToIntMap | |||||
// for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap | // for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap | ||||
try (DataOutputStream swhidToNodeMap = new DataOutputStream( | try (BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream( | ||||
new BufferedOutputStream(new FileOutputStream(graphPath + NodeIdMap.SWHID_TO_NODE))); | |||||
BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream( | |||||
new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) { | new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) { | ||||
/* | /* | ||||
* background handler for sort output, it will be fed SWHID/node pairs while swhidToNodeMap is being | * background handler for sort output, it will be fed SWHID/node pairs, and will itself fill | ||||
* filled, and will itself fill nodeToSwhidMap as soon as data from sort is ready | * nodeToSwhidMap as soon as data from sort is ready. | ||||
*/ | */ | ||||
SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID); | SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID); | ||||
outputHandler.start(); | outputHandler.start(); | ||||
/* | /* | ||||
* Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to | * Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to | ||||
* efficiently check edge restrictions. | * efficiently check edge restrictions. | ||||
*/ | */ | ||||
final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); | final int nbBitsPerNodeType = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); | ||||
final int nbBitsPerNodeType = log2NbTypes; | |||||
LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); | LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); | ||||
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); | LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); | ||||
plSWHID2Node.start("filling swhid2node map"); | plSWHID2Node.start("Hashing SWHIDs to fill sort input"); | ||||
for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) { | for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) { | ||||
String swhidStr = swhidIterator.next().toString(); | String swhidStr = swhidIterator.next().toString(); | ||||
SWHID swhid = new SWHID(swhidStr); | SWHID swhid = new SWHID(swhidStr); | ||||
byte[] swhidBin = swhid.toBytes(); | |||||
long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII)); | long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII)); | ||||
long nodeId = BigArrays.get(bfsMap, mphId); | long nodeId = BigArrays.get(bfsMap, mphId); | ||||
swhidToNodeMap.write(swhidBin, 0, swhidBin.length); | |||||
swhidToNodeMap.writeLong(nodeId); | |||||
sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII)); | sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII)); | ||||
nodeTypesMap.set(nodeId, swhid.getType().ordinal()); | nodeTypesMap.set(nodeId, swhid.getType().ordinal()); | ||||
plSWHID2Node.lightUpdate(); | plSWHID2Node.lightUpdate(); | ||||
} | } | ||||
plSWHID2Node.done(); | plSWHID2Node.done(); | ||||
sort_stdin.close(); | sort_stdin.close(); | ||||
Show All 11 Lines | static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { | ||||
System.exit(2); | System.exit(2); | ||||
} | } | ||||
outputHandler.join(); | outputHandler.join(); | ||||
} catch (InterruptedException e) { | } catch (InterruptedException e) { | ||||
logger.error("processing of sort output failed with: " + e); | logger.error("processing of sort output failed with: " + e); | ||||
System.exit(2); | System.exit(2); | ||||
} | } | ||||
} | } | ||||
} | } | ||||
private static class SortOutputHandler extends Thread { | private static class SortOutputHandler extends Thread { | ||||
private Scanner input; | private final Scanner input; | ||||
private OutputStream output; | private final OutputStream output; | ||||
private ProgressLogger pl; | private final ProgressLogger pl; | ||||
SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) { | SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) { | ||||
this.input = new Scanner(input, StandardCharsets.US_ASCII); | this.input = new Scanner(input, StandardCharsets.US_ASCII); | ||||
this.output = output; | this.output = output; | ||||
this.pl = pl; | this.pl = pl; | ||||
} | } | ||||
public void run() { | public void run() { | ||||
boolean sortDone = false; | boolean sortDone = false; | ||||
logger.info("node2swhid: waiting for sort output..."); | logger.info("node2swhid: waiting for sort output..."); | ||||
while (input.hasNextLine()) { | while (input.hasNextLine()) { | ||||
if (!sortDone) { | if (!sortDone) { | ||||
sortDone = true; | sortDone = true; | ||||
this.pl.start("filling node2swhid map"); | this.pl.start("filling node2swhid map"); | ||||
} | } | ||||
String line = input.nextLine(); // format: SWHID <TAB> NODE_ID | String line = input.nextLine(); // format: SWHID <TAB> NODE_ID | ||||
SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID | SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID | ||||
try { | try { | ||||
output.write((byte[]) swhid.toBytes()); | output.write(swhid.toBytes()); | ||||
} catch (IOException e) { | } catch (IOException e) { | ||||
logger.error("writing to node->SWHID map failed with: " + e); | logger.error("writing to node->SWHID map failed with: " + e); | ||||
} | } | ||||
this.pl.lightUpdate(); | this.pl.lightUpdate(); | ||||
} | } | ||||
this.pl.done(); | this.pl.done(); | ||||
} | } | ||||
} | } | ||||
} | } |