Changeset View
Changeset View
Standalone View
Standalone View
api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
package org.softwareheritage.graph.backend; | package org.softwareheritage.graph.backend; | ||||
import java.io.BufferedWriter; | import java.io.BufferedWriter; | ||||
import java.io.FileInputStream; | import java.io.FileInputStream; | ||||
import java.io.FileWriter; | import java.io.FileWriter; | ||||
import java.io.IOException; | import java.io.IOException; | ||||
import java.io.InputStream; | import java.io.InputStream; | ||||
import java.io.InputStreamReader; | import java.io.InputStreamReader; | ||||
import java.io.Writer; | import java.io.Writer; | ||||
import java.util.zip.GZIPInputStream; | import java.util.zip.GZIPInputStream; | ||||
import it.unimi.dsi.bits.LongArrayBitVector; | |||||
import it.unimi.dsi.fastutil.Size64; | import it.unimi.dsi.fastutil.Size64; | ||||
import it.unimi.dsi.fastutil.io.BinIO; | import it.unimi.dsi.fastutil.io.BinIO; | ||||
import it.unimi.dsi.fastutil.longs.LongBigArrays; | import it.unimi.dsi.fastutil.longs.LongBigArrays; | ||||
import it.unimi.dsi.fastutil.longs.LongBigList; | |||||
import it.unimi.dsi.fastutil.objects.Object2LongFunction; | import it.unimi.dsi.fastutil.objects.Object2LongFunction; | ||||
import it.unimi.dsi.fastutil.objects.ObjectBigArrays; | import it.unimi.dsi.fastutil.objects.ObjectBigArrays; | ||||
import it.unimi.dsi.io.FastBufferedReader; | import it.unimi.dsi.io.FastBufferedReader; | ||||
import it.unimi.dsi.io.LineIterator; | import it.unimi.dsi.io.LineIterator; | ||||
import org.softwareheritage.graph.backend.NodeIdMap; | import org.softwareheritage.graph.SwhId; | ||||
import org.softwareheritage.graph.backend.NodeTypesMap; | |||||
public class Setup { | public class Setup { | ||||
public static void main(String[] args) throws IOException { | public static void main(String[] args) throws IOException { | ||||
if (args.length != 2) { | if (args.length != 2) { | ||||
System.err.println("Expected parameters: <nodes.csv.gz path> <compressed graph path>"); | System.err.println("Expected parameters: <nodes.csv.gz path> <compressed graph path>"); | ||||
System.exit(1); | System.exit(1); | ||||
} | } | ||||
Show All 34 Lines | static void precomputeNodeIdMap(String nodesPath, String graphPath) throws IOException { | ||||
LineIterator swhIdIterator = new LineIterator(buffer); | LineIterator swhIdIterator = new LineIterator(buffer); | ||||
try ( | try ( | ||||
Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + ".swhToNodeMap.csv")); | Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + ".swhToNodeMap.csv")); | ||||
Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + ".nodeToSwhMap.csv"))) { | Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + ".nodeToSwhMap.csv"))) { | ||||
// nodeToSwhMap needs to write SWH id in order of node id, so use a temporary array | // nodeToSwhMap needs to write SWH id in order of node id, so use a temporary array | ||||
Object[][] nodeToSwhId = ObjectBigArrays.newBigArray(nbIds); | Object[][] nodeToSwhId = ObjectBigArrays.newBigArray(nbIds); | ||||
// To effectively run edge restriction during graph traversals, we store node id (long) -> SWH | |||||
// type map. This is represented as a bitmap where each Node.Type uses 3 bits. | |||||
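zack: Can we make this the result of a log2 calculation on the size of the Node.Type enum? (unless it's too annoying to code, that is) It would be nicer, and less prone to bugs if we ever add or remove node types in the future. | |||||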
Done Inline Actions — haltode: Sure, it's a bit sad Java only has `log`, `log10`, `log1p` but no `log2` :( | |||||
Done Inline Actions — zack: Something like `Math.ceil(Math.log(x) / Math.log(2))` then? | |||||
final int nbBitsPerNodeType = 3; | |||||
LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); | |||||
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); | |||||
for (long iNode = 0; iNode < nbIds && swhIdIterator.hasNext(); iNode++) { | for (long iNode = 0; iNode < nbIds && swhIdIterator.hasNext(); iNode++) { | ||||
String swhId = swhIdIterator.next().toString(); | String strSwhId = swhIdIterator.next().toString(); | ||||
long mphId = mphMap.getLong(swhId); | long mphId = mphMap.getLong(strSwhId); | ||||
long nodeId = LongBigArrays.get(bfsMap, mphId); | long nodeId = LongBigArrays.get(bfsMap, mphId); | ||||
String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); | String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); | ||||
String line = swhId + " " + paddedNodeId + "\n"; | String line = strSwhId + " " + paddedNodeId + "\n"; | ||||
swhToNodeMap.write(line); | swhToNodeMap.write(line); | ||||
ObjectBigArrays.set(nodeToSwhId, nodeId, swhId); | ObjectBigArrays.set(nodeToSwhId, nodeId, strSwhId); | ||||
SwhId swhId = new SwhId(strSwhId); | |||||
nodeTypesMap.set(nodeId, swhId.getType().ordinal()); | |||||
} | } | ||||
Done Inline Actions — zack: This is another place where the constant proposed above should be used. | |||||
BinIO.storeObject(nodeTypesMap, graphPath + ".nodeTypesMap"); | |||||
for (long iNode = 0; iNode < nbIds; iNode++) { | for (long iNode = 0; iNode < nbIds; iNode++) { | ||||
String line = ObjectBigArrays.get(nodeToSwhId, iNode).toString() + "\n"; | String line = ObjectBigArrays.get(nodeToSwhId, iNode).toString() + "\n"; | ||||
nodeToSwhMap.write(line); | nodeToSwhMap.write(line); | ||||
} | } | ||||
} | } | ||||
} | } | ||||
} | } |
Can we make this the result of a log2 calculation on the size of the Node.Type enum? (Unless it's too annoying to code, that is.)
It would be nicer, and less prone to bugs if we ever add or remove node types in the future.