Differential D1842 Diff 6196 java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
Changeset View
Changeset View
Standalone View
Standalone View
java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
Show All 15 Lines | |||||
import it.unimi.dsi.fastutil.longs.LongBigList; | import it.unimi.dsi.fastutil.longs.LongBigList; | ||||
import it.unimi.dsi.fastutil.objects.Object2LongFunction; | import it.unimi.dsi.fastutil.objects.Object2LongFunction; | ||||
import it.unimi.dsi.fastutil.objects.ObjectBigArrays; | import it.unimi.dsi.fastutil.objects.ObjectBigArrays; | ||||
import it.unimi.dsi.io.FastBufferedReader; | import it.unimi.dsi.io.FastBufferedReader; | ||||
import it.unimi.dsi.io.LineIterator; | import it.unimi.dsi.io.LineIterator; | ||||
import org.softwareheritage.graph.Graph; | import org.softwareheritage.graph.Graph; | ||||
import org.softwareheritage.graph.Node; | import org.softwareheritage.graph.Node; | ||||
import org.softwareheritage.graph.SwhId; | import org.softwareheritage.graph.SwhPID; | ||||
import org.softwareheritage.graph.backend.NodeTypesMap; | import org.softwareheritage.graph.backend.NodeTypesMap; | ||||
/** | /** | ||||
* Pre-processing steps (such as dumping mapping files on disk) before running the graph service. | * Pre-processing steps (such as dumping mapping files on disk) before running the graph service. | ||||
* | * | ||||
* @author Thibault Allançon | * @author Thibault Allançon | ||||
* @version 0.0.1 | * @version 0.0.1 | ||||
* @since 0.0.1 | * @since 0.0.1 | ||||
Show All 26 Lines | public class Setup { | ||||
* Computes and dumps on disk mapping files. | * Computes and dumps on disk mapping files. | ||||
* | * | ||||
* @param nodesPath path of the compressed csv nodes file | * @param nodesPath path of the compressed csv nodes file | ||||
* @param graphPath path of the compressed graph | * @param graphPath path of the compressed graph | ||||
*/ | */ | ||||
// Suppress warning for Object2LongFunction cast | // Suppress warning for Object2LongFunction cast | ||||
@SuppressWarnings("unchecked") | @SuppressWarnings("unchecked") | ||||
static void precomputeNodeIdMap(String nodesPath, String graphPath) throws IOException { | static void precomputeNodeIdMap(String nodesPath, String graphPath) throws IOException { | ||||
// First internal mapping: SWH id (string) -> WebGraph MPH (long) | // First internal mapping: SWH PID (string) -> WebGraph MPH (long) | ||||
Object2LongFunction<String> mphMap = null; | Object2LongFunction<String> mphMap = null; | ||||
try { | try { | ||||
mphMap = (Object2LongFunction<String>) BinIO.loadObject(graphPath + ".mph"); | mphMap = (Object2LongFunction<String>) BinIO.loadObject(graphPath + ".mph"); | ||||
} catch (ClassNotFoundException e) { | } catch (ClassNotFoundException e) { | ||||
throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); | throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); | ||||
} | } | ||||
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); | long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); | ||||
// Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) | // Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) | ||||
long[][] bfsMap = LongBigArrays.newBigArray(nbIds); | long[][] bfsMap = LongBigArrays.newBigArray(nbIds); | ||||
long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); | long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); | ||||
if (loaded != nbIds) { | if (loaded != nbIds) { | ||||
throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); | throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); | ||||
} | } | ||||
// Dump complete mapping for all nodes: SWH id (string) <=> WebGraph node id (long) | // Dump complete mapping for all nodes: SWH PID (string) <=> WebGraph node id (long) | ||||
InputStream nodesStream = new GZIPInputStream(new FileInputStream(nodesPath)); | InputStream nodesStream = new GZIPInputStream(new FileInputStream(nodesPath)); | ||||
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(nodesStream, "UTF-8")); | FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(nodesStream, "UTF-8")); | ||||
LineIterator swhIdIterator = new LineIterator(buffer); | LineIterator swhPIDIterator = new LineIterator(buffer); | ||||
try (Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + Graph.PID_TO_NODE)); | try (Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + Graph.PID_TO_NODE)); | ||||
Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + Graph.NODE_TO_PID))) { | Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + Graph.NODE_TO_PID))) { | ||||
// nodeToSwhMap needs to write SWH id in order of node id, so use a temporary array | // nodeToSwhMap needs to write SWH PID in order of node id, so use a temporary array | ||||
Object[][] nodeToSwhId = ObjectBigArrays.newBigArray(nbIds); | Object[][] nodeToSwhPID = ObjectBigArrays.newBigArray(nbIds); | ||||
// To effectively run edge restriction during graph traversals, we store node id (long) -> SWH | // To effectively run edge restriction during graph traversals, we store node id (long) -> SWH | ||||
// type map. This is represented as a bitmap using minimum number of bits per Node.Type. | // type map. This is represented as a bitmap using minimum number of bits per Node.Type. | ||||
final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); | final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); | ||||
final int nbBitsPerNodeType = log2NbTypes; | final int nbBitsPerNodeType = log2NbTypes; | ||||
LongArrayBitVector nodeTypesBitVector = | LongArrayBitVector nodeTypesBitVector = | ||||
LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); | LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); | ||||
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); | LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); | ||||
for (long iNode = 0; iNode < nbIds && swhIdIterator.hasNext(); iNode++) { | for (long iNode = 0; iNode < nbIds && swhPIDIterator.hasNext(); iNode++) { | ||||
String strSwhId = swhIdIterator.next().toString(); | String strSwhPID = swhPIDIterator.next().toString(); | ||||
long mphId = mphMap.getLong(strSwhId); | long mphId = mphMap.getLong(strSwhPID); | ||||
long nodeId = LongBigArrays.get(bfsMap, mphId); | long nodeId = LongBigArrays.get(bfsMap, mphId); | ||||
String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); | String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); | ||||
String line = strSwhId + " " + paddedNodeId + "\n"; | String line = strSwhPID + " " + paddedNodeId + "\n"; | ||||
swhToNodeMap.write(line); | swhToNodeMap.write(line); | ||||
ObjectBigArrays.set(nodeToSwhId, nodeId, strSwhId); | ObjectBigArrays.set(nodeToSwhPID, nodeId, strSwhPID); | ||||
SwhId swhId = new SwhId(strSwhId); | SwhPID swhPID = new SwhPID(strSwhPID); | ||||
nodeTypesMap.set(nodeId, swhId.getType().ordinal()); | nodeTypesMap.set(nodeId, swhPID.getType().ordinal()); | ||||
} | } | ||||
BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); | BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); | ||||
for (long iNode = 0; iNode < nbIds; iNode++) { | for (long iNode = 0; iNode < nbIds; iNode++) { | ||||
String line = ObjectBigArrays.get(nodeToSwhId, iNode).toString() + "\n"; | String line = ObjectBigArrays.get(nodeToSwhPID, iNode).toString() + "\n"; | ||||
nodeToSwhMap.write(line); | nodeToSwhMap.write(line); | ||||
} | } | ||||
} | } | ||||
} | } | ||||
} | } |