diff --git a/api/server/src/main/java/org/softwareheritage/graph/Graph.java b/api/server/src/main/java/org/softwareheritage/graph/Graph.java index d015f37..e265f67 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/Graph.java +++ b/api/server/src/main/java/org/softwareheritage/graph/Graph.java @@ -1,70 +1,82 @@ package org.softwareheritage.graph; import java.io.IOException; import it.unimi.dsi.big.webgraph.BVGraph; +import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SwhId; import org.softwareheritage.graph.backend.NodeIdMap; +import org.softwareheritage.graph.backend.NodeTypesMap; public class Graph { + public static final String PID_TO_NODE = ".pid2node.csv"; + public static final String NODE_TO_PID = ".node2pid.csv"; + public static final String NODE_TO_TYPE = ".node2type.map"; + BVGraph graph; BVGraph graphTransposed; String path; NodeIdMap nodeIdMap; + NodeTypesMap nodeTypesMap; public Graph(String path) throws IOException { this.graph = BVGraph.load(path); this.graphTransposed = BVGraph.load(path + "-transposed"); this.path = path; this.nodeIdMap = new NodeIdMap(path, getNbNodes()); + this.nodeTypesMap = new NodeTypesMap(path); } public void cleanUp() throws IOException { nodeIdMap.close(); } public String getPath() { return path; } public long getNodeId(SwhId swhId) { return nodeIdMap.getNodeId(swhId); } public SwhId getSwhId(long nodeId) { return nodeIdMap.getSwhId(nodeId); } + public Node.Type getNodeType(long nodeId) { + return nodeTypesMap.getType(nodeId); + } + public long getNbNodes() { return graph.numNodes(); } public long getNbEdges() { return graph.numArcs(); } public long[][] successors(long nodeId) { return graph.successorBigArray(nodeId); } public long outdegree(long nodeId) { return graph.outdegree(nodeId); } public long[][] predecessors(long nodeId) { return graphTransposed.successorBigArray(nodeId); } public long indegree(long nodeId) { return graphTransposed.outdegree(nodeId); } public long degree(long nodeId, boolean useTransposed) { return (useTransposed) ? indegree(nodeId) : outdegree(nodeId); } public long[][] neighbors(long nodeId, boolean useTransposed) { return (useTransposed) ? predecessors(nodeId) : successors(nodeId); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/Node.java b/api/server/src/main/java/org/softwareheritage/graph/Node.java index 9668331..4fe9823 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/Node.java +++ b/api/server/src/main/java/org/softwareheritage/graph/Node.java @@ -1,32 +1,48 @@ package org.softwareheritage.graph; import java.util.ArrayList; import java.util.Arrays; import java.util.List; public class Node { public enum Type { CNT, DIR, REL, REV, SNP; + public static Node.Type fromInt(int intType) { + switch (intType) { + case 0: + return CNT; + case 1: + return DIR; + case 2: + return REL; + case 3: + return REV; + case 4: + return SNP; + } + return null; + } + public static Node.Type fromStr(String strType) { return Node.Type.valueOf(strType.toUpperCase()); } public static ArrayList parse(String strType) { ArrayList types = new ArrayList<>(); if (strType.equals("*")) { List nodeTypes = Arrays.asList(Node.Type.values()); types.addAll(nodeTypes); } else { types.add(Node.Type.fromStr(strType)); } return types; } } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java index fcca1ee..b510281 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java @@ -1,75 +1,76 @@ package org.softwareheritage.graph.backend; import java.io.IOException; +import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SwhId; import org.softwareheritage.graph.backend.MapFile; public class NodeIdMap { public static final int SWH_ID_LENGTH = 50; public static final int NODE_ID_LENGTH = 20; String graphPath; long nbIds; MapFile swhToNodeMap; MapFile nodeToSwhMap; public NodeIdMap(String graphPath, long nbNodes) throws IOException { this.graphPath = graphPath; this.nbIds = nbNodes; // +1 are for spaces and end of lines int swhToNodeLineLength = SWH_ID_LENGTH + 1 + NODE_ID_LENGTH + 1; int nodeToSwhLineLength = SWH_ID_LENGTH + 1; - this.swhToNodeMap = new MapFile(graphPath + ".swhToNodeMap.csv", swhToNodeLineLength); - this.nodeToSwhMap = new MapFile(graphPath + ".nodeToSwhMap.csv", nodeToSwhLineLength); + this.swhToNodeMap = new MapFile(graphPath + Graph.PID_TO_NODE, swhToNodeLineLength); + this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_PID, nodeToSwhLineLength); } // SWH id (string) -> WebGraph node id (long) - // Each line in .swhToNode.csv is formatted as: swhId nodeId + // Each line in PID_TO_NODE is formatted as: swhId nodeId // The file is sorted by swhId, hence we can binary search on swhId to get corresponding nodeId public long getNodeId(SwhId swhId) { long start = 0; long end = nbIds - 1; while (start <= end) { long lineNumber = (start + end) / 2L; String[] parts = swhToNodeMap.readAtLine(lineNumber).split(" "); if (parts.length != 2) { break; } String currentSwhId = parts[0]; long currentNodeId = Long.parseLong(parts[1]); int cmp = currentSwhId.compareTo(swhId.toString()); if (cmp == 0) { return currentNodeId; } else if (cmp < 0) { start = lineNumber + 1; } else { end = lineNumber - 1; } } throw new IllegalArgumentException("Unknown SWH id: " + swhId); } // WebGraph node id (long) -> SWH id (string) - // Each line in .nodeToSwh.csv is formatted as: swhId + // Each line in NODE_TO_PID is formatted as: swhId // The file is ordered by nodeId, meaning node0's swhId is at line 0, hence we can read the // nodeId-th line to get corresponding swhId public SwhId getSwhId(long nodeId) { if (nodeId < 0 || nodeId >= nbIds) { throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); } String swhId = nodeToSwhMap.readAtLine(nodeId); return new SwhId(swhId); } public void close() throws IOException { swhToNodeMap.close(); nodeToSwhMap.close(); } } diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java new file mode 100644 index 0000000..143fa65 --- /dev/null +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java @@ -0,0 +1,26 @@ +package org.softwareheritage.graph.backend; + +import java.io.IOException; + +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongBigList; + +import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.Node; + +public class NodeTypesMap { + LongBigList nodeTypesMap; + + public NodeTypesMap(String graphPath) throws IOException { + try { + nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + Graph.NODE_TO_TYPE); + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException("Unknown class object: " + e); + } + } + + public Node.Type getType(long nodeId) { + long type = nodeTypesMap.getLong(nodeId); + return Node.Type.fromInt((int) type); + } +} diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java index aafb549..dfcc30f 100644 --- a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java +++ b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java @@ -1,89 +1,106 @@ package org.softwareheritage.graph.backend; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Writer; import java.util.zip.GZIPInputStream; +import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectBigArrays; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; -import org.softwareheritage.graph.backend.NodeIdMap; +import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.SwhId; +import org.softwareheritage.graph.backend.NodeTypesMap; public class Setup { public static void main(String[] args) throws IOException { if (args.length != 2) { System.err.println("Expected parameters: "); System.exit(1); } String nodesPath = args[0]; String graphPath = args[1]; System.out.println("Pre-computing node id maps..."); long startTime = System.nanoTime(); precomputeNodeIdMap(nodesPath, graphPath); long endTime = System.nanoTime(); double duration = (double) (endTime - startTime) / 1_000_000_000; System.out.println("Done in: " + duration + " seconds"); } // Suppress warning for Object2LongFunction cast @SuppressWarnings("unchecked") static void precomputeNodeIdMap(String nodesPath, String graphPath) throws IOException { // First internal mapping: SWH id (string) -> WebGraph MPH (long) Object2LongFunction mphMap = null; try { mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); } long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); // Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); if (loaded != nbIds) { throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); } // Dump complete mapping for all nodes: SWH id (string) <=> WebGraph node id (long) InputStream nodesStream = new GZIPInputStream(new FileInputStream(nodesPath)); FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(nodesStream, "UTF-8")); LineIterator swhIdIterator = new LineIterator(buffer); try ( - Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + ".swhToNodeMap.csv")); - Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + ".nodeToSwhMap.csv"))) { + Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + Graph.PID_TO_NODE)); + Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + Graph.NODE_TO_PID))) { // nodeToSwhMap needs to write SWH id in order of node id, so use a temporary array Object[][] nodeToSwhId = ObjectBigArrays.newBigArray(nbIds); + // To effectively run edge restriction during graph traversals, we store node id (long) -> SWH + // type map. This is represented as a bitmap using minimum number of bits per Node.Type. + final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); + final int nbBitsPerNodeType = log2NbTypes; + LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); + LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); + for (long iNode = 0; iNode < nbIds && swhIdIterator.hasNext(); iNode++) { - String swhId = swhIdIterator.next().toString(); - long mphId = mphMap.getLong(swhId); + String strSwhId = swhIdIterator.next().toString(); + long mphId = mphMap.getLong(strSwhId); long nodeId = LongBigArrays.get(bfsMap, mphId); String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); - String line = swhId + " " + paddedNodeId + "\n"; + String line = strSwhId + " " + paddedNodeId + "\n"; swhToNodeMap.write(line); - ObjectBigArrays.set(nodeToSwhId, nodeId, swhId); + ObjectBigArrays.set(nodeToSwhId, nodeId, strSwhId); + + SwhId swhId = new SwhId(strSwhId); + nodeTypesMap.set(nodeId, swhId.getType().ordinal()); } + BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); + for (long iNode = 0; iNode < nbIds; iNode++) { String line = ObjectBigArrays.get(nodeToSwhId, iNode).toString() + "\n"; nodeToSwhMap.write(line); } } } }