Page MenuHomeSoftware Heritage

D1726.id5821.diff
No OneTemporary

D1726.id5821.diff

diff --git a/api/server/src/main/java/org/softwareheritage/graph/Graph.java b/api/server/src/main/java/org/softwareheritage/graph/Graph.java
--- a/api/server/src/main/java/org/softwareheritage/graph/Graph.java
+++ b/api/server/src/main/java/org/softwareheritage/graph/Graph.java
@@ -5,20 +5,28 @@
import it.unimi.dsi.big.webgraph.BVGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
+import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SwhId;
import org.softwareheritage.graph.backend.NodeIdMap;
+import org.softwareheritage.graph.backend.NodeTypesMap;
public class Graph {
+ public static final String PID_TO_NODE = ".pid2node.csv";
+ public static final String NODE_TO_PID = ".node2pid.csv";
+ public static final String NODE_TO_TYPE = ".node2type.map";
+
BVGraph graph;
BVGraph graphTransposed;
String path;
NodeIdMap nodeIdMap;
+ NodeTypesMap nodeTypesMap;
public Graph(String path) throws IOException {
this.graph = BVGraph.load(path);
this.graphTransposed = BVGraph.load(path + "-transposed");
this.path = path;
this.nodeIdMap = new NodeIdMap(path, getNbNodes());
+ this.nodeTypesMap = new NodeTypesMap(path);
}
public void cleanUp() throws IOException {
@@ -37,6 +45,10 @@
return nodeIdMap.getSwhId(nodeId);
}
+ public Node.Type getNodeType(long nodeId) {
+ return nodeTypesMap.getType(nodeId);
+ }
+
public long getNbNodes() {
return graph.numNodes();
}
diff --git a/api/server/src/main/java/org/softwareheritage/graph/Node.java b/api/server/src/main/java/org/softwareheritage/graph/Node.java
--- a/api/server/src/main/java/org/softwareheritage/graph/Node.java
+++ b/api/server/src/main/java/org/softwareheritage/graph/Node.java
@@ -12,6 +12,22 @@
REV,
SNP;
+ public static Node.Type fromInt(int intType) {
+ switch (intType) {
+ case 0:
+ return CNT;
+ case 1:
+ return DIR;
+ case 2:
+ return REL;
+ case 3:
+ return REV;
+ case 4:
+ return SNP;
+ }
+ return null;
+ }
+
public static Node.Type fromStr(String strType) {
return Node.Type.valueOf(strType.toUpperCase());
}
diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java
--- a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java
+++ b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java
@@ -2,6 +2,7 @@
import java.io.IOException;
+import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.SwhId;
import org.softwareheritage.graph.backend.MapFile;
@@ -21,12 +22,12 @@
// +1 are for spaces and end of lines
int swhToNodeLineLength = SWH_ID_LENGTH + 1 + NODE_ID_LENGTH + 1;
int nodeToSwhLineLength = SWH_ID_LENGTH + 1;
- this.swhToNodeMap = new MapFile(graphPath + ".swhToNodeMap.csv", swhToNodeLineLength);
- this.nodeToSwhMap = new MapFile(graphPath + ".nodeToSwhMap.csv", nodeToSwhLineLength);
+ this.swhToNodeMap = new MapFile(graphPath + Graph.PID_TO_NODE, swhToNodeLineLength);
+ this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_PID, nodeToSwhLineLength);
}
// SWH id (string) -> WebGraph node id (long)
- // Each line in .swhToNode.csv is formatted as: swhId nodeId
+ // Each line in PID_TO_NODE is formatted as: swhId nodeId
// The file is sorted by swhId, hence we can binary search on swhId to get corresponding nodeId
public long getNodeId(SwhId swhId) {
long start = 0;
@@ -56,7 +57,7 @@
}
// WebGraph node id (long) -> SWH id (string)
- // Each line in .nodeToSwh.csv is formatted as: swhId
+ // Each line in NODE_TO_PID is formatted as: swhId
// The file is ordered by nodeId, meaning node0's swhId is at line 0, hence we can read the
// nodeId-th line to get corresponding swhId
public SwhId getSwhId(long nodeId) {
diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java
new file mode 100644
--- /dev/null
+++ b/api/server/src/main/java/org/softwareheritage/graph/backend/NodeTypesMap.java
@@ -0,0 +1,26 @@
+package org.softwareheritage.graph.backend;
+
+import java.io.IOException;
+
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.fastutil.longs.LongBigList;
+
+import org.softwareheritage.graph.Graph;
+import org.softwareheritage.graph.Node;
+
+public class NodeTypesMap {
+ LongBigList nodeTypesMap;
+
+ public NodeTypesMap(String graphPath) throws IOException {
+ try {
+ nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + Graph.NODE_TO_TYPE);
+ } catch (ClassNotFoundException e) {
+ throw new IllegalArgumentException("Unknown class object: " + e);
+ }
+ }
+
+ public Node.Type getType(long nodeId) {
+ long type = nodeTypesMap.getLong(nodeId);
+ return Node.Type.fromInt((int) type);
+ }
+}
diff --git a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
--- a/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
+++ b/api/server/src/main/java/org/softwareheritage/graph/backend/Setup.java
@@ -9,15 +9,20 @@
import java.io.Writer;
import java.util.zip.GZIPInputStream;
+import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
+import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.ObjectBigArrays;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
-import org.softwareheritage.graph.backend.NodeIdMap;
+import org.softwareheritage.graph.Graph;
+import org.softwareheritage.graph.Node;
+import org.softwareheritage.graph.SwhId;
+import org.softwareheritage.graph.backend.NodeTypesMap;
public class Setup {
public static void main(String[] args) throws IOException {
@@ -63,23 +68,35 @@
LineIterator swhIdIterator = new LineIterator(buffer);
try (
- Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + ".swhToNodeMap.csv"));
- Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + ".nodeToSwhMap.csv"))) {
+ Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + Graph.PID_TO_NODE));
+ Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + Graph.NODE_TO_PID))) {
// nodeToSwhMap needs to write SWH id in order of node id, so use a temporary array
Object[][] nodeToSwhId = ObjectBigArrays.newBigArray(nbIds);
+ // To effectively run edge restriction during graph traversals, we store node id (long) -> SWH
+ // type map. This is represented as a bitmap using minimum number of bits per Node.Type.
+ final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2));
+ final int nbBitsPerNodeType = log2NbTypes;
+ LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds);
+ LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType);
+
for (long iNode = 0; iNode < nbIds && swhIdIterator.hasNext(); iNode++) {
- String swhId = swhIdIterator.next().toString();
- long mphId = mphMap.getLong(swhId);
+ String strSwhId = swhIdIterator.next().toString();
+ long mphId = mphMap.getLong(strSwhId);
long nodeId = LongBigArrays.get(bfsMap, mphId);
String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId);
- String line = swhId + " " + paddedNodeId + "\n";
+ String line = strSwhId + " " + paddedNodeId + "\n";
swhToNodeMap.write(line);
- ObjectBigArrays.set(nodeToSwhId, nodeId, swhId);
+ ObjectBigArrays.set(nodeToSwhId, nodeId, strSwhId);
+
+ SwhId swhId = new SwhId(strSwhId);
+ nodeTypesMap.set(nodeId, swhId.getType().ordinal());
}
+ BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE);
+
for (long iNode = 0; iNode < nbIds; iNode++) {
String line = ObjectBigArrays.get(nodeToSwhId, iNode).toString() + "\n";
nodeToSwhMap.write(line);
diff --git a/api/server/src/test/dataset/.gitignore b/api/server/src/test/dataset/.gitignore
--- a/api/server/src/test/dataset/.gitignore
+++ b/api/server/src/test/dataset/.gitignore
@@ -16,8 +16,9 @@
*.stats
# Generated node ids mapping
-*.nodeToSwhMap.csv
-*.swhToNodeMap.csv
+*.node2pid.csv
+*.pid2node.csv
+*.node2type.csv
# Logs
stdout

File Metadata

Mime Type
text/plain
Expires
Wed, Jul 2, 10:42 AM (2 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220491

Event Timeline