Differential D4006 Diff 19063 java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
Changeset View
Changeset View
Standalone View
Standalone View
java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
package org.softwareheritage.graph.maps; | package org.softwareheritage.graph.maps; | ||||
import com.martiansoftware.jsap.*; | import com.martiansoftware.jsap.*; | ||||
import it.unimi.dsi.big.webgraph.LazyLongIterator; | import it.unimi.dsi.big.webgraph.LazyLongIterator; | ||||
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; | import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; | ||||
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; | import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; | ||||
import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel; | |||||
import it.unimi.dsi.fastutil.BigArrays; | import it.unimi.dsi.fastutil.BigArrays; | ||||
import it.unimi.dsi.fastutil.Size64; | import it.unimi.dsi.fastutil.Size64; | ||||
import it.unimi.dsi.fastutil.io.BinIO; | import it.unimi.dsi.fastutil.io.BinIO; | ||||
import it.unimi.dsi.fastutil.longs.LongBigArrays; | import it.unimi.dsi.fastutil.longs.LongBigArrays; | ||||
import it.unimi.dsi.fastutil.objects.Object2LongFunction; | import it.unimi.dsi.fastutil.objects.Object2LongFunction; | ||||
import it.unimi.dsi.io.FastBufferedReader; | import it.unimi.dsi.io.FastBufferedReader; | ||||
import it.unimi.dsi.io.LineIterator; | import it.unimi.dsi.io.LineIterator; | ||||
import it.unimi.dsi.io.OutputBitStream; | import it.unimi.dsi.io.OutputBitStream; | ||||
import it.unimi.dsi.logging.ProgressLogger; | import it.unimi.dsi.logging.ProgressLogger; | ||||
import it.unimi.dsi.big.webgraph.BVGraph; | import it.unimi.dsi.big.webgraph.BVGraph; | ||||
import it.unimi.dsi.big.webgraph.ImmutableGraph; | import it.unimi.dsi.big.webgraph.ImmutableGraph; | ||||
import it.unimi.dsi.big.webgraph.NodeIterator; | import it.unimi.dsi.big.webgraph.NodeIterator; | ||||
import org.slf4j.Logger; | import org.slf4j.Logger; | ||||
import org.slf4j.LoggerFactory; | import org.slf4j.LoggerFactory; | ||||
import org.softwareheritage.graph.labels.DirEntry; | |||||
import org.softwareheritage.graph.labels.SwhLabel; | |||||
import java.io.*; | import java.io.*; | ||||
import java.nio.charset.StandardCharsets; | import java.nio.charset.StandardCharsets; | ||||
import java.util.*; | import java.util.*; | ||||
import java.util.concurrent.TimeUnit; | import java.util.concurrent.TimeUnit; | ||||
public class LabelMapBuilder { | public class LabelMapBuilder { | ||||
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines | public class LabelMapBuilder { | ||||
static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException { | static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException { | ||||
// Compute intermediate representation as: "<src node id> <dst node id> <label ids>\n" | // Compute intermediate representation as: "<src node id> <dst node id> <label ids>\n" | ||||
ImmutableGraph graph = BVGraph.loadMapped(graphPath); | ImmutableGraph graph = BVGraph.loadMapped(graphPath); | ||||
Object2LongFunction<String> swhIdMph = loadMPH(graphPath); | Object2LongFunction<String> swhIdMph = loadMPH(graphPath); | ||||
long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); | long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); | ||||
BinIO.loadLongs(graphPath + ".order", orderMap); | BinIO.loadLongs(graphPath + ".order", orderMap); | ||||
Object2LongFunction<String> labelMPH = loadMPH(graphPath + "-labels"); | Object2LongFunction<String> filenameMph = loadMPH(graphPath + "-filename-labels"); | ||||
long numLabels = getMPHSize(labelMPH); | long numFilenames = getMPHSize(filenameMph); | ||||
int labelWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); | |||||
seirl: This is no longer the case. | |||||
if (labelWidth > 30) { | int totalLabelWidth = DirEntry.labelWidth(numFilenames); | ||||
Done Inline ActionsMake that a static DirEntry.labelWidth(long numLabels) function (including the log2() call). seirl: Make that a static `DirEntry.labelWidth(long numLabels)` function (including the `log2()` call). | |||||
logger.error("FIXME: Too many labels, we can't handle more than 2^30 for now."); | |||||
System.exit(2); | |||||
} | |||||
Done Inline ActionsThis will no longer be needed if you do what I suggested below. seirl: This will no longer be needed if you do what I suggested below. | |||||
ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
plInter.itemsName = "edges"; | plInter.itemsName = "edges"; | ||||
plInter.expectedUpdates = graph.numArcs(); | plInter.expectedUpdates = graph.numArcs(); | ||||
/* | /* | ||||
* Pass the intermediate representation to sort(1) so that we see the labels in the order they will | * Pass the intermediate representation to sort(1) so that we see the labels in the order they will | ||||
* appear in the label file. | * appear in the label file. | ||||
*/ | */ | ||||
ProcessBuilder processBuilder = new ProcessBuilder(); | ProcessBuilder processBuilder = new ProcessBuilder(); | ||||
processBuilder.command("sort", "-k1,1n", "-k2,2n", "-k3,3n", // Numerical sort on all fields | processBuilder.command( | ||||
Done Inline ActionsI don't think we want to sort either the 3rd or the 4th field actually, so I would remove them both. seirl: I don't think we want to sort either the 3rd or the 4th field actually, so I would remove them… | |||||
"--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); | "sort", | ||||
"-k1,1n", "-k2,2n", // Numerical sort | |||||
"--numeric-sort", | |||||
"--buffer-size", SORT_BUFFER_SIZE, | |||||
"--temporary-directory", tmpDir | |||||
); | |||||
Process sort = processBuilder.start(); | Process sort = processBuilder.start(); | ||||
BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | ||||
BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | ||||
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); | FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); | ||||
LineIterator edgeIterator = new LineIterator(buffer); | LineIterator edgeIterator = new LineIterator(buffer); | ||||
plInter.start("Piping intermediate representation to sort(1)"); | plInter.start("Piping intermediate representation to sort(1)"); | ||||
while (edgeIterator.hasNext()) { | while (edgeIterator.hasNext()) { | ||||
String[] edge = edgeIterator.next().toString().split(" "); | String[] edge = edgeIterator.next().toString().split(" "); | ||||
if (edge.length < 3) | if (edge.length < 4) | ||||
continue; | continue; | ||||
long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); | long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); | ||||
long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); | long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); | ||||
long labelId = labelMPH.getLong(edge[2]); | long filenameId = filenameMph.getLong(edge[2]); | ||||
int permission = Integer.parseInt(edge[3]); | |||||
sort_stdin.write((srcNode + "\t" + dstNode + "\t" + labelId + "\n").getBytes(StandardCharsets.US_ASCII)); | sort_stdin.write((srcNode + "\t" + dstNode + "\t" + filenameId + "\t" + permission + "\n") | ||||
.getBytes(StandardCharsets.US_ASCII)); | |||||
plInter.lightUpdate(); | plInter.lightUpdate(); | ||||
} | } | ||||
plInter.done(); | plInter.done(); | ||||
sort_stdin.close(); | sort_stdin.close(); | ||||
// Get the sorted output and write the labels and label offsets | // Get the sorted output and write the labels and label offsets | ||||
ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
plLabels.itemsName = "nodes"; | plLabels.itemsName = "nodes"; | ||||
Show All 10 Lines | static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException { | ||||
new File(graphPath + "-labelled" + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION)); | new File(graphPath + "-labelled" + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION)); | ||||
offsets.writeGamma(0); | offsets.writeGamma(0); | ||||
Scanner sortOutput = new Scanner(sort_stdout, StandardCharsets.US_ASCII); | Scanner sortOutput = new Scanner(sort_stdout, StandardCharsets.US_ASCII); | ||||
NodeIterator it = graph.nodeIterator(); | NodeIterator it = graph.nodeIterator(); | ||||
long labelSrcNode = -1; | long labelSrcNode = -1; | ||||
long labelDstNode = -1; | long labelDstNode = -1; | ||||
long labelId = -1; | long labelFilenameId = -1; | ||||
int labelPermission = -1; | |||||
while (it.hasNext()) { | while (it.hasNext()) { | ||||
long srcNode = it.nextLong(); | long srcNode = it.nextLong(); | ||||
// Fill a hashmap with the labels of each edge starting from this node | // Fill a hashmap with the labels of each edge starting from this node | ||||
HashMap<Long, List<Long>> successorsLabels = new HashMap<>(); | HashMap<Long, List<DirEntry>> successorsLabels = new HashMap<>(); | ||||
while (labelSrcNode <= srcNode) { | while (labelSrcNode <= srcNode) { | ||||
if (labelSrcNode == srcNode) { | if (labelSrcNode == srcNode) { | ||||
successorsLabels.computeIfAbsent(labelDstNode, k -> new ArrayList<>()).add(labelId); | successorsLabels | ||||
.computeIfAbsent( | |||||
labelDstNode, | |||||
k -> new ArrayList<>() | |||||
).add(new DirEntry(labelFilenameId, labelPermission)); | |||||
if (debugFile != null) { | if (debugFile != null) { | ||||
debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelId + "\n"); | debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelFilenameId + " " + labelPermission + "\n"); | ||||
} | } | ||||
} | } | ||||
if (!sortOutput.hasNext()) | if (!sortOutput.hasNext()) | ||||
break; | break; | ||||
String line = sortOutput.nextLine(); | String line = sortOutput.nextLine(); | ||||
String[] parts = line.split("\\t"); | String[] parts = line.split("\\t"); | ||||
labelSrcNode = Long.parseLong(parts[0]); | labelSrcNode = Long.parseLong(parts[0]); | ||||
labelDstNode = Long.parseLong(parts[1]); | labelDstNode = Long.parseLong(parts[1]); | ||||
labelId = Long.parseLong(parts[2]); | labelFilenameId = Long.parseLong(parts[2]); | ||||
labelPermission = Integer.parseInt(parts[3]); | |||||
} | } | ||||
int bits = 0; | int bits = 0; | ||||
LazyLongIterator s = it.successors(); | LazyLongIterator s = it.successors(); | ||||
long dstNode; | long dstNode; | ||||
while ((dstNode = s.nextLong()) >= 0) { | while ((dstNode = s.nextLong()) >= 0) { | ||||
List<Long> edgeLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); | List<DirEntry> currentLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); | ||||
bits += labels.writeGamma(edgeLabels.size()); | SwhLabel l = new SwhLabel("edgelabel", totalLabelWidth, currentLabels.toArray(new DirEntry[0])); | ||||
for (Long label : edgeLabels) { | bits += l.toBitStream(labels, -1); | ||||
Done Inline ActionsCan't we replace all of that by the toBitStream() method in SwhLabel? seirl: Can't we replace all of that by the `toBitStream()` method in SwhLabel? | |||||
bits += labels.writeLong(label, labelWidth); | |||||
} | |||||
} | } | ||||
offsets.writeGamma(bits); | offsets.writeGamma(bits); | ||||
plLabels.lightUpdate(); | plLabels.lightUpdate(); | ||||
} | } | ||||
labels.close(); | labels.close(); | ||||
offsets.close(); | offsets.close(); | ||||
plLabels.done(); | plLabels.done(); | ||||
if (debugFile != null) { | if (debugFile != null) { | ||||
debugFile.close(); | debugFile.close(); | ||||
} | } | ||||
PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties")); | PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties")); | ||||
pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); | pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); | ||||
pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " | pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + SwhLabel.class.getName() + "(DirEntry," + totalLabelWidth + ")" ); | ||||
+ FixedWidthIntListLabel.class.getName() + "(TEST," + labelWidth + ")"); | |||||
pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); | pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); | ||||
pw.close(); | pw.close(); | ||||
} | } | ||||
} | } |
This is no longer the case.