Differential D4006 Diff 14145 java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
Changeset View
Changeset View
Standalone View
Standalone View
java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
package org.softwareheritage.graph.maps; | package org.softwareheritage.graph.maps; | ||||
import com.martiansoftware.jsap.*; | import com.martiansoftware.jsap.*; | ||||
import it.unimi.dsi.big.webgraph.LazyLongIterator; | import it.unimi.dsi.big.webgraph.LazyLongIterator; | ||||
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; | import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; | ||||
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; | import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; | ||||
import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntLabel; | |||||
import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel; | |||||
import it.unimi.dsi.fastutil.BigArrays; | import it.unimi.dsi.fastutil.BigArrays; | ||||
import it.unimi.dsi.fastutil.Size64; | import it.unimi.dsi.fastutil.Size64; | ||||
import it.unimi.dsi.fastutil.io.BinIO; | import it.unimi.dsi.fastutil.io.BinIO; | ||||
import it.unimi.dsi.fastutil.longs.LongBigArrays; | import it.unimi.dsi.fastutil.longs.LongBigArrays; | ||||
import it.unimi.dsi.fastutil.objects.Object2LongFunction; | import it.unimi.dsi.fastutil.objects.Object2LongFunction; | ||||
import it.unimi.dsi.io.FastBufferedReader; | import it.unimi.dsi.io.FastBufferedReader; | ||||
import it.unimi.dsi.io.LineIterator; | import it.unimi.dsi.io.LineIterator; | ||||
import it.unimi.dsi.io.OutputBitStream; | import it.unimi.dsi.io.OutputBitStream; | ||||
import it.unimi.dsi.logging.ProgressLogger; | import it.unimi.dsi.logging.ProgressLogger; | ||||
import it.unimi.dsi.big.webgraph.BVGraph; | import it.unimi.dsi.big.webgraph.BVGraph; | ||||
import it.unimi.dsi.big.webgraph.ImmutableGraph; | import it.unimi.dsi.big.webgraph.ImmutableGraph; | ||||
import it.unimi.dsi.big.webgraph.NodeIterator; | import it.unimi.dsi.big.webgraph.NodeIterator; | ||||
import org.slf4j.Logger; | import org.slf4j.Logger; | ||||
import org.slf4j.LoggerFactory; | import org.slf4j.LoggerFactory; | ||||
import org.softwareheritage.graph.SwhLabel; | |||||
import org.softwareheritage.graph.SwhPerm; | |||||
import org.softwareheritage.graph.webgraph.FixedWidthSwhLabelList; | |||||
import java.io.*; | import java.io.*; | ||||
import java.nio.charset.StandardCharsets; | import java.nio.charset.StandardCharsets; | ||||
import java.util.*; | import java.util.*; | ||||
import java.util.concurrent.TimeUnit; | import java.util.concurrent.TimeUnit; | ||||
public class LabelMapBuilder { | public class LabelMapBuilder { | ||||
▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines | public class LabelMapBuilder { | ||||
{ | { | ||||
// Compute intermediate representation in the format "<src node id> <dst node id> <label ids>\n" | // Compute intermediate representation in the format "<src node id> <dst node id> <label ids>\n" | ||||
ImmutableGraph graph = BVGraph.loadMapped(graphPath); | ImmutableGraph graph = BVGraph.loadMapped(graphPath); | ||||
Object2LongFunction<String> swhIdMph = loadMPH(graphPath); | Object2LongFunction<String> swhIdMph = loadMPH(graphPath); | ||||
long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); | long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); | ||||
BinIO.loadLongs(graphPath + ".order", orderMap); | BinIO.loadLongs(graphPath + ".order", orderMap); | ||||
Object2LongFunction<String> labelMPH = loadMPH(graphPath + "-labels"); | // TODO: change the path to be explicit it is only filenames | ||||
long numLabels = getMPHSize(labelMPH); | Object2LongFunction<String> filenameMph = loadMPH(graphPath + "-labels"); | ||||
int labelWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); | long numFilenames = getMPHSize(filenameMph); | ||||
if (labelWidth > 30) { | int filenameIdWidth = (int) Math.ceil(Math.log(numFilenames) / Math.log(2)); | ||||
logger.error("FIXME: Too many labels, we can't handle more than 2^30 for now."); | if (filenameIdWidth > 30) { | ||||
seirl: This is no longer the case. | |||||
logger.error("FIXME: Too many filenames, we can't handle more than 2^30 for now."); | |||||
System.exit(2); | System.exit(2); | ||||
} | } | ||||
int totalLabelWidth = filenameIdWidth + SwhPerm.TOTAL_NB_BITS; | |||||
seirlAuthorUnsubmitted Done Inline ActionsThis will no longer be needed if you do what I suggested below. seirl: This will no longer be needed if you do what I suggested below. | |||||
Done Inline ActionsMake that a static DirEntry.labelWidth(long numLabels) function (including the log2() call). seirl: Make that a static `DirEntry.labelWidth(long numLabels)` function (including the `log2()` call). | |||||
ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
plInter.itemsName = "edges"; | plInter.itemsName = "edges"; | ||||
plInter.expectedUpdates = graph.numArcs(); | plInter.expectedUpdates = graph.numArcs(); | ||||
// Pass the intermediate representation to sort(1) so that we see the labels in the | // Pass the intermediate representation to sort(1) so that we see the labels in the | ||||
// order they will appear in the label file. | // order they will appear in the label file. | ||||
ProcessBuilder processBuilder = new ProcessBuilder(); | ProcessBuilder processBuilder = new ProcessBuilder(); | ||||
processBuilder.command( | processBuilder.command( | ||||
"sort", | "sort", | ||||
"-k1,1n", "-k2,2n", "-k3,3n", // Numerical sort on all fields | "-k1,1n", "-k2,2n", "-k3,3n", "-k4,4n", // Numerical sort on all fields | ||||
seirlAuthorUnsubmitted Done Inline ActionsI don't think we want to sort either the 3rd or the 4th field actually, so I would remove them both. seirl: I don't think we want to sort either the 3rd or the 4th field actually, so I would remove them… | |||||
"--numeric-sort", | "--numeric-sort", | ||||
"--buffer-size", SORT_BUFFER_SIZE, | "--buffer-size", SORT_BUFFER_SIZE, | ||||
"--temporary-directory", tmpDir | "--temporary-directory", tmpDir | ||||
); | ); | ||||
Process sort = processBuilder.start(); | Process sort = processBuilder.start(); | ||||
BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | ||||
BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | ||||
FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader( | FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader( | ||||
System.in, StandardCharsets.US_ASCII)); | System.in, StandardCharsets.US_ASCII)); | ||||
LineIterator edgeIterator = new LineIterator(buffer); | LineIterator edgeIterator = new LineIterator(buffer); | ||||
plInter.start("Piping intermediate representation to sort(1)"); | plInter.start("Piping intermediate representation to sort(1)"); | ||||
while (edgeIterator.hasNext()) { | while (edgeIterator.hasNext()) { | ||||
String[] edge = edgeIterator.next().toString().split(" "); | String[] edge = edgeIterator.next().toString().split(" "); | ||||
if (edge.length < 3) | if (edge.length < 4) | ||||
continue; | continue; | ||||
long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); | long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); | ||||
long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); | long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); | ||||
long labelId = labelMPH.getLong(edge[2]); | long filenameId = filenameMph.getLong(edge[2]); | ||||
int permission = Integer.parseInt(edge[3]); | |||||
sort_stdin.write((srcNode + "\t" + dstNode + "\t" + labelId + "\n") | sort_stdin.write((srcNode + "\t" + dstNode + "\t" + filenameId + "\t" + permission + "\n") | ||||
.getBytes(StandardCharsets.US_ASCII)); | .getBytes(StandardCharsets.US_ASCII)); | ||||
plInter.lightUpdate(); | plInter.lightUpdate(); | ||||
} | } | ||||
plInter.done(); | plInter.done(); | ||||
sort_stdin.close(); | sort_stdin.close(); | ||||
// Get the sorted output and write the labels and label offsets | // Get the sorted output and write the labels and label offsets | ||||
ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
Show All 13 Lines | static void computeLabelMap(String graphPath, String debugPath, String tmpDir) | ||||
+ BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION)); | + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION)); | ||||
offsets.writeGamma(0); | offsets.writeGamma(0); | ||||
Scanner sortOutput = new Scanner(sort_stdout, StandardCharsets.US_ASCII); | Scanner sortOutput = new Scanner(sort_stdout, StandardCharsets.US_ASCII); | ||||
NodeIterator it = graph.nodeIterator(); | NodeIterator it = graph.nodeIterator(); | ||||
long labelSrcNode = -1; | long labelSrcNode = -1; | ||||
long labelDstNode = -1; | long labelDstNode = -1; | ||||
long labelId = -1; | long labelFilenameId = -1; | ||||
int labelPermission = -1; | |||||
while (it.hasNext()) { | while (it.hasNext()) { | ||||
long srcNode = it.nextLong(); | long srcNode = it.nextLong(); | ||||
// Fill a hashmap with the labels of each edge starting from this node | // Fill a hashmap with the labels of each edge starting from this node | ||||
HashMap<Long, List<Long>> successorsLabels = new HashMap<>(); | HashMap<Long, List<SwhLabel>> successorsLabels = new HashMap<>(); | ||||
while (labelSrcNode <= srcNode) { | while (labelSrcNode <= srcNode) { | ||||
if (labelSrcNode == srcNode) { | if (labelSrcNode == srcNode) { | ||||
successorsLabels | successorsLabels | ||||
.computeIfAbsent( | .computeIfAbsent( | ||||
labelDstNode, | labelDstNode, | ||||
k -> new ArrayList<>() | k -> new ArrayList<>() | ||||
).add(labelId); | ).add(new SwhLabel(labelFilenameId, labelPermission)); | ||||
if (debugFile != null) { | if (debugFile != null) { | ||||
debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelId + "\n"); | debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelFilenameId + " " + labelPermission + "\n"); | ||||
} | } | ||||
} | } | ||||
if (!sortOutput.hasNext()) | if (!sortOutput.hasNext()) | ||||
break; | break; | ||||
String line = sortOutput.nextLine(); | String line = sortOutput.nextLine(); | ||||
String[] parts = line.split("\\t"); | String[] parts = line.split("\\t"); | ||||
labelSrcNode = Long.parseLong(parts[0]); | labelSrcNode = Long.parseLong(parts[0]); | ||||
labelDstNode = Long.parseLong(parts[1]); | labelDstNode = Long.parseLong(parts[1]); | ||||
labelId = Long.parseLong(parts[2]); | labelFilenameId = Long.parseLong(parts[2]); | ||||
SwhPerm.Type permissionType = SwhPerm.Type.fromOct(Integer.parseInt(parts[3])); | |||||
labelPermission = SwhPerm.Type.toInt(permissionType); | |||||
} | } | ||||
int bits = 0; | int bits = 0; | ||||
LazyLongIterator s = it.successors(); | LazyLongIterator s = it.successors(); | ||||
long dstNode; | long dstNode; | ||||
while ((dstNode = s.nextLong()) >= 0) { | while ((dstNode = s.nextLong()) >= 0) { | ||||
List<Long> edgeLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); | List<SwhLabel> currentLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); | ||||
bits += labels.writeGamma(edgeLabels.size()); | bits += labels.writeGamma(currentLabels.size()); | ||||
for (Long label : edgeLabels) { | for (SwhLabel label : currentLabels) { | ||||
bits += labels.writeLong(label, labelWidth); | long labelEncoded = SwhLabel.toEncoded(label); | ||||
bits += labels.writeLong(labelEncoded, totalLabelWidth); | |||||
seirlAuthorUnsubmitted Done Inline ActionsCan't we replace all of that by the toBitStream() method in SwhLabel? seirl: Can't we replace all of that by the `toBitStream()` method in SwhLabel? | |||||
} | } | ||||
} | } | ||||
offsets.writeGamma(bits); | offsets.writeGamma(bits); | ||||
plLabels.lightUpdate(); | plLabels.lightUpdate(); | ||||
} | } | ||||
labels.close(); | labels.close(); | ||||
offsets.close(); | offsets.close(); | ||||
plLabels.done(); | plLabels.done(); | ||||
if (debugFile != null) { | if (debugFile != null) { | ||||
debugFile.close(); | debugFile.close(); | ||||
} | } | ||||
PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties" )); | PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties" )); | ||||
pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); | pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); | ||||
pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthIntListLabel.class.getName() + "(TEST," + labelWidth + ")" ); | pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthSwhLabelList.class.getName() + "(SwhLabel," + totalLabelWidth + ")" ); | ||||
pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); | pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); | ||||
pw.close(); | pw.close(); | ||||
} | } | ||||
} | } |
This is no longer the case.