Differential D4006 Diff 19063 java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
Changeset View
Changeset View
Standalone View
Standalone View
java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
| package org.softwareheritage.graph.maps; | package org.softwareheritage.graph.maps; | ||||
| import com.martiansoftware.jsap.*; | import com.martiansoftware.jsap.*; | ||||
| import it.unimi.dsi.big.webgraph.LazyLongIterator; | import it.unimi.dsi.big.webgraph.LazyLongIterator; | ||||
| import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; | import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; | ||||
| import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; | import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; | ||||
| import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel; | |||||
| import it.unimi.dsi.fastutil.BigArrays; | import it.unimi.dsi.fastutil.BigArrays; | ||||
| import it.unimi.dsi.fastutil.Size64; | import it.unimi.dsi.fastutil.Size64; | ||||
| import it.unimi.dsi.fastutil.io.BinIO; | import it.unimi.dsi.fastutil.io.BinIO; | ||||
| import it.unimi.dsi.fastutil.longs.LongBigArrays; | import it.unimi.dsi.fastutil.longs.LongBigArrays; | ||||
| import it.unimi.dsi.fastutil.objects.Object2LongFunction; | import it.unimi.dsi.fastutil.objects.Object2LongFunction; | ||||
| import it.unimi.dsi.io.FastBufferedReader; | import it.unimi.dsi.io.FastBufferedReader; | ||||
| import it.unimi.dsi.io.LineIterator; | import it.unimi.dsi.io.LineIterator; | ||||
| import it.unimi.dsi.io.OutputBitStream; | import it.unimi.dsi.io.OutputBitStream; | ||||
| import it.unimi.dsi.logging.ProgressLogger; | import it.unimi.dsi.logging.ProgressLogger; | ||||
| import it.unimi.dsi.big.webgraph.BVGraph; | import it.unimi.dsi.big.webgraph.BVGraph; | ||||
| import it.unimi.dsi.big.webgraph.ImmutableGraph; | import it.unimi.dsi.big.webgraph.ImmutableGraph; | ||||
| import it.unimi.dsi.big.webgraph.NodeIterator; | import it.unimi.dsi.big.webgraph.NodeIterator; | ||||
| import org.slf4j.Logger; | import org.slf4j.Logger; | ||||
| import org.slf4j.LoggerFactory; | import org.slf4j.LoggerFactory; | ||||
| import org.softwareheritage.graph.labels.DirEntry; | |||||
| import org.softwareheritage.graph.labels.SwhLabel; | |||||
| import java.io.*; | import java.io.*; | ||||
| import java.nio.charset.StandardCharsets; | import java.nio.charset.StandardCharsets; | ||||
| import java.util.*; | import java.util.*; | ||||
| import java.util.concurrent.TimeUnit; | import java.util.concurrent.TimeUnit; | ||||
| public class LabelMapBuilder { | public class LabelMapBuilder { | ||||
| ▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines | public class LabelMapBuilder { | ||||
| static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException { | static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException { | ||||
| // Compute intermediate representation as: "<src node id> <dst node id> <label ids>\n" | // Compute intermediate representation as: "<src node id> <dst node id> <label ids>\n" | ||||
| ImmutableGraph graph = BVGraph.loadMapped(graphPath); | ImmutableGraph graph = BVGraph.loadMapped(graphPath); | ||||
| Object2LongFunction<String> swhIdMph = loadMPH(graphPath); | Object2LongFunction<String> swhIdMph = loadMPH(graphPath); | ||||
| long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); | long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); | ||||
| BinIO.loadLongs(graphPath + ".order", orderMap); | BinIO.loadLongs(graphPath + ".order", orderMap); | ||||
| Object2LongFunction<String> labelMPH = loadMPH(graphPath + "-labels"); | Object2LongFunction<String> filenameMph = loadMPH(graphPath + "-filename-labels"); | ||||
| long numLabels = getMPHSize(labelMPH); | long numFilenames = getMPHSize(filenameMph); | ||||
| int labelWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); | |||||
seirl: This is no longer the case. | |||||
| if (labelWidth > 30) { | int totalLabelWidth = DirEntry.labelWidth(numFilenames); | ||||
Done Inline ActionsMake that a static DirEntry.labelWidth(long numLabels) function (including the log2() call). seirl: Make that a static `DirEntry.labelWidth(long numLabels)` function (including the `log2()` call). | |||||
| logger.error("FIXME: Too many labels, we can't handle more than 2^30 for now."); | |||||
| System.exit(2); | |||||
| } | |||||
Done Inline ActionsThis will no longer be needed if you do what I suggested below. seirl: This will no longer be needed if you do what I suggested below. | |||||
| ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
| plInter.itemsName = "edges"; | plInter.itemsName = "edges"; | ||||
| plInter.expectedUpdates = graph.numArcs(); | plInter.expectedUpdates = graph.numArcs(); | ||||
| /* | /* | ||||
| * Pass the intermediate representation to sort(1) so that we see the labels in the order they will | * Pass the intermediate representation to sort(1) so that we see the labels in the order they will | ||||
| * appear in the label file. | * appear in the label file. | ||||
| */ | */ | ||||
| ProcessBuilder processBuilder = new ProcessBuilder(); | ProcessBuilder processBuilder = new ProcessBuilder(); | ||||
| processBuilder.command("sort", "-k1,1n", "-k2,2n", "-k3,3n", // Numerical sort on all fields | processBuilder.command( | ||||
Done Inline ActionsI don't think we want to sort either the 3rd or the 4th field actually, so I would remove them both. seirl: I don't think we want to sort either the 3rd or the 4th field actually, so I would remove them… | |||||
| "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); | "sort", | ||||
| "-k1,1n", "-k2,2n", // Numerical sort | |||||
| "--numeric-sort", | |||||
| "--buffer-size", SORT_BUFFER_SIZE, | |||||
| "--temporary-directory", tmpDir | |||||
| ); | |||||
| Process sort = processBuilder.start(); | Process sort = processBuilder.start(); | ||||
| BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); | ||||
| BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); | ||||
| FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); | FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); | ||||
| LineIterator edgeIterator = new LineIterator(buffer); | LineIterator edgeIterator = new LineIterator(buffer); | ||||
| plInter.start("Piping intermediate representation to sort(1)"); | plInter.start("Piping intermediate representation to sort(1)"); | ||||
| while (edgeIterator.hasNext()) { | while (edgeIterator.hasNext()) { | ||||
| String[] edge = edgeIterator.next().toString().split(" "); | String[] edge = edgeIterator.next().toString().split(" "); | ||||
| if (edge.length < 3) | if (edge.length < 4) | ||||
| continue; | continue; | ||||
| long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); | long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); | ||||
| long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); | long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); | ||||
| long labelId = labelMPH.getLong(edge[2]); | long filenameId = filenameMph.getLong(edge[2]); | ||||
| int permission = Integer.parseInt(edge[3]); | |||||
| sort_stdin.write((srcNode + "\t" + dstNode + "\t" + labelId + "\n").getBytes(StandardCharsets.US_ASCII)); | sort_stdin.write((srcNode + "\t" + dstNode + "\t" + filenameId + "\t" + permission + "\n") | ||||
| .getBytes(StandardCharsets.US_ASCII)); | |||||
| plInter.lightUpdate(); | plInter.lightUpdate(); | ||||
| } | } | ||||
| plInter.done(); | plInter.done(); | ||||
| sort_stdin.close(); | sort_stdin.close(); | ||||
| // Get the sorted output and write the labels and label offsets | // Get the sorted output and write the labels and label offsets | ||||
| ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS); | ||||
| plLabels.itemsName = "nodes"; | plLabels.itemsName = "nodes"; | ||||
| Show All 10 Lines | static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException { | ||||
| new File(graphPath + "-labelled" + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION)); | new File(graphPath + "-labelled" + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION)); | ||||
| offsets.writeGamma(0); | offsets.writeGamma(0); | ||||
| Scanner sortOutput = new Scanner(sort_stdout, StandardCharsets.US_ASCII); | Scanner sortOutput = new Scanner(sort_stdout, StandardCharsets.US_ASCII); | ||||
| NodeIterator it = graph.nodeIterator(); | NodeIterator it = graph.nodeIterator(); | ||||
| long labelSrcNode = -1; | long labelSrcNode = -1; | ||||
| long labelDstNode = -1; | long labelDstNode = -1; | ||||
| long labelId = -1; | long labelFilenameId = -1; | ||||
| int labelPermission = -1; | |||||
| while (it.hasNext()) { | while (it.hasNext()) { | ||||
| long srcNode = it.nextLong(); | long srcNode = it.nextLong(); | ||||
| // Fill a hashmap with the labels of each edge starting from this node | // Fill a hashmap with the labels of each edge starting from this node | ||||
| HashMap<Long, List<Long>> successorsLabels = new HashMap<>(); | HashMap<Long, List<DirEntry>> successorsLabels = new HashMap<>(); | ||||
| while (labelSrcNode <= srcNode) { | while (labelSrcNode <= srcNode) { | ||||
| if (labelSrcNode == srcNode) { | if (labelSrcNode == srcNode) { | ||||
| successorsLabels.computeIfAbsent(labelDstNode, k -> new ArrayList<>()).add(labelId); | successorsLabels | ||||
| .computeIfAbsent( | |||||
| labelDstNode, | |||||
| k -> new ArrayList<>() | |||||
| ).add(new DirEntry(labelFilenameId, labelPermission)); | |||||
| if (debugFile != null) { | if (debugFile != null) { | ||||
| debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelId + "\n"); | debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelFilenameId + " " + labelPermission + "\n"); | ||||
| } | } | ||||
| } | } | ||||
| if (!sortOutput.hasNext()) | if (!sortOutput.hasNext()) | ||||
| break; | break; | ||||
| String line = sortOutput.nextLine(); | String line = sortOutput.nextLine(); | ||||
| String[] parts = line.split("\\t"); | String[] parts = line.split("\\t"); | ||||
| labelSrcNode = Long.parseLong(parts[0]); | labelSrcNode = Long.parseLong(parts[0]); | ||||
| labelDstNode = Long.parseLong(parts[1]); | labelDstNode = Long.parseLong(parts[1]); | ||||
| labelId = Long.parseLong(parts[2]); | labelFilenameId = Long.parseLong(parts[2]); | ||||
| labelPermission = Integer.parseInt(parts[3]); | |||||
| } | } | ||||
| int bits = 0; | int bits = 0; | ||||
| LazyLongIterator s = it.successors(); | LazyLongIterator s = it.successors(); | ||||
| long dstNode; | long dstNode; | ||||
| while ((dstNode = s.nextLong()) >= 0) { | while ((dstNode = s.nextLong()) >= 0) { | ||||
| List<Long> edgeLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); | List<DirEntry> currentLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); | ||||
| bits += labels.writeGamma(edgeLabels.size()); | SwhLabel l = new SwhLabel("edgelabel", totalLabelWidth, currentLabels.toArray(new DirEntry[0])); | ||||
| for (Long label : edgeLabels) { | bits += l.toBitStream(labels, -1); | ||||
Done Inline ActionsCan't we replace all of that by the toBitStream() method in SwhLabel? seirl: Can't we replace all of that by the `toBitStream()` method in SwhLabel? | |||||
| bits += labels.writeLong(label, labelWidth); | |||||
| } | |||||
| } | } | ||||
| offsets.writeGamma(bits); | offsets.writeGamma(bits); | ||||
| plLabels.lightUpdate(); | plLabels.lightUpdate(); | ||||
| } | } | ||||
| labels.close(); | labels.close(); | ||||
| offsets.close(); | offsets.close(); | ||||
| plLabels.done(); | plLabels.done(); | ||||
| if (debugFile != null) { | if (debugFile != null) { | ||||
| debugFile.close(); | debugFile.close(); | ||||
| } | } | ||||
| PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties")); | PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties")); | ||||
| pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); | pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); | ||||
| pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " | pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + SwhLabel.class.getName() + "(DirEntry," + totalLabelWidth + ")" ); | ||||
| + FixedWidthIntListLabel.class.getName() + "(TEST," + labelWidth + ")"); | |||||
| pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); | pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); | ||||
| pw.close(); | pw.close(); | ||||
| } | } | ||||
| } | } | ||||
This is no longer the case.