diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java index c1f5b75..4dc4c7d 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java @@ -1,199 +1,200 @@ package org.softwareheritage.graph.experiments.topology; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; import org.softwareheritage.graph.AllowedNodes; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.Subgraph; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.*; public class ConnectedComponents { private Subgraph graph; private void load_graph(String graphBasename, String nodeTypes) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); var underlyingGraph = Graph.loadMapped(graphBasename); var underlyingGraphSym = underlyingGraph.symmetrize(); graph = new Subgraph(underlyingGraphSym, new AllowedNodes(nodeTypes)); System.err.println("Graph loaded."); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ConnectedComponents.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"), new Switch("byOrigins", JSAP.NO_SHORTFLAG, "by-origins", "Compute size of components by number of origins"), new FlaggedOption("nodeTypes", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n', "nodetypes", "Allowed node types (comma-separated)"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } - private HashMap /* ArrayList> */ compute(ProgressLogger pl, boolean byOrigin) throws IOException { + private HashMap /* ArrayList> */ compute(ProgressLogger pl, boolean byOrigin) + throws IOException { final long n = graph.numNodes(); final long maxN = graph.maxNodeNumber(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * maxN); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 LongArrayBitVector visited = LongArrayBitVector.ofLength(maxN); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting connected components visit..."); // ArrayList> components = new ArrayList<>(); HashMap componentDistribution = new HashMap<>(); var it = graph.nodeIterator(); while (it.hasNext()) { long i = it.nextLong(); if (visited.getBoolean(i)) continue; // ArrayList component = new ArrayList<>(); long componentNodes = 0; queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); // component.add(currentNode); if (!byOrigin || graph.getNodeType(currentNode) == Node.Type.ORI) componentNodes += 1; final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (visited.getBoolean(succ)) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } /* * if (component.size() > 0) { components.add(component); } */ if (componentNodes > 0) componentDistribution.merge(componentNodes, 1L, Long::sum); } pl.done(); // return components; return componentDistribution; } private static void printDistribution(ArrayList> components, Formatter out) { TreeMap distribution = new TreeMap<>(); for (ArrayList component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry entry : distribution.entrySet()) { out.format("%d %d\n", entry.getKey(), entry.getValue()); } out.close(); } private static void printLargestComponent(ArrayList> components, Formatter out) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList component = components.get(indexLargest); for (Long node : component) { out.format("%d\n", node); } out.close(); } private static void printAllComponents(ArrayList> components, Formatter out) { for (int i = 1; i < components.size(); ++i) { ArrayList component = components.get(i); for (Long node : component) { out.format("%d ", node); } out.format("\n"); } out.close(); } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String outdirPath = config.getString("outdir"); String nodeTypes = config.getString("nodeTypes"); boolean byOrigin = config.getBoolean("byOrigins"); ConnectedComponents connectedComponents = new ConnectedComponents(); try { connectedComponents.load_graph(graphPath, nodeTypes); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } ProgressLogger logger = new ProgressLogger(); // noinspection ResultOfMethodCallIgnored new File(outdirPath).mkdirs(); try { // ArrayList> components = connectedComponents.compute(logger); // components.sort(Comparator.comparing(ArrayList::size).reversed()); // printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); // printLargestComponent(components, new Formatter(outdirPath + "/largest_component.txt")); // printAllComponents(components, new Formatter(outdirPath + "/all_components.txt")); HashMap componentDistribution = connectedComponents.compute(logger, byOrigin); PrintWriter f = new PrintWriter(new FileWriter(outdirPath + "/distribution.txt")); TreeMap sortedDistribution = new TreeMap<>(componentDistribution); for (Map.Entry entry : sortedDistribution.entrySet()) { f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } catch (IOException e) { e.printStackTrace(); } logger.done(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java index da9a599..04bde71 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java @@ -1,529 +1,527 @@ package org.softwareheritage.graph.maps; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; -import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.bytes.ByteArrays; -import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.io.FastBufferedInputStream; -import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.labels.DirEntry; import org.softwareheritage.graph.labels.SwhLabel; import java.io.*; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.TimeUnit; public class LabelMapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class); String graphPath; String outputGraphPath; String debugPath; String tmpDir; ImmutableGraph graph; long numNodes; long numArcs; NodeIdMap nodeIdMap; Object2LongFunction filenameMph; long numFilenames; int totalLabelWidth; - public LabelMapBuilder(String graphPath, String debugPath, String outputGraphPath, String tmpDir) throws IOException { + public LabelMapBuilder(String graphPath, String debugPath, String outputGraphPath, String tmpDir) + throws IOException { this.graphPath = graphPath; if (outputGraphPath == null) { this.outputGraphPath = graphPath; } else { this.outputGraphPath = outputGraphPath; } this.debugPath = debugPath; this.tmpDir = tmpDir; // Load the graph in offline mode to retrieve the number of nodes/edges, // then immediately destroy it. XXX: not even needed? // ImmutableGraph graphOffline = BVGraph.loadMapped(graphPath); graph = BVGraph.loadMapped(graphPath); numArcs = graph.numArcs(); numNodes = graph.numNodes(); nodeIdMap = new NodeIdMap(graphPath, numNodes); filenameMph = NodeIdMap.loadMph(graphPath + "-labels.mph"); numFilenames = getMPHSize(filenameMph); totalLabelWidth = DirEntry.labelWidth(numFilenames); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("debugPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'd', "debug-path", "Store the intermediate representation here for debug"), new FlaggedOption("outputGraphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "output-graph", "Basename of the output graph, same as --graph if not specified"), new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 't', "tmp", "Temporary directory path"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String outputGraphPath = config.getString("outputGraphPath"); String tmpDir = config.getString("tmpDir"); String debugPath = config.getString("debugPath"); LabelMapBuilder builder = new LabelMapBuilder(graphPath, debugPath, outputGraphPath, tmpDir); logger.info("Loading graph and MPH functions..."); builder.computeLabelMap(); } static long getMPHSize(Object2LongFunction mph) { return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size(); } void computeLabelMap() throws IOException, InterruptedException { this.loadGraph(); // this.computeLabelMapSort(); this.computeLabelMapBsort(); } void computeLabelMapSort() throws IOException { // Pass the intermediate representation to sort(1) so that we see the labels in the order they will // appear in the label file. ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "-k1,1n", "-k2,2n", // Numerical sort "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); // BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); FastBufferedInputStream sort_stdout = new FastBufferedInputStream(sort.getInputStream()); final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); hashLabelStream(fbis, new EdgeLabelLineWriter() { @Override public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { sort_stdin.write((src + "\t" + dst + "\t" + filenameId + "\t" + permission + "\n") .getBytes(StandardCharsets.US_ASCII)); } }); sort_stdin.close(); EdgeLabelLineIterator mapLines = new TextualEdgeLabelLineIterator(sort_stdout); writeLabels(mapLines); logger.info("Done"); } void computeLabelMapBsort() throws IOException, InterruptedException { // Pass the intermediate representation to bsort(1) so that we see the labels in the order they will // appear in the label file. String tmpFile = tmpDir + "/labelsToSort.bin"; final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); final DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmpFile))); // Number of bytes to represent a node. final int nodeBytes = (Long.SIZE - Long.numberOfLeadingZeros(graph.numNodes())) / 8 + 1; ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); logger.info("Writing labels to a packed binary files (node bytes: {})", nodeBytes); hashLabelStream(fbis, new EdgeLabelLineWriter() { @Override public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { buffer.putLong(0, src); out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); buffer.putLong(0, dst); out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); out.writeLong(filenameId); out.writeInt(permission); } }); ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("/home/seirl/bsort/src/bsort", "-v", "-r", String.valueOf(nodeBytes * 2 + Long.BYTES + Integer.BYTES), "-k", String.valueOf(nodeBytes * 2), tmpFile); Process sort = processBuilder.start(); sort.waitFor(); final DataInputStream sortedLabels = new DataInputStream(new BufferedInputStream(new FileInputStream(tmpFile))); BinaryEdgeLabelLineIterator mapLines = new BinaryEdgeLabelLineIterator(sortedLabels, nodeBytes); writeLabels(mapLines); logger.info("Done"); } void loadGraph() throws IOException { } void hashLabelStream(FastBufferedInputStream input, EdgeLabelLineWriter writer) throws IOException { // Compute intermediate representation and write it on : // "