diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java new file mode 100644 index 0000000..e2e68ff --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java @@ -0,0 +1,66 @@ +package org.softwareheritage.graph.experiments.forks; + +import com.martiansoftware.jsap.*; +import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.Traversal; + +import java.io.IOException; +import java.util.Scanner; + +public class FindCommonAncestor { + private Graph graph; + + private void load_graph(String graphBasename) throws IOException { + System.err.println("Loading graph " + graphBasename + " ..."); + this.graph = new Graph(graphBasename); + System.err.println("Graph loaded."); + } + + private static JSAPResult parse_args(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP( + FindCommonAncestor.class.getName(), + "", + new Parameter[] { + new FlaggedOption("edgesFmt", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'e', "edges", "Edges constraints"), + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + } + ); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + e.printStackTrace(); + } + return config; + } + + public static void main(String[] args) { + JSAPResult config = parse_args(args); + + String graphPath = config.getString("graphPath"); + String edgesFmt = config.getString("edgesFmt"); + + FindCommonAncestor fca = new FindCommonAncestor(); + try { + fca.load_graph(graphPath); + } catch (IOException e) { + System.out.println("Could not load graph: " + e); + System.exit(2); + } + + Scanner input = new Scanner(System.in); + while (input.hasNextLong()) { + long lhsNode = input.nextLong(); + long rhsNode = input.nextLong(); + + Traversal t = new Traversal(fca.graph.symmetrize(), "forward", edgesFmt); + System.out.println(t.findCommonDescendant(lhsNode, rhsNode)); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java new file mode 100644 index 0000000..6839a8f --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java @@ -0,0 +1,130 @@ +package org.softwareheritage.graph.experiments.forks; + +import com.martiansoftware.jsap.*; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.Node; + +import java.io.IOException; +import java.util.*; + +public class FindPath { + private Graph graph; + private Long emptySnapshot; + + private void load_graph(String graphBasename) throws IOException { + System.err.println("Loading graph " + graphBasename + " ..."); + this.graph = new Graph(graphBasename).symmetrize(); + System.err.println("Graph loaded."); + this.emptySnapshot = null; + } + + private static JSAPResult parse_args(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP( + FindPath.class.getName(), + "", + new Parameter[] { + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + } + ); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + e.printStackTrace(); + } + return config; + } + + private boolean nodeIsEmptySnapshot(Long node) { + if (this.emptySnapshot == null + && this.graph.getNodeType(node) == Node.Type.SNP + && this.graph.outdegree(node) == 0) { + System.err.println("Found empty snapshot: " + node); + this.emptySnapshot = node; + } + return node.equals(this.emptySnapshot); + } + + private Boolean shouldVisit(Long node){ + Node.Type nt = this.graph.getNodeType(node); + if (nt != Node.Type.REV && nt != Node.Type.REL + && nt != Node.Type.SNP && nt != Node.Type.ORI) { + return false; + } + if (this.nodeIsEmptySnapshot(node)) + return false; + return true; + } + + private ArrayList findPath(Long src, Long dst) { + HashSet visited = new HashSet<>(); + Queue queue = new ArrayDeque<>(); + Map parentNode = new HashMap<>(); + + queue.add(src); + visited.add(src); + + while (!queue.isEmpty()) { + long currentNode = queue.poll(); + + final LazyLongIterator iterator = graph.successors(currentNode); + long succ; + while ((succ = iterator.nextLong()) != -1) { + if (!shouldVisit(succ) || visited.contains(succ)) continue; + visited.add(succ); + queue.add(succ); + parentNode.put(succ, currentNode); + + if (succ == dst) { + ArrayList path = new ArrayList<>(); + long n = dst; + while (n != src) { + path.add(n); + n = parentNode.get(n); + } + path.add(src); + Collections.reverse(path); + return path; + } + } + } + return null; + } + + public static void main(String[] args) { + JSAPResult config = parse_args(args); + + String graphPath = config.getString("graphPath"); + + FindPath fpath = new FindPath(); + try { + fpath.load_graph(graphPath); + } catch (IOException e) { + System.out.println("Could not load graph: " + e); + System.exit(2); + } + + Scanner input = new Scanner(System.in); + while (input.hasNextLong()) { + long lhsNode = input.nextLong(); + long rhsNode = input.nextLong(); + + ArrayList path = fpath.findPath(lhsNode, rhsNode); + if (path != null) { + for (Long n : path) { + System.out.format("%d ", n); + } + System.out.println(); + } + else { + System.out.println("null"); + } + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java index fc9667c..0b239d1 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java @@ -1,219 +1,253 @@ package org.softwareheritage.graph.experiments.forks; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.benchmark.BFS; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; -import java.util.ArrayList; -import java.util.Map; -import java.util.Scanner; -import java.util.TreeMap; +import java.util.*; public class ForkCC { public Boolean includeRootDir; private Graph graph; private Long emptySnapshot; private LongArrayBitVector visited; private LongArrayBitVector whitelist; private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( - ForkCC.class.getName(), - "", - new Parameter[]{ - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 'g', "graph", "Basename of the compressed graph"), - new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, - 't', "whitelist", "Whitelist of origins"), - new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, - 'w', "numthreads", "Number of threads"), - new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, - 'R', "includerootdir", "Include root directory (default: false)"), - } + ForkCC.class.getName(), + "", + new Parameter[] { + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, + 't', "whitelist", "Whitelist of origins"), + new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, + 'R', "includerootdir", "Include root directory (default: false)"), + new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'o', "outdir", "Directory where to put the results"), + } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private static void printDistribution(ArrayList> components) { TreeMap distribution = new TreeMap<>(); for (ArrayList component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry entry : distribution.entrySet()) { System.out.format("%d %d\n", entry.getKey(), entry.getValue()); } } private static void printLargestComponent(ArrayList> components) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList component = components.get(indexLargest); for (Long node : component) { System.out.println(node); } } - public static void main(String[] args) { - JSAPResult config = parse_args(args); - - String graphPath = config.getString("graphPath"); - int numThreads = config.getInt("numThreads"); - String whitelistPath = config.getString("whitelistPath"); - boolean includeRootDir = config.getBoolean("includeRootDir"); - - ForkCC forkCc = new ForkCC(); - try { - forkCc.load_graph(graphPath); - forkCc.includeRootDir = includeRootDir; - } catch (IOException e) { - System.out.println("Could not load graph: " + e); - System.exit(2); - } - - if (whitelistPath != null) { - forkCc.parseWhitelist(whitelistPath); - } - - ProgressLogger logger = new ProgressLogger(); - try { - ArrayList> components = forkCc.compute(logger); - printDistribution(components); - // printLargestComponent(components); - } catch (IOException e) { - e.printStackTrace(); - } - logger.done(); - } - private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); this.graph = new Graph(graphBasename).symmetrize(); System.err.println("Graph loaded."); this.emptySnapshot = null; this.whitelist = null; this.visited = null; this.includeRootDir = null; } private boolean nodeIsEmptySnapshot(Long node) { if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } private Boolean shouldVisit(Long node) { Node.Type nt = this.graph.getNodeType(node); if (nt == Node.Type.CNT) { return false; } if (nt == Node.Type.DIR && !includeRootDir) return false; if (this.nodeIsEmptySnapshot(node)) return false; if (visited.getBoolean(node)) return false; return true; } private ArrayList> compute(ProgressLogger pl) throws IOException { final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier - final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue"); + final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 visited = LongArrayBitVector.ofLength(n); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting connected components visit..."); ArrayList> components = new ArrayList<>(); for (long i = 0; i < n; i++) { if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR) continue; ArrayList component = new ArrayList<>(); queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); Node.Type cur_nt = this.graph.getNodeType(currentNode); if (cur_nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(currentNode))) { // TODO: add a check that the origin has >=1 non-empty snapshot component.add(currentNode); } final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (!shouldVisit(succ)) continue; if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } if (component.size() > 0) { components.add(component); } } pl.done(); queue.close(); return components; } + private static void printDistribution(ArrayList> components, Formatter out) { + TreeMap distribution = new TreeMap<>(); + for (ArrayList component : components) { + distribution.merge((long) component.size(), 1L, Long::sum); + } + + for (Map.Entry entry : distribution.entrySet()) { + out.format("%d %d\n", entry.getKey(), entry.getValue()); + } + } + + private static void printLargestComponent(ArrayList> components, Formatter out) { + int indexLargest = 0; + for (int i = 1; i < components.size(); ++i) { + if (components.get(i).size() > components.get(indexLargest).size()) + indexLargest = i; + } + + ArrayList component = components.get(indexLargest); + for (Long node : component) { + out.format("%d\n", node); + } + } + + private static void printAllComponents(ArrayList> components, Formatter out) { + for (int i = 1; i < components.size(); ++i) { + ArrayList component = components.get(i); + for (Long node : component) { + out.format("%d ", node); + } + out.format("\n"); + } + } + private void parseWhitelist(String path) { System.err.println("Loading whitelist " + path + " ..."); this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes()); - Scanner scanner = null; + Scanner scanner; try { scanner = new Scanner(new File(path)); while (scanner.hasNextLong()) { whitelist.set(scanner.nextLong()); } System.err.println("Whitelist loaded."); } catch (FileNotFoundException e) { e.printStackTrace(); } } + + public static void main(String[] args) { + JSAPResult config = parse_args(args); + + String graphPath = config.getString("graphPath"); + String whitelistPath = config.getString("whitelistPath"); + boolean includeRootDir = config.getBoolean("includeRootDir"); + String outdirPath = config.getString("outdir"); + + ForkCC forkCc = new ForkCC(); + try { + forkCc.load_graph(graphPath); + forkCc.includeRootDir = includeRootDir; + } catch (IOException e) { + System.out.println("Could not load graph: " + e); + System.exit(2); + } + + if (whitelistPath != null) { + forkCc.parseWhitelist(whitelistPath); + } + + ProgressLogger logger = new ProgressLogger(); + //noinspection ResultOfMethodCallIgnored + new File(outdirPath).mkdirs(); + try { + ArrayList> components = forkCc.compute(logger); + printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); + printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt")); + printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt")); + } catch (IOException e) { + e.printStackTrace(); + } + logger.done(); + } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java new file mode 100644 index 0000000..1ebc7fb --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java @@ -0,0 +1,228 @@ +package org.softwareheritage.graph.experiments.forks; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import com.google.common.primitives.Longs; +import com.martiansoftware.jsap.*; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.logging.ProgressLogger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.Node; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.*; + +public class ForkCliques { + private Graph graph; + private LongArrayBitVector whitelist; + + private void load_graph(String graphBasename) throws IOException { + System.err.println("Loading graph " + graphBasename + " ..."); + this.graph = new Graph(graphBasename); + System.err.println("Graph loaded."); + this.whitelist = null; + } + + private static JSAPResult parse_args(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP( + ForkCliques.class.getName(), + "", + new Parameter[]{ + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, + 't', "whitelist", "Whitelist of origins"), + new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'o', "outdir", "Directory where to put the results"), + } + ); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + e.printStackTrace(); + } + return config; + } + + private ArrayList dfsAt(Long baseNode) { + ArrayList res = new ArrayList<>(); + + final Deque stack = new ArrayDeque<>(); + HashSet seen = new HashSet<>(); + stack.push(baseNode); + + while (!stack.isEmpty()) { + final Long currentNode = stack.pop(); + + final LazyLongIterator iterator = this.graph.predecessors(currentNode); + long succ; + while ((succ = iterator.nextLong()) != -1) { + if (!seen.contains(succ)) { + Node.Type nt = this.graph.getNodeType(succ); + if (nt == Node.Type.DIR || nt == Node.Type.CNT) + continue; + if (nt == Node.Type.ORI + && (this.whitelist == null || this.whitelist.getBoolean(succ))) { + res.add(succ); + } else { + stack.push(succ); + seen.add(succ); + } + } + } + } + + Collections.sort(res); + return res; + } + + private boolean isBaseRevision(Long node) { + if (this.graph.getNodeType(node) != Node.Type.REV) + return false; + + final LazyLongIterator iterator = this.graph.successors(node); + long succ; + while ((succ = iterator.nextLong()) != -1) { + if (this.graph.getNodeType(succ) == Node.Type.REV) + return false; + } + return true; + } + + static private String fingerprint(ArrayList cluster) { + MessageDigest digest; + try { + digest = MessageDigest.getInstance("SHA-256"); + } catch (NoSuchAlgorithmException e) { + e.printStackTrace(); + return null; + } + for (Long n : cluster) + digest.update(Longs.toByteArray(n)); + + return new String(digest.digest()); + } + + private ArrayList> compute(ProgressLogger pl) { + final long n = this.graph.numNodes(); + + HashSet fingerprints = new HashSet<>(); + ArrayList> clusters = new ArrayList<>(); + + pl.expectedUpdates = n; + pl.itemsName = "nodes"; + pl.start("Starting topological sort..."); + + for (long i = 0; i < n; i++) { + if (isBaseRevision(i)) { + ArrayList currentCluster = dfsAt(i); + String clusterFp = fingerprint(currentCluster); + if (!fingerprints.contains(clusterFp)) { + fingerprints.add(clusterFp); + clusters.add(currentCluster); + } + } + pl.update(); + } + pl.done(); + + return clusters; + } + + private static void printDistribution(ArrayList> components, Formatter out) { + TreeMap distribution = new TreeMap<>(); + for (ArrayList component : components) { + distribution.merge((long) component.size(), 1L, Long::sum); + } + + for (Map.Entry entry : distribution.entrySet()) { + out.format("%d %d\n", entry.getKey(), entry.getValue()); + } + } + + private static void printLargestComponent(ArrayList> components, Formatter out) { + int indexLargest = 0; + for (int i = 1; i < components.size(); ++i) { + if (components.get(i).size() > components.get(indexLargest).size()) + indexLargest = i; + } + + ArrayList component = components.get(indexLargest); + for (Long node : component) { + out.format("%d\n", node); + } + } + + private static void printAllComponents(ArrayList> components, Formatter out) { + for (int i = 1; i < components.size(); ++i) { + ArrayList component = components.get(i); + for (Long node : component) { + out.format("%d ", node); + } + out.format("\n"); + } + } + + private void parseWhitelist(String path) { + System.err.println("Loading whitelist " + path + " ..."); + this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes()); + Scanner scanner; + try { + scanner = new Scanner(new File(path)); + while(scanner.hasNextLong()) { + whitelist.set(scanner.nextLong()); + } + System.err.println("Whitelist loaded."); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + public static void main(String[] args) { + JSAPResult config = parse_args(args); + + String graphPath = config.getString("graphPath"); + String whitelistPath = config.getString("whitelistPath"); + String outdirPath = config.getString("outdir"); + + ForkCliques forkCliques = new ForkCliques(); + try { + forkCliques.load_graph(graphPath); + } catch (IOException e) { + System.out.println("Could not load graph: " + e); + System.exit(2); + } + + if (whitelistPath != null) { + forkCliques.parseWhitelist(whitelistPath); + } + + Logger rootLogger = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); + rootLogger.setLevel(Level.DEBUG); + + ProgressLogger logger = new ProgressLogger(rootLogger); + ArrayList> components = forkCliques.compute(logger); + + //noinspection ResultOfMethodCallIgnored + new File(outdirPath).mkdirs(); + try { + printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); + printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt")); + printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt")); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + logger.done(); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java similarity index 98% rename from java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java rename to java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java index 8ee8ede..81a0c50 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/ListEmptyOrigins.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java @@ -1,93 +1,93 @@ -package org.softwareheritage.graph.benchmark; +package org.softwareheritage.graph.experiments.forks; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import java.io.IOException; import java.util.ArrayList; public class ListEmptyOrigins { private Graph graph; private Long emptySnapshot; private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( ListEmptyOrigins.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); ListEmptyOrigins leo = new ListEmptyOrigins(); try { leo.load_graph(graphPath); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } ArrayList badlist = leo.compute(leo.graph); for (Long bad : badlist) { System.out.println(bad); } } private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); this.graph = new Graph(graphBasename); System.err.println("Graph loaded."); this.emptySnapshot = null; } private boolean nodeIsEmptySnapshot(Long node) { System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node); if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } private ArrayList compute(ImmutableGraph graph) { final long n = graph.numNodes(); ArrayList bad = new ArrayList<>(); for (long i = 0; i < n; i++) { Node.Type nt = this.graph.getNodeType(i); if (nt != Node.Type.ORI) continue; final LazyLongIterator iterator = graph.successors(i); long succ; boolean found = false; while ((succ = iterator.nextLong()) != -1) { if (this.graph.outdegree(succ) > 0) { found = true; } } if (!found) bad.add(i); } return bad; } } diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/GenDistribution.java b/java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java similarity index 98% rename from java/src/main/java/org/softwareheritage/graph/benchmark/GenDistribution.java rename to java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java index d9c3846..9fd8e6b 100644 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/GenDistribution.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java @@ -1,136 +1,136 @@ -package org.softwareheritage.graph.benchmark; +package org.softwareheritage.graph.experiments.multiplicationfactor; import com.martiansoftware.jsap.*; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.Traversal; import org.softwareheritage.graph.benchmark.utils.Timing; import java.io.IOException; import java.util.Scanner; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; public class GenDistribution { private Graph graph; private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP( GenDistribution.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("srcType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 's', "srctype", "Source node type"), new FlaggedOption("dstType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'd', "dsttype", "Destination node type"), new FlaggedOption("edgesFmt", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'e', "edges", "Edges constraints"), new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, 't', "numthreads", "Number of threads"), } ); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); Node.Type srcType = Node.Type.fromStr(config.getString("srcType")); Node.Type dstType = Node.Type.fromStr(config.getString("dstType")); String edgesFmt = config.getString("edgesFmt"); int numThreads = config.getInt("numThreads"); GenDistribution tp = new GenDistribution(); try { tp.load_graph(graphPath); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } final long END_OF_QUEUE = -1L; ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads); ExecutorService service = Executors.newFixedThreadPool(numThreads + 1); service.submit(() -> { try { Scanner input = new Scanner(System.in); while (input.hasNextLong()) { long node = input.nextLong(); if (tp.graph.getNodeType(node) == srcType) { queue.put(node); } } } catch (InterruptedException e) { e.printStackTrace(); } finally { for (int i = 0; i < numThreads; ++i) { try { queue.put(END_OF_QUEUE); } catch (InterruptedException e) { e.printStackTrace(); } } } }); for (int i = 0; i < numThreads; ++i) { service.submit(() -> { Graph thread_graph = tp.graph.copy(); long startTime; double totalTime; while (true) { Long node = null; try { node = queue.take(); } catch (InterruptedException e) { e.printStackTrace(); } if (node == null || node == END_OF_QUEUE) { return; } Traversal t = new Traversal(thread_graph, "backward", edgesFmt); int[] count = {0}; startTime = Timing.start(); t.visitNodesVisitor(node, (curnode) -> { if (tp.graph.getNodeType(curnode) == dstType) { count[0]++; } }); totalTime = Timing.stop(startTime); System.out.format("%d %d %d %d %f\n", node, count[0], t.getNbNodesAccessed(), t.getNbEdgesAccessed(), totalTime ); } }); } service.shutdown(); } private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); this.graph = new Graph(graphBasename); System.err.println("Graph loaded."); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java new file mode 100644 index 0000000..c3a13e4 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java @@ -0,0 +1,205 @@ +package org.softwareheritage.graph.experiments.topology; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import com.martiansoftware.jsap.*; +import it.unimi.dsi.big.webgraph.ImmutableGraph; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import it.unimi.dsi.big.webgraph.Transform; +import it.unimi.dsi.logging.ProgressLogger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.Graph; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; +import java.util.concurrent.ThreadLocalRandom; + +public class ClusteringCoefficient { + private Graph graph; + + private void load_graph(String graphBasename) throws IOException { + System.err.println("Loading graph " + graphBasename + " ..."); + this.graph = new Graph(graphBasename); + System.err.println("Graph loaded."); + } + + private static JSAPResult parse_args(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP( + ClusteringCoefficient.class.getName(), + "", + new Parameter[]{ + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'o', "outdir", "Directory where to put the results"), + } + ); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + e.printStackTrace(); + } + return config; + } + + private long oppositeEdges(ImmutableGraph graph, long node) { + System.out.format("%d\n", node); + HashSet neighborhood = new HashSet<>(); + long succ; + final LazyLongIterator iterator = graph.successors(node); + while ((succ = iterator.nextLong()) != -1) { + System.out.format("%d neighbor add %d\n", node, succ); + neighborhood.add(succ); + } + + long closed_triplets = 0; + for (Long neighbor : neighborhood) { + System.out.format("%d neighbor visit %d\n", node, neighbor); + final LazyLongIterator it = graph.successors(neighbor); + while ((succ = it.nextLong()) != -1) { + System.out.format("%d neighbor visit %d succ %d\n", node, neighbor, succ); + if (neighborhood.contains(succ)) { + closed_triplets++; + } + } + } + return closed_triplets; + } + + public void compute(ImmutableGraph graph, ProgressLogger pl, Formatter out_local, Formatter out_global) { + final long n = this.graph.getNbNodes(); + + pl.expectedUpdates = n; + pl.itemsName = "nodes"; + + long nodes_d2 = 0; + double cum_lcc = 0; + double cum_lcc_c0 = 0; + double cum_lcc_c1 = 0; + HashMap distribution = new HashMap<>(); + + for (long node = 0; node < n; node++) { + long d = graph.outdegree(node); + if (d >= 2) { + double t = (d * (d - 1)); + double m = oppositeEdges(graph, node); + double lcc = m / t; + + distribution.merge(lcc, 1L, Long::sum); + + cum_lcc += lcc; + nodes_d2++; + } else { + cum_lcc_c1++; + } + pl.update(); + } + pl.done(); + + for (Map.Entry entry : distribution.entrySet()) { + out_local.format("%f %d\n", entry.getKey(), entry.getValue()); + } + + double gC = cum_lcc / nodes_d2; + double gC0 = cum_lcc_c0 / n; + double gC1 = cum_lcc_c1 / n; + + out_global.format("C: %f\n", gC); + out_global.format("C0: %f\n", gC0); + out_global.format("C1: %f\n", gC1); + } + + public void compute_approx(ImmutableGraph graph, Formatter out_global) { + final long n = this.graph.getNbNodes(); + + long trials = 0; + long triangles = 0; + + while (true) { + long node = ThreadLocalRandom.current().nextLong(0, n); + long d = graph.outdegree(node); + if (d >= 2) { + Long u = null; + Long v = null; + + long u_idx = ThreadLocalRandom.current().nextLong(0, d); + long v_idx = ThreadLocalRandom.current().nextLong(0, d - 1); + if (v_idx >= u_idx) { + v_idx++; + } + + long succ; + final LazyLongIterator node_iterator = graph.successors(node); + for (long succ_idx = 0; (succ = node_iterator.nextLong()) != -1; succ_idx++) { + if (succ_idx == u_idx) { + u = succ; + } + if (succ_idx == v_idx) { + v = succ; + } + } + + final LazyLongIterator u_iterator = graph.successors(u); + while ((succ = u_iterator.nextLong()) != -1) { + if (succ == v) + triangles++; + } + } + trials++; + + if (trials % 100 == 0 || true) { + double gC = (double)triangles / (double)trials; + out_global.format("C: %f (triangles: %d, trials: %d)\n", gC, triangles, trials); + System.out.format("C: %f (triangles: %d, trials: %d)\n", gC, triangles, trials); + } + } + } + + public static void main(String[] args) { + JSAPResult config = parse_args(args); + + String graphPath = config.getString("graphPath"); + String outdirPath = config.getString("outdir"); + + ClusteringCoefficient ccoef = new ClusteringCoefficient(); + try { + ccoef.load_graph(graphPath); + } catch (IOException e) { + System.out.println("Could not load graph: " + e); + System.exit(2); + } + + Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); + rootLogger.setLevel(Level.DEBUG); + + new File(outdirPath).mkdirs(); + + ImmutableGraph symmetric = Transform.union( + ccoef.graph.getBVGraph(false), + ccoef.graph.getBVGraph(true) + ); + try { + ccoef.compute_approx( + symmetric, + new Formatter(outdirPath + "/local.txt") + ); + /* + ccoef.compute( + symmetric, + new ProgressLogger(rootLogger), + new Formatter(outdirPath + "/local.txt"), + new Formatter(outdirPath + "/global.txt") + ); + */ + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java new file mode 100644 index 0000000..0170481 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java @@ -0,0 +1,165 @@ +package org.softwareheritage.graph.experiments.topology; + +import com.google.common.primitives.Longs; +import com.martiansoftware.jsap.*; +import it.unimi.dsi.big.webgraph.ImmutableGraph; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import it.unimi.dsi.big.webgraph.Transform; +import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.fastutil.Arrays; +import it.unimi.dsi.io.ByteDiskQueue; +import it.unimi.dsi.logging.ProgressLogger; +import org.softwareheritage.graph.Graph; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +public class ConnectedComponents { + private Graph graph; + + private void load_graph(String graphBasename) throws IOException { + System.err.println("Loading graph " + graphBasename + " ..."); + this.graph = new Graph(graphBasename).symmetrize(); + System.err.println("Graph loaded."); + } + + private static JSAPResult parse_args(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP( + ConnectedComponents.class.getName(), + "", + new Parameter[] { + new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'g', "graph", "Basename of the compressed graph"), + new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + 'o', "outdir", "Directory where to put the results"), + } + ); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + e.printStackTrace(); + } + return config; + } + + private ArrayList> compute(ProgressLogger pl) throws IOException { + final long n = graph.numNodes(); + + // Allow enough memory to behave like in-memory queue + int bufferSize = (int)Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); + + // Use a disk based queue to store BFS frontier + final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue"); + final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); + final byte[] byteBuf = new byte[Long.BYTES]; + // WARNING: no 64-bit version of this data-structure, but it can support + // indices up to 2^37 + LongArrayBitVector visited = LongArrayBitVector.ofLength(n); + + pl.expectedUpdates = n; + pl.itemsName = "nodes"; + pl.start("Starting connected components visit..."); + + ArrayList> components = new ArrayList<>(); + + for (long i = 0; i < n; i++) { + ArrayList component = new ArrayList<>(); + + queue.enqueue(Longs.toByteArray(i)); + visited.set(i); + + while (!queue.isEmpty()) { + queue.dequeue(byteBuf); + final long currentNode = Longs.fromByteArray(byteBuf); + component.add(currentNode); + + final LazyLongIterator iterator = graph.successors(currentNode); + long succ; + while ((succ = iterator.nextLong()) != -1) { + if (visited.getBoolean(succ)) + continue; + visited.set(succ); + queue.enqueue(Longs.toByteArray(succ)); + } + + pl.update(); + } + + if (component.size() > 0) { + components.add(component); + } + } + pl.done(); + return components; + } + + private static void printDistribution(ArrayList> components, Formatter out) { + TreeMap distribution = new TreeMap<>(); + for (ArrayList component : components) { + distribution.merge((long) component.size(), 1L, Long::sum); + } + + for (Map.Entry entry : distribution.entrySet()) { + out.format("%d %d\n", entry.getKey(), entry.getValue()); + } + } + + private static void printLargestComponent(ArrayList> components, Formatter out) { + int indexLargest = 0; + for (int i = 1; i < components.size(); ++i) { + if (components.get(i).size() > components.get(indexLargest).size()) + indexLargest = i; + } + + ArrayList component = components.get(indexLargest); + for (Long node : component) { + out.format("%d\n", node); + } + } + + private static void printAllComponents(ArrayList> components, Formatter out) { + for (int i = 1; i < components.size(); ++i) { + ArrayList component = components.get(i); + for (Long node : component) { + out.format("%d ", node); + } + out.format("\n"); + } + } + + public static void main(String[] args) { + JSAPResult config = parse_args(args); + + String graphPath = config.getString("graphPath"); + String outdirPath = config.getString("outdir"); + + ConnectedComponents connectedComponents = new ConnectedComponents(); + try { + connectedComponents.load_graph(graphPath); + } catch (IOException e) { + System.out.println("Could not load graph: " + e); + System.exit(2); + } + + ProgressLogger logger = new ProgressLogger(); + //noinspection ResultOfMethodCallIgnored + new File(outdirPath).mkdirs(); + try { + ArrayList> components = connectedComponents.compute(logger); + components.sort(Comparator.comparing(ArrayList::size).reversed()); + + printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); + printLargestComponent(components, new Formatter(outdirPath + "/largest_component.txt")); + printAllComponents(components, new Formatter(outdirPath + "/all_components.txt")); + } catch (IOException e) { + e.printStackTrace(); + } + logger.done(); + } +}