diff --git a/java/pom.xml b/java/pom.xml index cd1eece..0b2172e 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -1,269 +1,274 @@ 4.0.0 org.softwareheritage.graph swh-graph ${git.closest.tag.name} swh-graph https://forge.softwareheritage.org/source/swh-graph/ UTF-8 11 ch.qos.logback logback-classic 1.2.3 org.junit.jupiter junit-jupiter-api 5.7.0 test org.junit.jupiter junit-jupiter-engine 5.7.0 test org.hamcrest hamcrest 2.1 test io.javalin javalin 3.0.0 org.slf4j slf4j-simple 1.7.26 com.fasterxml.jackson.core jackson-databind 2.9.8 it.unimi.dsi webgraph-big - 3.6.5 + 3.6.6 it.unimi.dsi fastutil 8.4.4 it.unimi.dsi dsiutils - 2.6.16 + 2.6.17 + + + it.unimi.dsi + sux4j + 5.2.3 it.unimi.dsi law 2.7.1 org.apache.hadoop hadoop-common org.umlgraph umlgraph org.eclipse.jetty.aggregate jetty-all it.unimi.di mg4j it.unimi.di mg4j-big com.martiansoftware jsap 2.1 net.sf.py4j py4j 0.10.8.1 commons-codec commons-codec 1.11 maven-clean-plugin 3.1.0 maven-resources-plugin 3.0.2 maven-compiler-plugin 3.8.0 11 11 -verbose -Xlint:all maven-surefire-plugin 2.22.2 maven-failsafe-plugin 2.22.2 maven-jar-plugin 3.0.2 maven-install-plugin 2.5.2 maven-deploy-plugin 2.8.2 maven-site-plugin 3.7.1 maven-project-info-reports-plugin 3.0.0 maven-assembly-plugin 3.3.0 org.softwareheritage.graph.server.App jar-with-dependencies false make-assembly package single com.diffplug.spotless spotless-maven-plugin 2.4.1 *.md .gitignore true 4 4.16.0 .coding-style.xml pl.project13.maven git-commit-id-plugin 3.0.1 get-the-git-infos revision initialize true true true true v* git.closest.tag.name ^v true org.apache.maven.plugins maven-javadoc-plugin 3.1.1 diff --git a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java new file mode 100644 index 0000000..4f2dd00 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java @@ -0,0 +1,95 @@ +// TODO: should be in webgraph upstream +// package it.unimi.dsi.big.webgraph.labelling; +package org.softwareheritage.graph.labels; + +/* + * Copyright (C) 2020 TODO + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.Label; + +import java.util.Arrays; + +/** An abstract (single-attribute) list-of-longs label. +* +*

This class provides basic methods for a label holding a list of longs. +* Concrete implementations may impose further requirements on the long. +* +*

Implementing subclasses must provide constructors, {@link Label#copy()}, +* {@link Label#fromBitStream(it.unimi.dsi.io.InputBitStream, int)}, {@link Label#toBitStream(it.unimi.dsi.io.OutputBitStream, int)} +* and possibly override {@link #toString()}. +*/ + +public abstract class AbstractLongListLabel extends AbstractLabel implements Label { + /** The key of the attribute represented by this label. */ + protected final String key; + /** The values of the attribute represented by this label. */ + public long[] value; + + /** Creates an long label with given key and value. + * + * @param key the (only) key of this label. + * @param value the value of this label. + */ + public AbstractLongListLabel(String key, long[] value) { + this.key = key; + this.value = value; + } + + @Override + public String wellKnownAttributeKey() { + return key; + } + + @Override + public String[] attributeKeys() { + return new String[] { key }; + } + + @Override + public Class[] attributeTypes() { + return new Class[] { long[].class }; + } + + @Override + public Object get(String key) { + if (this.key.equals(key)) return value; + throw new IllegalArgumentException(); + } + + @Override + public Object get() { + return value; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value); + } + + @Override + public boolean equals(Object x) { + if (x instanceof AbstractLongListLabel) return Arrays.equals(value, ((AbstractLongListLabel)x).value); + else return false; + } + + @Override + public int hashCode() { + return Arrays.hashCode(value); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java new file mode 100644 index 0000000..878db82 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java @@ -0,0 +1,135 @@ +package org.softwareheritage.graph.labels; + +/** + * Directory entries metadata are stored as edge labels on the graph. {@link DirEntry} can be + * encoded in a single long type, to re-use Webgraph interface. + * + * @author The Software Heritage developers + */ +public class DirEntry { + public long filenameId; + public int permission; + + public DirEntry(long filenameId, int permission) { + this.filenameId = filenameId; + this.permission = permission; + } + + public DirEntry(long dirEntryEncoded) { + this.filenameId = dirEntryEncoded >> Permission.NB_BITS_PER_TYPE; + int dirBytes = (int) (dirEntryEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1)); + this.permission = Permission.Type.fromEncoded(dirBytes); + } + + public long toEncoded() { + return (filenameId << Permission.NB_BITS_PER_TYPE) + Permission.Type.toEncoded(permission); + } + + public static int labelWidth(long numLabels) { + int filenameIdWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); + if (filenameIdWidth > Long.SIZE - Permission.NB_BITS_PER_TYPE) { + System.err.println("FIXME: Too many filenames, we can't handle more than 2^" + + (Long.SIZE - Permission.NB_BITS_PER_TYPE) + " for now."); + System.exit(2); + } + return filenameIdWidth + Permission.NB_BITS_PER_TYPE; + } + + /** + * Permission types present in the Software Heritage graph. + * + * @author The Software Heritage developers + */ + private static class Permission { + public static final int NB_BITS_PER_TYPE = (int) Math + .ceil(Math.log(Permission.Type.values().length) / Math.log(2)); + + public enum Type { + NONE, CONTENT, EXECUTABLE_CONTENT, SYMLINK, DIRECTORY, REVISION; + + public static Permission.Type fromIntCode(int intCode) { + switch (intCode) { + case 0: + return NONE; + case 1: + return CONTENT; + case 2: + return EXECUTABLE_CONTENT; + case 3: + return SYMLINK; + case 4: + return DIRECTORY; + case 5: + return REVISION; + } + throw new IllegalArgumentException("Unknown node permission code: " + intCode); + } + + public static int toIntCode(Permission.Type type) { + switch (type) { + case NONE: + return 0; + case CONTENT: + return 1; + case EXECUTABLE_CONTENT: + return 2; + case SYMLINK: + return 3; + case DIRECTORY: + return 4; + case REVISION: + return 5; + } + throw new IllegalArgumentException("Unknown node permission type: " + type); + } + + public static Permission.Type fromIntPerm(int intPerm) { + switch (intPerm) { + case 0: + return NONE; + case 0100644: + return CONTENT; + case 0100755: + return EXECUTABLE_CONTENT; + case 0120000: + return SYMLINK; + case 0040000: + return DIRECTORY; + case 0160000: + return REVISION; + default : + return NONE; + } + // throw new IllegalArgumentException("Unknown node permission: " + intPerm); + // TODO: warning here instead? + } + + public static int toIntPerm(Permission.Type type) { + switch (type) { + case NONE: + return 0; + case CONTENT: + return 0100644; + case EXECUTABLE_CONTENT: + return 0100755; + case SYMLINK: + return 0120000; + case DIRECTORY: + return 0040000; + case REVISION: + return 0160000; + } + throw new IllegalArgumentException("Unknown node permission type: " + type); + } + + public static int fromEncoded(int encoded) { + return toIntPerm(fromIntCode(encoded)); + } + + public static int toEncoded(int permission) { + return toIntCode(fromIntPerm(permission)); + } + } + } + +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java new file mode 100644 index 0000000..f99ec3e --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java @@ -0,0 +1,108 @@ +// TODO: should be in webgraph upstream +// package it.unimi.dsi.big.webgraph.labelling; +package org.softwareheritage.graph.labels; + +/* + * Copyright (C) 2020 TODO + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + + +import it.unimi.dsi.big.webgraph.labelling.Label; +import it.unimi.dsi.fastutil.longs.LongArrays; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; + +import java.io.IOException; +import java.util.Arrays; + +/** A list of longs represented in fixed width. Each list is prefixed by its + * length written in {@linkplain OutputBitStream#writeGamma(int) γ + * coding}. + */ + +public class FixedWidthLongListLabel extends org.softwareheritage.graph.labels.AbstractLongListLabel { + /** The bit width used to represent the value of this label. */ + private final int width; + + /** Creates a new fixed-width long label. + * + * @param key the (only) key of this label. + * @param width the label width (in bits). + * @param value the value of this label. + */ + public FixedWidthLongListLabel(String key, int width, long[] value) { + super(key, value); + for(int i = value.length; i-- != 0;) if (value[i] < 0 || value[i] >= 1L << width) throw new IllegalArgumentException("Value out of range: " + Long.toString(value[i])); + this.width = width; + } + + /** Creates a new fixed-width label with an empty list. + * + * @param key the (only) key of this label. + * @param width the label width (in bits). + */ + public FixedWidthLongListLabel(String key, int width) { + this(key, width, LongArrays.EMPTY_ARRAY); + } + + /** Creates a new fixed-width long label using the given key and width + * with an empty list. + * + * @param arg two strings containing the key and the width of this label. + */ + public FixedWidthLongListLabel(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } + + @Override + public Label copy() { + return new FixedWidthLongListLabel(key, width, value.clone()); + } + + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + long readBits = inputBitStream.readBits(); + value = new long[inputBitStream.readGamma()]; + for(int i = 0; i < value.length; i++) value[i] = inputBitStream.readLong(width); + return (int)(inputBitStream.readBits() - readBits); + } + + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + int bits = outputBitStream.writeGamma(value.length); + for(int i = 0; i < value.length; i++) bits += outputBitStream.writeLong(value[i], width); + return bits; + } + + /** Returns -1 (the fixed width refers to a single long, not to the entire list). + * @return -1; + */ + @Override + public int fixedWidth() { + return -1; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } + + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java new file mode 100644 index 0000000..efeef38 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java @@ -0,0 +1,108 @@ +package org.softwareheritage.graph.labels; + +import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.Label; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Software Heritage graph edge labels following Webgraph labels convention. + * + * @author The Software Heritage developers + */ + +public class SwhLabel extends AbstractLabel { + private final String key; + private final int width; + // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry to store branch names) + public DirEntry[] value; + // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long + private final FixedWidthLongListLabel longList; + + private static final DirEntry[] EMPTY_ARRAY = {}; + + public SwhLabel(String key, int width, DirEntry[] value) { + this.key = key; + this.width = width; + this.value = value; + + long[] valueEncoded = new long[value.length]; + for (int i = 0; i < value.length; i++) + valueEncoded[i] = value[i].toEncoded(); + this.longList = new FixedWidthLongListLabel(key, width, valueEncoded); + } + + public SwhLabel(String key, int width) { + this(key, width, EMPTY_ARRAY); + } + + public SwhLabel(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } + + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + int ret = longList.fromBitStream(inputBitStream, sourceUnused); + // Decode values from their internal long representation + value = new DirEntry[longList.value.length]; + for (int i = 0; i < value.length; i++) + value[i] = new DirEntry(longList.value[i]); + return ret; + } + + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + // Values have already been encoded in the SwhLabel constructor + return longList.toBitStream(outputBitStream, sourceUnused); + } + + @Override + public String wellKnownAttributeKey() { + return key; + } + + @Override + public String[] attributeKeys() { + return new String[]{key}; + } + + @Override + public Class[] attributeTypes() { + return new Class[]{DirEntry[].class}; + } + + @Override + public Object get(String s) { + if (this.key.equals(key)) + return value; + throw new IllegalArgumentException(); + } + + @Override + public Object get() { + return value; + } + + @Override + public Label copy() { + return new SwhLabel(key, width, value.clone()); + } + + @Override + public int fixedWidth() { + return -1; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } + + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java index 0d87a61..47905cd 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java @@ -1,215 +1,541 @@ package org.softwareheritage.graph.maps; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; -import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; +import it.unimi.dsi.fastutil.bytes.ByteArrays; import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.io.FastBufferedInputStream; import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.longs.LongIterator; import it.unimi.dsi.fastutil.objects.Object2LongFunction; -import it.unimi.dsi.io.FastBufferedReader; -import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.labels.DirEntry; +import org.softwareheritage.graph.labels.SwhLabel; import java.io.*; +import java.lang.reflect.Array; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.TimeUnit; public class LabelMapBuilder { - final static String SORT_BUFFER_SIZE = "40%"; - final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class); + String graphPath; + String debugPath; + String tmpDir; + ImmutableGraph graph; + + Object2LongFunction swhIdMph; + long[][] orderMap; + Object2LongFunction filenameMph; + long numFilenames; + int totalLabelWidth; + + public LabelMapBuilder(String graphPath, String debugPath, String tmpDir) { + this.graphPath = graphPath; + this.debugPath = debugPath; + this.tmpDir = tmpDir; + } + private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("debugPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'd', "debug-path", "Store the intermediate representation here for debug"), new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 't', "tmp", "Temporary directory path"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } - public static void main(String[] args) throws IOException { + public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String tmpDir = config.getString("tmpDir"); String debugPath = config.getString("debugPath"); - logger.info("Starting label map generation..."); - computeLabelMap(graphPath, debugPath, tmpDir); - logger.info("Label map generation ended."); + LabelMapBuilder builder = new LabelMapBuilder(graphPath, debugPath, tmpDir); + + logger.info("Loading graph and MPH functions..."); + builder.loadGraph(); + + // builder.computeLabelMapSort(); + builder.computeLabelMapBsort(); } @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast - static Object2LongFunction loadMPH(String mphBasename) throws IOException { - Object2LongFunction mphMap = null; + static Object2LongFunction loadMPH(String mphBasename) throws IOException { + Object2LongFunction mphMap = null; try { - logger.info("loading MPH function..."); - mphMap = (Object2LongFunction) BinIO.loadObject(mphBasename + ".mph"); - logger.info("MPH function loaded"); + mphMap = (Object2LongFunction) BinIO.loadObject(mphBasename + ".mph"); } catch (ClassNotFoundException e) { logger.error("unknown class object in .mph file: " + e); System.exit(2); } return mphMap; } - static long getMPHSize(Object2LongFunction mph) { + static long getMPHSize(Object2LongFunction mph) { return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size(); } - static long SwhIDToNode(String strSWHID, Object2LongFunction mphMap, long[][] orderMap) { - long mphId = mphMap.getLong(strSWHID); - return BigArrays.get(orderMap, mphId); + void computeLabelMapSort() throws IOException { + // Pass the intermediate representation to sort(1) so that we see the labels in the order they will + // appear in the label file. + ProcessBuilder processBuilder = new ProcessBuilder(); + processBuilder.command("sort", "-k1,1n", "-k2,2n", // Numerical sort + "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); + Process sort = processBuilder.start(); + BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); + // BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); + FastBufferedInputStream sort_stdout = new FastBufferedInputStream(sort.getInputStream()); + + final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); + hashLabelStream(fbis, new EdgeLabelLineWriter() { + @Override + public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { + sort_stdin.write((src + "\t" + dst + "\t" + filenameId + "\t" + permission + "\n") + .getBytes(StandardCharsets.US_ASCII)); + } + }); + sort_stdin.close(); + + EdgeLabelLineIterator mapLines = new TextualEdgeLabelLineIterator(sort_stdout); + writeLabels(mapLines); + logger.info("Done"); + } + + void computeLabelMapBsort() throws IOException, InterruptedException { + // Pass the intermediate representation to bsort(1) so that we see the labels in the order they will + // appear in the label file. + + String tmpFile = tmpDir + "/labelsToSort.bin"; + final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); + final DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmpFile))); + + // Number of bytes to represent a node. + final int nodeBytes = (Long.SIZE - Long.numberOfLeadingZeros(graph.numNodes())) / 8 + 1; + ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); + logger.info("Writing labels to a packed binary files (node bytes: {})", nodeBytes); + + hashLabelStream(fbis, new EdgeLabelLineWriter() { + @Override + public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { + buffer.putLong(0, src); + out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); + buffer.putLong(0, dst); + out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); + out.writeLong(filenameId); + out.writeInt(permission); + } + }); + + ProcessBuilder processBuilder = new ProcessBuilder(); + processBuilder.command( + "/home/seirl/bsort/src/bsort", + "-v", + "-r", String.valueOf(nodeBytes * 2 + Long.BYTES + Integer.BYTES), + "-k", String.valueOf(nodeBytes * 2), + tmpFile + ); + Process sort = processBuilder.start(); + sort.waitFor(); + + final DataInputStream sortedLabels = new DataInputStream(new BufferedInputStream(new FileInputStream(tmpFile))); + BinaryEdgeLabelLineIterator mapLines = new BinaryEdgeLabelLineIterator(sortedLabels, nodeBytes); + writeLabels(mapLines); + + logger.info("Done"); } - static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException { - // Compute intermediate representation as: "

*

* * @author The Software Heritage developers */ public class NodeMapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class); /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException { if (args.length != 2) { logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV"); System.exit(1); } String graphPath = args[0]; String tmpDir = args[1]; logger.info("starting maps generation..."); precomputeNodeIdMap(graphPath, tmpDir); logger.info("maps generation completed"); } /** * Computes and dumps on disk mapping files. * * @param graphPath path of the compressed graph */ // Suppress warning for Object2LongFunction cast @SuppressWarnings("unchecked") static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plSWHID2Node.itemsName = "swhid→node"; plNode2SWHID.itemsName = "node→swhid"; /* * avg speed for swhid→node is sometime skewed due to write to the sort pipe hanging when sort is * sorting; hence also desplay local speed */ plSWHID2Node.displayLocalSpeed = true; // first half of SWHID->node mapping: SWHID -> WebGraph MPH (long) - Object2LongFunction mphMap = null; + Object2LongFunction mphMap = null; try { logger.info("loading MPH function..."); - mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); + mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); logger.info("MPH function loaded"); } catch (ClassNotFoundException e) { logger.error("unknown class object in .mph file: " + e); System.exit(2); } long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); plSWHID2Node.expectedUpdates = nbIds; plNode2SWHID.expectedUpdates = nbIds; // second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); logger.info("loading BFS order file..."); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); logger.info("BFS order file loaded"); if (loaded != nbIds) { logger.error("graph contains " + nbIds + " nodes, but read " + loaded); System.exit(2); } /* * Create mapping SWHID -> WebGraph node id, by sequentially reading nodes, hashing them with MPH, * and permuting according to BFS order */ FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator swhidIterator = new LineIterator(buffer); /* * The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically * sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the * sorting job to /usr/bin/sort via pipes */ ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); // for the binary format of swhidToNodeMap, see Python module swh.graph.swhid:SwhidToIntMap // for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap try (DataOutputStream swhidToNodeMap = new DataOutputStream( new BufferedOutputStream(new FileOutputStream(graphPath + Graph.SWHID_TO_NODE))); BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream( new FileOutputStream(graphPath + Graph.NODE_TO_SWHID))) { /* * background handler for sort output, it will be fed SWHID/node pairs while swhidToNodeMap is being * filled, and will itself fill nodeToSwhidMap as soon as data from sort is ready */ SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID); outputHandler.start(); /* * Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to * efficiently check edge restrictions. */ final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); final int nbBitsPerNodeType = log2NbTypes; LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); plSWHID2Node.start("filling swhid2node map"); for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) { String swhidStr = swhidIterator.next().toString(); SWHID swhid = new SWHID(swhidStr); byte[] swhidBin = swhid.toBytes(); - long mphId = mphMap.getLong(swhidStr); + long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII)); long nodeId = BigArrays.get(bfsMap, mphId); swhidToNodeMap.write(swhidBin, 0, swhidBin.length); swhidToNodeMap.writeLong(nodeId); sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII)); nodeTypesMap.set(nodeId, swhid.getType().ordinal()); plSWHID2Node.lightUpdate(); } plSWHID2Node.done(); sort_stdin.close(); // write type map logger.info("storing type map"); BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); logger.info("type map stored"); // wait for nodeToSwhidMap filling try { logger.info("waiting for node2swhid map..."); int sortExitCode = sort.waitFor(); if (sortExitCode != 0) { logger.error("sort returned non-zero exit code: " + sortExitCode); System.exit(2); } outputHandler.join(); } catch (InterruptedException e) { logger.error("processing of sort output failed with: " + e); System.exit(2); } } } private static class SortOutputHandler extends Thread { private Scanner input; private OutputStream output; private ProgressLogger pl; SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) { this.input = new Scanner(input, StandardCharsets.US_ASCII); this.output = output; this.pl = pl; } public void run() { boolean sortDone = false; logger.info("node2swhid: waiting for sort output..."); while (input.hasNextLine()) { if (!sortDone) { sortDone = true; this.pl.start("filling node2swhid map"); } String line = input.nextLine(); // format: SWHID NODE_ID SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID try { output.write((byte[]) swhid.toBytes()); } catch (IOException e) { logger.error("writing to node->SWHID map failed with: " + e); } this.pl.lightUpdate(); } this.pl.done(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java new file mode 100644 index 0000000..1938355 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java @@ -0,0 +1,81 @@ +package org.softwareheritage.graph.utils; + +import com.google.common.primitives.Longs; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.fastutil.Arrays; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.io.ByteDiskQueue; +import it.unimi.dsi.io.FastBufferedReader; +import it.unimi.dsi.io.LineIterator; +import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.SWHID; +import org.softwareheritage.graph.experiments.topology.ConnectedComponents; + +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; + +public class ExportSubdataset { + @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast + static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { + return (Object2LongFunction) BinIO.loadObject(mphPath); + } + + public static void main(String[] args) throws IOException, ClassNotFoundException { + System.err.print("Loading everything..."); + String graphPath = args[0]; + Graph graph = new Graph(graphPath); + Object2LongFunction mphMap = loadMPH(graphPath + ".mph"); + System.err.println(" done."); + + final long n = graph.numNodes(); + + // Allow enough memory to behave like in-memory queue + int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); + + // Use a disk based queue to store BFS frontier + final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue"); + final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); + final byte[] byteBuf = new byte[Long.BYTES]; + // WARNING: no 64-bit version of this data-structure, but it can support + // indices up to 2^37 + LongArrayBitVector visited = LongArrayBitVector.ofLength(n); + + FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); + LineIterator lineIterator = new LineIterator(buffer); + + while (lineIterator.hasNext()) { + String line = lineIterator.next().toString(); + long i; + try { + // i = mphMap.getLong(line.getBytes(StandardCharsets.UTF_8)); + i = graph.getNodeId(new SWHID(line)); + } catch (IllegalArgumentException e) { + continue; + } + + queue.enqueue(Longs.toByteArray(i)); + visited.set(i); + + while (!queue.isEmpty()) { + queue.dequeue(byteBuf); + final long currentNode = Longs.fromByteArray(byteBuf); + SWHID currentNodeSWHID = graph.getSWHID(currentNode); + + final LazyLongIterator iterator = graph.successors(currentNode); + long succ; + while ((succ = iterator.nextLong()) != -1) { + System.out.format("%s %s\n", currentNodeSWHID, graph.getSWHID(succ)); + if (visited.getBoolean(succ)) + continue; + visited.set(succ); + queue.enqueue(Longs.toByteArray(succ)); + } + } + + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java index ca7012b..3be8ae0 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java @@ -1,50 +1,51 @@ package org.softwareheritage.graph.utils; import com.martiansoftware.jsap.*; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class MPHTranslate { private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(MPHTranslate.class.getName(), "", new Parameter[]{new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.REQUIRED, "Filename of the serialized MPH"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast - static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { - return (Object2LongFunction) BinIO.loadObject(mphPath); + static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { + return (Object2LongFunction) BinIO.loadObject(mphPath); } public static void main(String[] args) throws IOException, ClassNotFoundException { JSAPResult config = parse_args(args); String mphPath = config.getString("function"); - Object2LongFunction mphMap = loadMPH(mphPath); + Object2LongFunction mphMap = loadMPH(mphPath); + // TODO: wasteful to convert to/from bytes FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator lineIterator = new LineIterator(buffer); while (lineIterator.hasNext()) { String line = lineIterator.next().toString(); - System.out.println(mphMap.getLong(line)); + System.out.println(mphMap.getLong(line.getBytes(StandardCharsets.US_ASCII))); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java index 0e0f604..5380bb8 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java @@ -1,40 +1,42 @@ package org.softwareheritage.graph.utils; +import it.unimi.dsi.big.util.FrontCodedStringBigList; +import it.unimi.dsi.big.util.PermutedFrontCodedStringBigList; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.fastutil.io.BinIO; -import it.unimi.dsi.util.PermutedFrontCodedStringList; +import org.softwareheritage.graph.labels.DirEntry; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; public class ReadLabelledGraph { public static void main(String[] args) throws IOException, ClassNotFoundException { String graphPath = args[0]; ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.loadOffline(graphPath + "-labelled"); NodeIdMap nodeMap = new NodeIdMap(graphPath, graph.numNodes()); - PermutedFrontCodedStringList labelMap = (PermutedFrontCodedStringList) BinIO + FrontCodedStringBigList filenameMap = (FrontCodedStringBigList) BinIO .loadObject(graphPath + "-labels.fcl"); ArcLabelledNodeIterator it = graph.nodeIterator(); while (it.hasNext()) { long srcNode = it.nextLong(); ArcLabelledNodeIterator.LabelledArcIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { - int[] labels = (int[]) s.label().get(); + DirEntry[] labels = (DirEntry[]) s.label().get(); if (labels.length > 0) { - for (int label : labels) { - System.out.format("%s %s %s\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode), - labelMap.get(label)); + for (DirEntry label : labels) { + System.out.format("%s %s %s %d\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode), + filenameMap.get(label.filenameId), label.permission); } } else { System.out.format("%s %s\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode)); } } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java new file mode 100644 index 0000000..52491b2 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java @@ -0,0 +1,57 @@ +package org.softwareheritage.graph.utils; + +import it.unimi.dsi.fastutil.BigArrays; +import it.unimi.dsi.fastutil.Size64; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.io.FastBufferedReader; +import it.unimi.dsi.io.LineIterator; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; + +public class WriteRevisionTimestamps { + @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast + static Object2LongFunction loadMPH(String mphPath) throws IOException, ClassNotFoundException { + return (Object2LongFunction) BinIO.loadObject(mphPath); + } + + public static void main(String[] args) throws IOException, ClassNotFoundException { + System.err.print("Loading everything..."); + String graphPath = args[0]; + String outputFile = args[1]; + Object2LongFunction mphMap = loadMPH(graphPath + ".mph"); + long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); + long[][] nodePerm = BinIO.loadLongsBig(graphPath + ".order"); + // NodeIdMap nodeIdMap = new NodeIdMap(graphPath, nbIds); + long[][] timestampArray = LongBigArrays.newBigArray(nbIds); + BigArrays.fill(timestampArray, Long.MIN_VALUE); + System.err.println(" done."); + + // TODO: wasteful to convert to/from bytes + FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); + LineIterator lineIterator = new LineIterator(buffer); + + while (lineIterator.hasNext()) { + String line = lineIterator.next().toString(); + String[] line_elements = line.split("[ \\t]"); + + // SWHID currentRev = new SWHID(line_elements[0].strip()); + long revId = -1; + long timestamp = -1; + try { + // revId = nodeIdMap.getNodeId(currentRev); + long revHash = mphMap.getLong(line_elements[0].strip().getBytes(StandardCharsets.US_ASCII)); + revId = BigArrays.get(nodePerm, revHash); + timestamp = Long.parseLong(line_elements[1].strip()); + } catch (IllegalArgumentException e) { + continue; + } + BigArrays.set(timestampArray, revId, timestamp); + // System.err.println(revId + " " + timestamp); + } + BinIO.storeLongs(timestampArray, outputFile); + } +}