diff --git a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java index 4f2dd00..d71d7d8 100644 --- a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java +++ b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java @@ -1,95 +1,103 @@ // TODO: should be in webgraph upstream // package it.unimi.dsi.big.webgraph.labelling; package org.softwareheritage.graph.labels; /* * Copyright (C) 2020 TODO * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see . * */ import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; import it.unimi.dsi.big.webgraph.labelling.Label; import java.util.Arrays; -/** An abstract (single-attribute) list-of-longs label. -* -*

This class provides basic methods for a label holding a list of longs. -* Concrete implementations may impose further requirements on the long. -* -*

Implementing subclasses must provide constructors, {@link Label#copy()}, -* {@link Label#fromBitStream(it.unimi.dsi.io.InputBitStream, int)}, {@link Label#toBitStream(it.unimi.dsi.io.OutputBitStream, int)} -* and possibly override {@link #toString()}. -*/ +/** + * An abstract (single-attribute) list-of-longs label. + * + *

+ * This class provides basic methods for a label holding a list of longs. Concrete implementations + * may impose further requirements on the long. + * + *

+ * Implementing subclasses must provide constructors, {@link Label#copy()}, + * {@link Label#fromBitStream(it.unimi.dsi.io.InputBitStream, int)}, + * {@link Label#toBitStream(it.unimi.dsi.io.OutputBitStream, int)} and possibly override + * {@link #toString()}. + */ public abstract class AbstractLongListLabel extends AbstractLabel implements Label { - /** The key of the attribute represented by this label. */ - protected final String key; - /** The values of the attribute represented by this label. */ - public long[] value; + /** The key of the attribute represented by this label. */ + protected final String key; + /** The values of the attribute represented by this label. */ + public long[] value; - /** Creates an long label with given key and value. - * - * @param key the (only) key of this label. - * @param value the value of this label. - */ - public AbstractLongListLabel(String key, long[] value) { - this.key = key; - this.value = value; - } + /** + * Creates an long label with given key and value. + * + * @param key the (only) key of this label. + * @param value the value of this label. + */ + public AbstractLongListLabel(String key, long[] value) { + this.key = key; + this.value = value; + } - @Override - public String wellKnownAttributeKey() { - return key; - } + @Override + public String wellKnownAttributeKey() { + return key; + } - @Override - public String[] attributeKeys() { - return new String[] { key }; - } + @Override + public String[] attributeKeys() { + return new String[]{key}; + } - @Override - public Class[] attributeTypes() { - return new Class[] { long[].class }; - } + @Override + public Class[] attributeTypes() { + return new Class[]{long[].class}; + } - @Override - public Object get(String key) { - if (this.key.equals(key)) return value; - throw new IllegalArgumentException(); - } + @Override + public Object get(String key) { + if (this.key.equals(key)) + return value; + throw new IllegalArgumentException(); + } - @Override - public Object get() { - return value; - } + @Override + public Object get() { + return value; + } - @Override - public String toString() { - return key + ":" + Arrays.toString(value); - } + @Override + public String toString() { + return key + ":" + Arrays.toString(value); + } - @Override - public boolean equals(Object x) { - if (x instanceof AbstractLongListLabel) return Arrays.equals(value, ((AbstractLongListLabel)x).value); - else return false; - } + @Override + public boolean equals(Object x) { + if (x instanceof AbstractLongListLabel) + return Arrays.equals(value, ((AbstractLongListLabel) x).value); + else + return false; + } - @Override - public int hashCode() { - return Arrays.hashCode(value); - } + @Override + public int hashCode() { + return Arrays.hashCode(value); + } } diff --git a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java index f99ec3e..f1672d9 100644 --- a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java +++ b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java @@ -1,108 +1,115 @@ // TODO: should be in webgraph upstream // package it.unimi.dsi.big.webgraph.labelling; package org.softwareheritage.graph.labels; /* * Copyright (C) 2020 TODO * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see . * */ - import it.unimi.dsi.big.webgraph.labelling.Label; import it.unimi.dsi.fastutil.longs.LongArrays; import it.unimi.dsi.io.InputBitStream; import it.unimi.dsi.io.OutputBitStream; import java.io.IOException; import java.util.Arrays; -/** A list of longs represented in fixed width. Each list is prefixed by its - * length written in {@linkplain OutputBitStream#writeGamma(int) γ - * coding}. +/** + * A list of longs represented in fixed width. Each list is prefixed by its length written in + * {@linkplain OutputBitStream#writeGamma(int) γ coding}. */ public class FixedWidthLongListLabel extends org.softwareheritage.graph.labels.AbstractLongListLabel { - /** The bit width used to represent the value of this label. */ - private final int width; + /** The bit width used to represent the value of this label. */ + private final int width; - /** Creates a new fixed-width long label. - * - * @param key the (only) key of this label. - * @param width the label width (in bits). - * @param value the value of this label. - */ - public FixedWidthLongListLabel(String key, int width, long[] value) { - super(key, value); - for(int i = value.length; i-- != 0;) if (value[i] < 0 || value[i] >= 1L << width) throw new IllegalArgumentException("Value out of range: " + Long.toString(value[i])); - this.width = width; - } + /** + * Creates a new fixed-width long label. + * + * @param key the (only) key of this label. + * @param width the label width (in bits). + * @param value the value of this label. + */ + public FixedWidthLongListLabel(String key, int width, long[] value) { + super(key, value); + for (int i = value.length; i-- != 0;) + if (value[i] < 0 || value[i] >= 1L << width) + throw new IllegalArgumentException("Value out of range: " + Long.toString(value[i])); + this.width = width; + } - /** Creates a new fixed-width label with an empty list. - * - * @param key the (only) key of this label. - * @param width the label width (in bits). - */ - public FixedWidthLongListLabel(String key, int width) { - this(key, width, LongArrays.EMPTY_ARRAY); - } + /** + * Creates a new fixed-width label with an empty list. + * + * @param key the (only) key of this label. + * @param width the label width (in bits). + */ + public FixedWidthLongListLabel(String key, int width) { + this(key, width, LongArrays.EMPTY_ARRAY); + } - /** Creates a new fixed-width long label using the given key and width - * with an empty list. - * - * @param arg two strings containing the key and the width of this label. - */ - public FixedWidthLongListLabel(String... arg) { - this(arg[0], Integer.parseInt(arg[1])); - } + /** + * Creates a new fixed-width long label using the given key and width with an empty list. + * + * @param arg two strings containing the key and the width of this label. + */ + public FixedWidthLongListLabel(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } - @Override - public Label copy() { - return new FixedWidthLongListLabel(key, width, value.clone()); - } + @Override + public Label copy() { + return new FixedWidthLongListLabel(key, width, value.clone()); + } - @Override - public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { - long readBits = inputBitStream.readBits(); - value = new long[inputBitStream.readGamma()]; - for(int i = 0; i < value.length; i++) value[i] = inputBitStream.readLong(width); - return (int)(inputBitStream.readBits() - readBits); - } + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + long readBits = inputBitStream.readBits(); + value = new long[inputBitStream.readGamma()]; + for (int i = 0; i < value.length; i++) + value[i] = inputBitStream.readLong(width); + return (int) (inputBitStream.readBits() - readBits); + } - @Override - public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { - int bits = outputBitStream.writeGamma(value.length); - for(int i = 0; i < value.length; i++) bits += outputBitStream.writeLong(value[i], width); - return bits; - } + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + int bits = outputBitStream.writeGamma(value.length); + for (int i = 0; i < value.length; i++) + bits += outputBitStream.writeLong(value[i], width); + return bits; + } - /** Returns -1 (the fixed width refers to a single long, not to the entire list). - * @return -1; - */ - @Override - public int fixedWidth() { - return -1; - } + /** + * Returns -1 (the fixed width refers to a single long, not to the entire list). + * + * @return -1; + */ + @Override + public int fixedWidth() { + return -1; + } - @Override - public String toString() { - return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; - } + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } - @Override - public String toSpec() { - return this.getClass().getName() + "(" + key + "," + width + ")"; - } + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } } diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java index efeef38..41f7d5f 100644 --- a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java +++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java @@ -1,108 +1,109 @@ package org.softwareheritage.graph.labels; import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; import it.unimi.dsi.big.webgraph.labelling.Label; import it.unimi.dsi.io.InputBitStream; import it.unimi.dsi.io.OutputBitStream; import java.io.IOException; import java.util.Arrays; /** * Software Heritage graph edge labels following Webgraph labels convention. * * @author The Software Heritage developers */ public class SwhLabel extends AbstractLabel { private final String key; private final int width; - // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry to store branch names) + // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry + // to store branch names) public DirEntry[] value; // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long private final FixedWidthLongListLabel longList; private static final DirEntry[] EMPTY_ARRAY = {}; public SwhLabel(String key, int width, DirEntry[] value) { this.key = key; this.width = width; this.value = value; long[] valueEncoded = new long[value.length]; for (int i = 0; i < value.length; i++) valueEncoded[i] = value[i].toEncoded(); this.longList = new FixedWidthLongListLabel(key, width, valueEncoded); } public SwhLabel(String key, int width) { this(key, width, EMPTY_ARRAY); } public SwhLabel(String... arg) { this(arg[0], Integer.parseInt(arg[1])); } @Override public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { int ret = longList.fromBitStream(inputBitStream, sourceUnused); // Decode values from their internal long representation value = new DirEntry[longList.value.length]; for (int i = 0; i < value.length; i++) value[i] = new DirEntry(longList.value[i]); return ret; } @Override public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { // Values have already been encoded in the SwhLabel constructor return longList.toBitStream(outputBitStream, sourceUnused); } @Override public String wellKnownAttributeKey() { return key; } @Override public String[] attributeKeys() { return new String[]{key}; } @Override public Class[] attributeTypes() { return new Class[]{DirEntry[].class}; } @Override public Object get(String s) { if (this.key.equals(key)) return value; throw new IllegalArgumentException(); } @Override public Object get() { return value; } @Override public Label copy() { return new SwhLabel(key, width, value.clone()); } @Override public int fixedWidth() { return -1; } @Override public String toString() { return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; } @Override public String toSpec() { return this.getClass().getName() + "(" + key + "," + width + ")"; } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java index 47905cd..ce140c5 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java @@ -1,541 +1,527 @@ package org.softwareheritage.graph.maps; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.bytes.ByteArrays; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.io.FastBufferedInputStream; import it.unimi.dsi.fastutil.longs.LongBigArrays; -import it.unimi.dsi.fastutil.longs.LongIterator; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.labels.DirEntry; import org.softwareheritage.graph.labels.SwhLabel; import java.io.*; -import java.lang.reflect.Array; -import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.TimeUnit; public class LabelMapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class); String graphPath; String debugPath; String tmpDir; ImmutableGraph graph; Object2LongFunction swhIdMph; long[][] orderMap; Object2LongFunction filenameMph; long numFilenames; int totalLabelWidth; public LabelMapBuilder(String graphPath, String debugPath, String tmpDir) { this.graphPath = graphPath; this.debugPath = debugPath; this.tmpDir = tmpDir; } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("debugPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'd', "debug-path", "Store the intermediate representation here for debug"), new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 't', "tmp", "Temporary directory path"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String tmpDir = config.getString("tmpDir"); String debugPath = config.getString("debugPath"); LabelMapBuilder builder = new LabelMapBuilder(graphPath, debugPath, tmpDir); logger.info("Loading graph and MPH functions..."); builder.loadGraph(); // builder.computeLabelMapSort(); builder.computeLabelMapBsort(); } @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast static Object2LongFunction loadMPH(String mphBasename) throws IOException { Object2LongFunction mphMap = null; try { mphMap = (Object2LongFunction) BinIO.loadObject(mphBasename + ".mph"); } catch (ClassNotFoundException e) { logger.error("unknown class object in .mph file: " + e); System.exit(2); } return mphMap; } static long getMPHSize(Object2LongFunction mph) { return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size(); } void computeLabelMapSort() throws IOException { // Pass the intermediate representation to sort(1) so that we see the labels in the order they will // appear in the label file. ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "-k1,1n", "-k2,2n", // Numerical sort "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); // BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); FastBufferedInputStream sort_stdout = new FastBufferedInputStream(sort.getInputStream()); final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); hashLabelStream(fbis, new EdgeLabelLineWriter() { @Override public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { sort_stdin.write((src + "\t" + dst + "\t" + filenameId + "\t" + permission + "\n") .getBytes(StandardCharsets.US_ASCII)); } }); sort_stdin.close(); EdgeLabelLineIterator mapLines = new TextualEdgeLabelLineIterator(sort_stdout); writeLabels(mapLines); logger.info("Done"); } void computeLabelMapBsort() throws IOException, InterruptedException { // Pass the intermediate representation to bsort(1) so that we see the labels in the order they will // appear in the label file. String tmpFile = tmpDir + "/labelsToSort.bin"; final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); final DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmpFile))); // Number of bytes to represent a node. final int nodeBytes = (Long.SIZE - Long.numberOfLeadingZeros(graph.numNodes())) / 8 + 1; ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); logger.info("Writing labels to a packed binary files (node bytes: {})", nodeBytes); hashLabelStream(fbis, new EdgeLabelLineWriter() { @Override public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { buffer.putLong(0, src); out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); buffer.putLong(0, dst); out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); out.writeLong(filenameId); out.writeInt(permission); } }); ProcessBuilder processBuilder = new ProcessBuilder(); - processBuilder.command( - "/home/seirl/bsort/src/bsort", - "-v", - "-r", String.valueOf(nodeBytes * 2 + Long.BYTES + Integer.BYTES), - "-k", String.valueOf(nodeBytes * 2), - tmpFile - ); + processBuilder.command("/home/seirl/bsort/src/bsort", "-v", "-r", + String.valueOf(nodeBytes * 2 + Long.BYTES + Integer.BYTES), "-k", String.valueOf(nodeBytes * 2), + tmpFile); Process sort = processBuilder.start(); sort.waitFor(); final DataInputStream sortedLabels = new DataInputStream(new BufferedInputStream(new FileInputStream(tmpFile))); BinaryEdgeLabelLineIterator mapLines = new BinaryEdgeLabelLineIterator(sortedLabels, nodeBytes); writeLabels(mapLines); logger.info("Done"); } void loadGraph() throws IOException { graph = BVGraph.loadMapped(graphPath); swhIdMph = loadMPH(graphPath); orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); BinIO.loadLongs(graphPath + ".order", orderMap); filenameMph = loadMPH(graphPath + "-labels"); numFilenames = getMPHSize(filenameMph); totalLabelWidth = DirEntry.labelWidth(numFilenames); } void hashLabelStream(FastBufferedInputStream input, EdgeLabelLineWriter writer) throws IOException { // Compute intermediate representation and write it on : // "