diff --git a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java new file mode 100644 index 0000000..4f2dd00 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java @@ -0,0 +1,95 @@ +// TODO: should be in webgraph upstream +// package it.unimi.dsi.big.webgraph.labelling; +package org.softwareheritage.graph.labels; + +/* + * Copyright (C) 2020 TODO + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.Label; + +import java.util.Arrays; + +/** An abstract (single-attribute) list-of-longs label. +* +*

This class provides basic methods for a label holding a list of longs. +* Concrete implementations may impose further requirements on the long. +* +*

Implementing subclasses must provide constructors, {@link Label#copy()}, +* {@link Label#fromBitStream(it.unimi.dsi.io.InputBitStream, int)}, {@link Label#toBitStream(it.unimi.dsi.io.OutputBitStream, int)} +* and possibly override {@link #toString()}. +*/ + +public abstract class AbstractLongListLabel extends AbstractLabel implements Label { + /** The key of the attribute represented by this label. */ + protected final String key; + /** The values of the attribute represented by this label. */ + public long[] value; + + /** Creates an long label with given key and value. + * + * @param key the (only) key of this label. + * @param value the value of this label. + */ + public AbstractLongListLabel(String key, long[] value) { + this.key = key; + this.value = value; + } + + @Override + public String wellKnownAttributeKey() { + return key; + } + + @Override + public String[] attributeKeys() { + return new String[] { key }; + } + + @Override + public Class[] attributeTypes() { + return new Class[] { long[].class }; + } + + @Override + public Object get(String key) { + if (this.key.equals(key)) return value; + throw new IllegalArgumentException(); + } + + @Override + public Object get() { + return value; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value); + } + + @Override + public boolean equals(Object x) { + if (x instanceof AbstractLongListLabel) return Arrays.equals(value, ((AbstractLongListLabel)x).value); + else return false; + } + + @Override + public int hashCode() { + return Arrays.hashCode(value); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java new file mode 100644 index 0000000..a614a7e --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java @@ -0,0 +1,131 @@ +package org.softwareheritage.graph.labels; + +/** + * Directory entries metadata are stored as edge labels on the graph. {@link DirEntry} can be encoded in a single long + * type, to re-use Webgraph interface. + * + * @author The Software Heritage developers + */ +public class DirEntry { + public long filenameId; + public int permission; + + public DirEntry(long filenameId, int permission) + { + this.filenameId = filenameId; + this.permission = permission; + } + + public DirEntry(long dirEntryEncoded) + { + this.filenameId = dirEntryEncoded >> Permission.NB_BITS_PER_TYPE; + int dirBytes = (int) (dirEntryEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1)); + this.permission = Permission.Type.fromEncoded(dirBytes); + } + + public long toEncoded() + { + return (filenameId << Permission.NB_BITS_PER_TYPE) + Permission.Type.toEncoded(permission); + } + + public static int labelWidth(long numLabels) + { + int filenameIdWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); + if (filenameIdWidth > Long.SIZE - Permission.NB_BITS_PER_TYPE) { + System.err.println("FIXME: Too many filenames, we can't handle more than 2^" + (Long.SIZE - Permission.NB_BITS_PER_TYPE) + " for now."); + System.exit(2); + } + return filenameIdWidth + Permission.NB_BITS_PER_TYPE; + } + + /** + * Permission types present in the Software Heritage graph. + * + * @author The Software Heritage developers + */ + private static class Permission { + public static final int NB_BITS_PER_TYPE = + (int) Math.ceil(Math.log(Permission.Type.values().length) / Math.log(2)); + + public enum Type { + CONTENT, + EXECUTABLE_CONTENT, + SYMLINK, + DIRECTORY, + REVISION; + + public static Permission.Type fromIntCode(int intCode) { + switch (intCode) { + case 0: + return CONTENT; + case 1: + return EXECUTABLE_CONTENT; + case 2: + return SYMLINK; + case 3: + return DIRECTORY; + case 4: + return REVISION; + } + throw new IllegalArgumentException("Unknown node permission code: " + intCode); + } + + public static int toIntCode(Permission.Type type) { + switch (type) { + case CONTENT: + return 0; + case EXECUTABLE_CONTENT: + return 1; + case SYMLINK: + return 2; + case DIRECTORY: + return 3; + case REVISION: + return 4; + } + throw new IllegalArgumentException("Unknown node permission type: " + type); + } + + public static Permission.Type fromIntPerm(int intPerm) { + switch (intPerm) { + case 0100644: + return CONTENT; + case 0100755: + return EXECUTABLE_CONTENT; + case 0120000: + return SYMLINK; + case 0040000: + return DIRECTORY; + case 0160000: + return REVISION; + } + throw new IllegalArgumentException("Unknown node permission: " + intPerm); + } + + public static int toIntPerm(Permission.Type type) { + switch (type) { + case CONTENT: + return 0100644; + case EXECUTABLE_CONTENT: + return 0100755; + case SYMLINK: + return 0120000; + case DIRECTORY: + return 0040000; + case REVISION: + return 0160000; + } + throw new IllegalArgumentException("Unknown node permission type: " + type); + } + + public static int fromEncoded(int encoded) { + return toIntPerm(fromIntCode(encoded)); + } + + public static int toEncoded(int permission) { + return toIntCode(fromIntPerm(permission)); + } + } + } + +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java new file mode 100644 index 0000000..f99ec3e --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java @@ -0,0 +1,108 @@ +// TODO: should be in webgraph upstream +// package it.unimi.dsi.big.webgraph.labelling; +package org.softwareheritage.graph.labels; + +/* + * Copyright (C) 2020 TODO + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + + +import it.unimi.dsi.big.webgraph.labelling.Label; +import it.unimi.dsi.fastutil.longs.LongArrays; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; + +import java.io.IOException; +import java.util.Arrays; + +/** A list of longs represented in fixed width. Each list is prefixed by its + * length written in {@linkplain OutputBitStream#writeGamma(int) γ + * coding}. + */ + +public class FixedWidthLongListLabel extends org.softwareheritage.graph.labels.AbstractLongListLabel { + /** The bit width used to represent the value of this label. */ + private final int width; + + /** Creates a new fixed-width long label. + * + * @param key the (only) key of this label. + * @param width the label width (in bits). + * @param value the value of this label. + */ + public FixedWidthLongListLabel(String key, int width, long[] value) { + super(key, value); + for(int i = value.length; i-- != 0;) if (value[i] < 0 || value[i] >= 1L << width) throw new IllegalArgumentException("Value out of range: " + Long.toString(value[i])); + this.width = width; + } + + /** Creates a new fixed-width label with an empty list. + * + * @param key the (only) key of this label. + * @param width the label width (in bits). + */ + public FixedWidthLongListLabel(String key, int width) { + this(key, width, LongArrays.EMPTY_ARRAY); + } + + /** Creates a new fixed-width long label using the given key and width + * with an empty list. + * + * @param arg two strings containing the key and the width of this label. + */ + public FixedWidthLongListLabel(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } + + @Override + public Label copy() { + return new FixedWidthLongListLabel(key, width, value.clone()); + } + + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + long readBits = inputBitStream.readBits(); + value = new long[inputBitStream.readGamma()]; + for(int i = 0; i < value.length; i++) value[i] = inputBitStream.readLong(width); + return (int)(inputBitStream.readBits() - readBits); + } + + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + int bits = outputBitStream.writeGamma(value.length); + for(int i = 0; i < value.length; i++) bits += outputBitStream.writeLong(value[i], width); + return bits; + } + + /** Returns -1 (the fixed width refers to a single long, not to the entire list). + * @return -1; + */ + @Override + public int fixedWidth() { + return -1; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } + + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java new file mode 100644 index 0000000..efeef38 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java @@ -0,0 +1,108 @@ +package org.softwareheritage.graph.labels; + +import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.Label; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Software Heritage graph edge labels following Webgraph labels convention. + * + * @author The Software Heritage developers + */ + +public class SwhLabel extends AbstractLabel { + private final String key; + private final int width; + // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry to store branch names) + public DirEntry[] value; + // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long + private final FixedWidthLongListLabel longList; + + private static final DirEntry[] EMPTY_ARRAY = {}; + + public SwhLabel(String key, int width, DirEntry[] value) { + this.key = key; + this.width = width; + this.value = value; + + long[] valueEncoded = new long[value.length]; + for (int i = 0; i < value.length; i++) + valueEncoded[i] = value[i].toEncoded(); + this.longList = new FixedWidthLongListLabel(key, width, valueEncoded); + } + + public SwhLabel(String key, int width) { + this(key, width, EMPTY_ARRAY); + } + + public SwhLabel(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } + + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + int ret = longList.fromBitStream(inputBitStream, sourceUnused); + // Decode values from their internal long representation + value = new DirEntry[longList.value.length]; + for (int i = 0; i < value.length; i++) + value[i] = new DirEntry(longList.value[i]); + return ret; + } + + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + // Values have already been encoded in the SwhLabel constructor + return longList.toBitStream(outputBitStream, sourceUnused); + } + + @Override + public String wellKnownAttributeKey() { + return key; + } + + @Override + public String[] attributeKeys() { + return new String[]{key}; + } + + @Override + public Class[] attributeTypes() { + return new Class[]{DirEntry[].class}; + } + + @Override + public Object get(String s) { + if (this.key.equals(key)) + return value; + throw new IllegalArgumentException(); + } + + @Override + public Object get() { + return value; + } + + @Override + public Label copy() { + return new SwhLabel(key, width, value.clone()); + } + + @Override + public int fixedWidth() { + return -1; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } + + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java index 0d87a61..e7cf34d 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java @@ -1,215 +1,223 @@ package org.softwareheritage.graph.maps; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; -import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.labels.DirEntry; +import org.softwareheritage.graph.labels.SwhLabel; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.TimeUnit; public class LabelMapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class); private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("debugPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'd', "debug-path", "Store the intermediate representation here for debug"), new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 't', "tmp", "Temporary directory path"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String tmpDir = config.getString("tmpDir"); String debugPath = config.getString("debugPath"); logger.info("Starting label map generation..."); computeLabelMap(graphPath, debugPath, tmpDir); logger.info("Label map generation ended."); } @SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast static Object2LongFunction loadMPH(String mphBasename) throws IOException { Object2LongFunction mphMap = null; try { logger.info("loading MPH function..."); mphMap = (Object2LongFunction) BinIO.loadObject(mphBasename + ".mph"); logger.info("MPH function loaded"); } catch (ClassNotFoundException e) { logger.error("unknown class object in .mph file: " + e); System.exit(2); } return mphMap; } static long getMPHSize(Object2LongFunction mph) { return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size(); } static long SwhIDToNode(String strSWHID, Object2LongFunction mphMap, long[][] orderMap) { long mphId = mphMap.getLong(strSWHID); return BigArrays.get(orderMap, mphId); } static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException { // Compute intermediate representation as: "