diff --git a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java
new file mode 100644
index 0000000..4f2dd00
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java
@@ -0,0 +1,95 @@
+// TODO: should be in webgraph upstream
+// package it.unimi.dsi.big.webgraph.labelling;
+package org.softwareheritage.graph.labels;
+
+/*
+ * Copyright (C) 2020 TODO
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ *
+ */
+
+import it.unimi.dsi.big.webgraph.labelling.AbstractLabel;
+import it.unimi.dsi.big.webgraph.labelling.Label;
+
+import java.util.Arrays;
+
+/** An abstract (single-attribute) list-of-longs label.
+*
+*
This class provides basic methods for a label holding a list of longs.
+* Concrete implementations may impose further requirements on the long.
+*
+*
Implementing subclasses must provide constructors, {@link Label#copy()},
+* {@link Label#fromBitStream(it.unimi.dsi.io.InputBitStream, int)}, {@link Label#toBitStream(it.unimi.dsi.io.OutputBitStream, int)}
+* and possibly override {@link #toString()}.
+*/
+
+public abstract class AbstractLongListLabel extends AbstractLabel implements Label {
+ /** The key of the attribute represented by this label. */
+ protected final String key;
+ /** The values of the attribute represented by this label. */
+ public long[] value;
+
+ /** Creates an long label with given key and value.
+ *
+ * @param key the (only) key of this label.
+ * @param value the value of this label.
+ */
+ public AbstractLongListLabel(String key, long[] value) {
+ this.key = key;
+ this.value = value;
+ }
+
+ @Override
+ public String wellKnownAttributeKey() {
+ return key;
+ }
+
+ @Override
+ public String[] attributeKeys() {
+ return new String[] { key };
+ }
+
+ @Override
+ public Class>[] attributeTypes() {
+ return new Class[] { long[].class };
+ }
+
+ @Override
+ public Object get(String key) {
+ if (this.key.equals(key)) return value;
+ throw new IllegalArgumentException();
+ }
+
+ @Override
+ public Object get() {
+ return value;
+ }
+
+ @Override
+ public String toString() {
+ return key + ":" + Arrays.toString(value);
+ }
+
+ @Override
+ public boolean equals(Object x) {
+ if (x instanceof AbstractLongListLabel) return Arrays.equals(value, ((AbstractLongListLabel)x).value);
+ else return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return Arrays.hashCode(value);
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java
new file mode 100644
index 0000000..a614a7e
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java
@@ -0,0 +1,131 @@
+package org.softwareheritage.graph.labels;
+
+/**
+ * Directory entries metadata are stored as edge labels on the graph. {@link DirEntry} can be encoded in a single long
+ * type, to re-use Webgraph interface.
+ *
+ * @author The Software Heritage developers
+ */
+public class DirEntry {
+ public long filenameId;
+ public int permission;
+
+ public DirEntry(long filenameId, int permission)
+ {
+ this.filenameId = filenameId;
+ this.permission = permission;
+ }
+
+ public DirEntry(long dirEntryEncoded)
+ {
+ this.filenameId = dirEntryEncoded >> Permission.NB_BITS_PER_TYPE;
+ int dirBytes = (int) (dirEntryEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1));
+ this.permission = Permission.Type.fromEncoded(dirBytes);
+ }
+
+ public long toEncoded()
+ {
+ return (filenameId << Permission.NB_BITS_PER_TYPE) + Permission.Type.toEncoded(permission);
+ }
+
+ public static int labelWidth(long numLabels)
+ {
+ int filenameIdWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2));
+ if (filenameIdWidth > Long.SIZE - Permission.NB_BITS_PER_TYPE) {
+ System.err.println("FIXME: Too many filenames, we can't handle more than 2^" + (Long.SIZE - Permission.NB_BITS_PER_TYPE) + " for now.");
+ System.exit(2);
+ }
+ return filenameIdWidth + Permission.NB_BITS_PER_TYPE;
+ }
+
+ /**
+ * Permission types present in the Software Heritage graph.
+ *
+ * @author The Software Heritage developers
+ */
+ private static class Permission {
+ public static final int NB_BITS_PER_TYPE =
+ (int) Math.ceil(Math.log(Permission.Type.values().length) / Math.log(2));
+
+ public enum Type {
+ CONTENT,
+ EXECUTABLE_CONTENT,
+ SYMLINK,
+ DIRECTORY,
+ REVISION;
+
+ public static Permission.Type fromIntCode(int intCode) {
+ switch (intCode) {
+ case 0:
+ return CONTENT;
+ case 1:
+ return EXECUTABLE_CONTENT;
+ case 2:
+ return SYMLINK;
+ case 3:
+ return DIRECTORY;
+ case 4:
+ return REVISION;
+ }
+ throw new IllegalArgumentException("Unknown node permission code: " + intCode);
+ }
+
+ public static int toIntCode(Permission.Type type) {
+ switch (type) {
+ case CONTENT:
+ return 0;
+ case EXECUTABLE_CONTENT:
+ return 1;
+ case SYMLINK:
+ return 2;
+ case DIRECTORY:
+ return 3;
+ case REVISION:
+ return 4;
+ }
+ throw new IllegalArgumentException("Unknown node permission type: " + type);
+ }
+
+ public static Permission.Type fromIntPerm(int intPerm) {
+ switch (intPerm) {
+ case 0100644:
+ return CONTENT;
+ case 0100755:
+ return EXECUTABLE_CONTENT;
+ case 0120000:
+ return SYMLINK;
+ case 0040000:
+ return DIRECTORY;
+ case 0160000:
+ return REVISION;
+ }
+ throw new IllegalArgumentException("Unknown node permission: " + intPerm);
+ }
+
+ public static int toIntPerm(Permission.Type type) {
+ switch (type) {
+ case CONTENT:
+ return 0100644;
+ case EXECUTABLE_CONTENT:
+ return 0100755;
+ case SYMLINK:
+ return 0120000;
+ case DIRECTORY:
+ return 0040000;
+ case REVISION:
+ return 0160000;
+ }
+ throw new IllegalArgumentException("Unknown node permission type: " + type);
+ }
+
+ public static int fromEncoded(int encoded) {
+ return toIntPerm(fromIntCode(encoded));
+ }
+
+ public static int toEncoded(int permission) {
+ return toIntCode(fromIntPerm(permission));
+ }
+ }
+ }
+
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java
new file mode 100644
index 0000000..f99ec3e
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java
@@ -0,0 +1,108 @@
+// TODO: should be in webgraph upstream
+// package it.unimi.dsi.big.webgraph.labelling;
+package org.softwareheritage.graph.labels;
+
+/*
+ * Copyright (C) 2020 TODO
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ *
+ */
+
+
+import it.unimi.dsi.big.webgraph.labelling.Label;
+import it.unimi.dsi.fastutil.longs.LongArrays;
+import it.unimi.dsi.io.InputBitStream;
+import it.unimi.dsi.io.OutputBitStream;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/** A list of longs represented in fixed width. Each list is prefixed by its
+ * length written in {@linkplain OutputBitStream#writeGamma(int) γ
+ * coding}.
+ */
+
+public class FixedWidthLongListLabel extends org.softwareheritage.graph.labels.AbstractLongListLabel {
+ /** The bit width used to represent the value of this label. */
+ private final int width;
+
+ /** Creates a new fixed-width long label.
+ *
+ * @param key the (only) key of this label.
+ * @param width the label width (in bits).
+ * @param value the value of this label.
+ */
+ public FixedWidthLongListLabel(String key, int width, long[] value) {
+ super(key, value);
+ for(int i = value.length; i-- != 0;) if (value[i] < 0 || value[i] >= 1L << width) throw new IllegalArgumentException("Value out of range: " + Long.toString(value[i]));
+ this.width = width;
+ }
+
+ /** Creates a new fixed-width label with an empty list.
+ *
+ * @param key the (only) key of this label.
+ * @param width the label width (in bits).
+ */
+ public FixedWidthLongListLabel(String key, int width) {
+ this(key, width, LongArrays.EMPTY_ARRAY);
+ }
+
+ /** Creates a new fixed-width long label using the given key and width
+ * with an empty list.
+ *
+ * @param arg two strings containing the key and the width of this label.
+ */
+ public FixedWidthLongListLabel(String... arg) {
+ this(arg[0], Integer.parseInt(arg[1]));
+ }
+
+ @Override
+ public Label copy() {
+ return new FixedWidthLongListLabel(key, width, value.clone());
+ }
+
+ @Override
+ public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException {
+ long readBits = inputBitStream.readBits();
+ value = new long[inputBitStream.readGamma()];
+ for(int i = 0; i < value.length; i++) value[i] = inputBitStream.readLong(width);
+ return (int)(inputBitStream.readBits() - readBits);
+ }
+
+ @Override
+ public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException {
+ int bits = outputBitStream.writeGamma(value.length);
+ for(int i = 0; i < value.length; i++) bits += outputBitStream.writeLong(value[i], width);
+ return bits;
+ }
+
+ /** Returns -1 (the fixed width refers to a single long, not to the entire list).
+ * @return -1;
+ */
+ @Override
+ public int fixedWidth() {
+ return -1;
+ }
+
+ @Override
+ public String toString() {
+ return key + ":" + Arrays.toString(value) + " (width:" + width + ")";
+ }
+
+ @Override
+ public String toSpec() {
+ return this.getClass().getName() + "(" + key + "," + width + ")";
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
new file mode 100644
index 0000000..efeef38
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
@@ -0,0 +1,108 @@
+package org.softwareheritage.graph.labels;
+
+import it.unimi.dsi.big.webgraph.labelling.AbstractLabel;
+import it.unimi.dsi.big.webgraph.labelling.Label;
+import it.unimi.dsi.io.InputBitStream;
+import it.unimi.dsi.io.OutputBitStream;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Software Heritage graph edge labels following Webgraph labels convention.
+ *
+ * @author The Software Heritage developers
+ */
+
+public class SwhLabel extends AbstractLabel {
+ private final String key;
+ private final int width;
+ // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry to store branch names)
+ public DirEntry[] value;
+ // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long
+ private final FixedWidthLongListLabel longList;
+
+ private static final DirEntry[] EMPTY_ARRAY = {};
+
+ public SwhLabel(String key, int width, DirEntry[] value) {
+ this.key = key;
+ this.width = width;
+ this.value = value;
+
+ long[] valueEncoded = new long[value.length];
+ for (int i = 0; i < value.length; i++)
+ valueEncoded[i] = value[i].toEncoded();
+ this.longList = new FixedWidthLongListLabel(key, width, valueEncoded);
+ }
+
+ public SwhLabel(String key, int width) {
+ this(key, width, EMPTY_ARRAY);
+ }
+
+ public SwhLabel(String... arg) {
+ this(arg[0], Integer.parseInt(arg[1]));
+ }
+
+ @Override
+ public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException {
+ int ret = longList.fromBitStream(inputBitStream, sourceUnused);
+ // Decode values from their internal long representation
+ value = new DirEntry[longList.value.length];
+ for (int i = 0; i < value.length; i++)
+ value[i] = new DirEntry(longList.value[i]);
+ return ret;
+ }
+
+ @Override
+ public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException {
+ // Values have already been encoded in the SwhLabel constructor
+ return longList.toBitStream(outputBitStream, sourceUnused);
+ }
+
+ @Override
+ public String wellKnownAttributeKey() {
+ return key;
+ }
+
+ @Override
+ public String[] attributeKeys() {
+ return new String[]{key};
+ }
+
+ @Override
+ public Class>[] attributeTypes() {
+ return new Class[]{DirEntry[].class};
+ }
+
+ @Override
+ public Object get(String s) {
+ if (this.key.equals(key))
+ return value;
+ throw new IllegalArgumentException();
+ }
+
+ @Override
+ public Object get() {
+ return value;
+ }
+
+ @Override
+ public Label copy() {
+ return new SwhLabel(key, width, value.clone());
+ }
+
+ @Override
+ public int fixedWidth() {
+ return -1;
+ }
+
+ @Override
+ public String toString() {
+ return key + ":" + Arrays.toString(value) + " (width:" + width + ")";
+ }
+
+ @Override
+ public String toSpec() {
+ return this.getClass().getName() + "(" + key + "," + width + ")";
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
index 0d87a61..e7cf34d 100644
--- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
@@ -1,215 +1,223 @@
package org.softwareheritage.graph.maps;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
-import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.big.webgraph.BVGraph;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.NodeIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.softwareheritage.graph.labels.DirEntry;
+import org.softwareheritage.graph.labels.SwhLabel;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.TimeUnit;
public class LabelMapBuilder {
final static String SORT_BUFFER_SIZE = "40%";
final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class);
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("debugPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'd',
"debug-path", "Store the intermediate representation here for debug"),
new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 't', "tmp",
"Temporary directory path"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
public static void main(String[] args) throws IOException {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
String tmpDir = config.getString("tmpDir");
String debugPath = config.getString("debugPath");
logger.info("Starting label map generation...");
computeLabelMap(graphPath, debugPath, tmpDir);
logger.info("Label map generation ended.");
}
@SuppressWarnings("unchecked") // Suppress warning for Object2LongFunction cast
static Object2LongFunction loadMPH(String mphBasename) throws IOException {
Object2LongFunction mphMap = null;
try {
logger.info("loading MPH function...");
mphMap = (Object2LongFunction) BinIO.loadObject(mphBasename + ".mph");
logger.info("MPH function loaded");
} catch (ClassNotFoundException e) {
logger.error("unknown class object in .mph file: " + e);
System.exit(2);
}
return mphMap;
}
static long getMPHSize(Object2LongFunction mph) {
return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size();
}
static long SwhIDToNode(String strSWHID, Object2LongFunction mphMap, long[][] orderMap) {
long mphId = mphMap.getLong(strSWHID);
return BigArrays.get(orderMap, mphId);
}
static void computeLabelMap(String graphPath, String debugPath, String tmpDir) throws IOException {
// Compute intermediate representation as: "