diff --git a/java/pom.xml b/java/pom.xml
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -57,7 +57,12 @@
it.unimi.dsi
fastutil
- 8.4.1
+ 8.4.2
+
+
+ it.unimi.dsi
+ dsiutils
+ 2.6.7
it.unimi.dsi
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java
new file mode 100644
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java
@@ -0,0 +1,131 @@
+package org.softwareheritage.graph.labels;
+
+/**
+ * Directory entries metadata are stored as edge labels on the graph. {@link DirEntry} can be encoded in a single long
+ * type, to re-use Webgraph interface.
+ *
+ * @author The Software Heritage developers
+ */
+public class DirEntry {
+ public long filenameId;
+ public int permission;
+
+ public DirEntry(long filenameId, int permission)
+ {
+ this.filenameId = filenameId;
+ this.permission = permission;
+ }
+
+ public DirEntry(long dirEntryEncoded)
+ {
+ this.filenameId = dirEntryEncoded >> Permission.NB_BITS_PER_TYPE;
+ int dirBytes = (int) (dirEntryEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1));
+ this.permission = Permission.Type.fromEncoded(dirBytes);
+ }
+
+ public long toEncoded()
+ {
+ return (filenameId << Permission.NB_BITS_PER_TYPE) + Permission.Type.toEncoded(permission);
+ }
+
+ public static int labelWidth(long numLabels)
+ {
+ int filenameIdWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2));
+ if (filenameIdWidth > 64 - Permission.NB_BITS_PER_TYPE) {
+ System.err.println("FIXME: Too many filenames, we can't handle more than 2^" + (64 - Permission.NB_BITS_PER_TYPE) + " for now.");
+ System.exit(2);
+ }
+ return filenameIdWidth + Permission.NB_BITS_PER_TYPE;
+ }
+
+ /**
+ * Permission types present in the Software Heritage graph.
+ *
+ * @author The Software Heritage developers
+ */
+ private static class Permission {
+ public static final int NB_BITS_PER_TYPE =
+ (int) Math.ceil(Math.log(Permission.Type.values().length) / Math.log(2));
+
+ public enum Type {
+ CONTENT,
+ EXECUTABLE_CONTENT,
+ SYMLINK,
+ DIRECTORY,
+ REVISION;
+
+ public static Permission.Type fromIntCode(int intCode) {
+ switch (intCode) {
+ case 0:
+ return CONTENT;
+ case 1:
+ return EXECUTABLE_CONTENT;
+ case 2:
+ return SYMLINK;
+ case 3:
+ return DIRECTORY;
+ case 4:
+ return REVISION;
+ }
+ throw new IllegalArgumentException("Unknown node permission code: " + intCode);
+ }
+
+ public static int toIntCode(Permission.Type type) {
+ switch (type) {
+ case CONTENT:
+ return 0;
+ case EXECUTABLE_CONTENT:
+ return 1;
+ case SYMLINK:
+ return 2;
+ case DIRECTORY:
+ return 3;
+ case REVISION:
+ return 4;
+ }
+ throw new IllegalArgumentException("Unknown node permission type: " + type);
+ }
+
+ public static Permission.Type fromIntPerm(int intPerm) {
+ switch (intPerm) {
+ case 0100644:
+ return CONTENT;
+ case 0100755:
+ return EXECUTABLE_CONTENT;
+ case 0120000:
+ return SYMLINK;
+ case 0040000:
+ return DIRECTORY;
+ case 0160000:
+ return REVISION;
+ }
+ throw new IllegalArgumentException("Unknown node permission: " + intPerm);
+ }
+
+ public static int toIntPerm(Permission.Type type) {
+ switch (type) {
+ case CONTENT:
+ return 0100644;
+ case EXECUTABLE_CONTENT:
+ return 0100755;
+ case SYMLINK:
+ return 0120000;
+ case DIRECTORY:
+ return 0040000;
+ case REVISION:
+ return 0160000;
+ }
+ throw new IllegalArgumentException("Unknown node permission type: " + type);
+ }
+
+ public static int fromEncoded(int encoded) {
+ return toIntPerm(fromIntCode(encoded));
+ }
+
+ public static int toEncoded(int permission) {
+ return toIntCode(fromIntPerm(permission));
+ }
+ }
+ }
+
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
new file mode 100644
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
@@ -0,0 +1,109 @@
+package org.softwareheritage.graph.labels;
+
+import it.unimi.dsi.big.webgraph.labelling.AbstractLabel;
+import it.unimi.dsi.big.webgraph.labelling.FixedWidthLongListLabel;
+import it.unimi.dsi.big.webgraph.labelling.Label;
+import it.unimi.dsi.io.InputBitStream;
+import it.unimi.dsi.io.OutputBitStream;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Software Heritage graph edge labels following Webgraph labels convention.
+ *
+ * @author The Software Heritage developers
+ */
+
+public class SwhLabel extends AbstractLabel {
+ private final String key;
+ private final int width;
+ // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry to store branch names)
+ public DirEntry[] value;
+ // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long
+ private final FixedWidthLongListLabel longList;
+
+ private static final DirEntry[] EMPTY_ARRAY = {};
+
+ public SwhLabel(String key, int width, DirEntry[] value) {
+ this.key = key;
+ this.width = width;
+ this.value = value;
+
+ long[] valueEncoded = new long[value.length];
+ for (int i = 0; i < value.length; i++)
+ valueEncoded[i] = value[i].toEncoded();
+ this.longList = new FixedWidthLongListLabel(key, width, valueEncoded);
+ }
+
+ public SwhLabel(String key, int width) {
+ this(key, width, EMPTY_ARRAY);
+ }
+
+ public SwhLabel(String... arg) {
+ this(arg[0], Integer.parseInt(arg[1]));
+ }
+
+ @Override
+ public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException {
+ int ret = longList.fromBitStream(inputBitStream, sourceUnused);
+ // Decode values from their internal long representation
+ value = new DirEntry[longList.value.length];
+ for (int i = 0; i < value.length; i++)
+ value[i] = new DirEntry(longList.value[i]);
+ return ret;
+ }
+
+ @Override
+ public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException {
+ // Values have already been encoded in the SwhLabel constructor
+ return longList.toBitStream(outputBitStream, sourceUnused);
+ }
+
+ @Override
+ public String wellKnownAttributeKey() {
+ return key;
+ }
+
+ @Override
+ public String[] attributeKeys() {
+ return new String[]{key};
+ }
+
+ @Override
+ public Class>[] attributeTypes() {
+ return new Class[]{DirEntry[].class};
+ }
+
+ @Override
+ public Object get(String s) {
+ if (this.key.equals(key))
+ return value;
+ throw new IllegalArgumentException();
+ }
+
+ @Override
+ public Object get() {
+ return value;
+ }
+
+ @Override
+ public Label copy() {
+ return new SwhLabel(key, width, value.clone());
+ }
+
+ @Override
+ public int fixedWidth() {
+ return -1;
+ }
+
+ @Override
+ public String toString() {
+ return key + ":" + Arrays.toString(value) + " (width:" + width + ")";
+ }
+
+ @Override
+ public String toSpec() {
+ return this.getClass().getName() + "(" + key + "," + width + ")";
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
--- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
@@ -4,8 +4,6 @@
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
-import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntLabel;
-import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
@@ -20,6 +18,8 @@
import it.unimi.dsi.big.webgraph.NodeIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.softwareheritage.graph.labels.DirEntry;
+import org.softwareheritage.graph.labels.SwhLabel;
import java.io.*;
import java.nio.charset.StandardCharsets;
@@ -106,13 +106,10 @@
long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph));
BinIO.loadLongs(graphPath + ".order", orderMap);
- Object2LongFunction labelMPH = loadMPH(graphPath + "-labels");
- long numLabels = getMPHSize(labelMPH);
- int labelWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2));
- if (labelWidth > 30) {
- logger.error("FIXME: Too many labels, we can't handle more than 2^30 for now.");
- System.exit(2);
- }
+ Object2LongFunction filenameMph = loadMPH(graphPath + "-filename-labels");
+ long numFilenames = getMPHSize(filenameMph);
+
+ int totalLabelWidth = DirEntry.labelWidth(numFilenames);
ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
plInter.itemsName = "edges";
@@ -123,7 +120,7 @@
ProcessBuilder processBuilder = new ProcessBuilder();
processBuilder.command(
"sort",
- "-k1,1n", "-k2,2n", "-k3,3n", // Numerical sort on all fields
+ "-k1,1n", "-k2,2n", // Numerical sort
"--numeric-sort",
"--buffer-size", SORT_BUFFER_SIZE,
"--temporary-directory", tmpDir
@@ -139,14 +136,15 @@
plInter.start("Piping intermediate representation to sort(1)");
while (edgeIterator.hasNext()) {
String[] edge = edgeIterator.next().toString().split(" ");
- if (edge.length < 3)
+ if (edge.length < 4)
continue;
long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap);
long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap);
- long labelId = labelMPH.getLong(edge[2]);
+ long filenameId = filenameMph.getLong(edge[2]);
+ int permission = Integer.parseInt(edge[3]);
- sort_stdin.write((srcNode + "\t" + dstNode + "\t" + labelId + "\n")
+ sort_stdin.write((srcNode + "\t" + dstNode + "\t" + filenameId + "\t" + permission + "\n")
.getBytes(StandardCharsets.US_ASCII));
plInter.lightUpdate();
}
@@ -176,22 +174,23 @@
NodeIterator it = graph.nodeIterator();
long labelSrcNode = -1;
long labelDstNode = -1;
- long labelId = -1;
+ long labelFilenameId = -1;
+ int labelPermission = -1;
while (it.hasNext()) {
long srcNode = it.nextLong();
// Fill a hashmap with the labels of each edge starting from this node
- HashMap> successorsLabels = new HashMap<>();
+ HashMap> successorsLabels = new HashMap<>();
while (labelSrcNode <= srcNode) {
if (labelSrcNode == srcNode) {
successorsLabels
.computeIfAbsent(
labelDstNode,
k -> new ArrayList<>()
- ).add(labelId);
+ ).add(new DirEntry(labelFilenameId, labelPermission));
if (debugFile != null) {
- debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelId + "\n");
+ debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelFilenameId + " " + labelPermission + "\n");
}
}
@@ -202,18 +201,17 @@
String[] parts = line.split("\\t");
labelSrcNode = Long.parseLong(parts[0]);
labelDstNode = Long.parseLong(parts[1]);
- labelId = Long.parseLong(parts[2]);
+ labelFilenameId = Long.parseLong(parts[2]);
+ labelPermission = Integer.parseInt(parts[3]);
}
int bits = 0;
LazyLongIterator s = it.successors();
long dstNode;
while ((dstNode = s.nextLong()) >= 0) {
- List edgeLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList());
- bits += labels.writeGamma(edgeLabels.size());
- for (Long label : edgeLabels) {
- bits += labels.writeLong(label, labelWidth);
- }
+ List currentLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList());
+ SwhLabel l = new SwhLabel("edgelabel", totalLabelWidth, currentLabels.toArray(new DirEntry[0]));
+ bits += l.toBitStream(labels, -1);
}
offsets.writeGamma(bits);
@@ -228,7 +226,7 @@
PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties" ));
pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName());
- pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthIntListLabel.class.getName() + "(TEST," + labelWidth + ")" );
+ pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + SwhLabel.class.getName() + "(DirEntry," + totalLabelWidth + ")" );
pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath);
pw.close();
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
--- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
@@ -1,10 +1,11 @@
package org.softwareheritage.graph.utils;
+import it.unimi.dsi.big.util.PermutedFrontCodedStringBigList;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
import it.unimi.dsi.fastutil.io.BinIO;
-import it.unimi.dsi.util.PermutedFrontCodedStringList;
+import org.softwareheritage.graph.labels.DirEntry;
import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.IOException;
@@ -15,7 +16,7 @@
ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.loadOffline(graphPath + "-labelled");
NodeIdMap nodeMap = new NodeIdMap(graphPath, graph.numNodes());
- PermutedFrontCodedStringList labelMap = (PermutedFrontCodedStringList) BinIO.loadObject(graphPath + "-labels.fcl");
+ PermutedFrontCodedStringBigList filenameMap = (PermutedFrontCodedStringBigList) BinIO.loadObject(graphPath + "-filename-labels.fcl");
ArcLabelledNodeIterator it = graph.nodeIterator();
while (it.hasNext()) {
@@ -24,14 +25,15 @@
ArcLabelledNodeIterator.LabelledArcIterator s = it.successors();
long dstNode;
while ((dstNode = s.nextLong()) >= 0) {
- int[] labels = (int[]) s.label().get();
+ DirEntry[] labels = (DirEntry[]) s.label().get();
if (labels.length > 0) {
- for (int label : labels) {
+ for (DirEntry label : labels) {
System.out.format(
- "%s %s %s\n",
+ "%s %s %s %d\n",
nodeMap.getSWHID(srcNode),
nodeMap.getSWHID(dstNode),
- labelMap.get(label)
+ filenameMap.get(label.filenameId),
+ label.permission
);
}
} else {