Page MenuHomeSoftware Heritage

D4006.id14145.diff
No OneTemporary

D4006.id14145.diff

diff --git a/java/src/main/java/org/softwareheritage/graph/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/SwhLabel.java
new file mode 100644
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhLabel.java
@@ -0,0 +1,30 @@
+package org.softwareheritage.graph;
+
+/**
+ * Wrapper class to store the edge labels of the graph. Labels can be encoded in a single long type.
+ *
+ * @author The Software Heritage developers
+ */
+public class SwhLabel {
+ public long filenameId;
+ public int permission;
+
+ public SwhLabel(long filenameId, int permission)
+ {
+ this.filenameId = filenameId;
+ this.permission = permission;
+ }
+
+ public static SwhLabel fromEncoded(long labelEncoded)
+ {
+ long filenameId = labelEncoded >> SwhPerm.TOTAL_NB_BITS;
+ int permission = (int) (labelEncoded & ((1 << SwhPerm.TOTAL_NB_BITS) - 1));
+ return new SwhLabel(filenameId, permission);
+ }
+
+ public static long toEncoded(SwhLabel label)
+ {
+ long labelEncoded = (label.filenameId << SwhPerm.TOTAL_NB_BITS) + label.permission;
+ return labelEncoded;
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhPerm.java b/java/src/main/java/org/softwareheritage/graph/SwhPerm.java
new file mode 100644
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhPerm.java
@@ -0,0 +1,84 @@
+package org.softwareheritage.graph;
+
+/**
+ * Permission types present in the Software Heritage graph.
+ *
+ * @author The Software Heritage developers
+ */
+
+public class SwhPerm {
+ /** Software Heritage archives 5 types of permissions (listed below), so we only need 3 bits to compress them. */
+ public static final int TOTAL_NB_BITS = 3;
+
+ public enum Type {
+ CONTENT,
+ EXECUTABLE_CONTENT,
+ SYMLINK,
+ DIRECTORY,
+ REVISION;
+
+ public static Type fromInt(int intType) {
+ switch (intType) {
+ case 0:
+ return CONTENT;
+ case 1:
+ return EXECUTABLE_CONTENT;
+ case 2:
+ return SYMLINK;
+ case 3:
+ return DIRECTORY;
+ case 4:
+ return REVISION;
+ }
+ throw new IllegalArgumentException("Unknown node type: " + intType);
+ }
+
+ public static int toInt(Type type) {
+ switch (type) {
+ case CONTENT:
+ return 0;
+ case EXECUTABLE_CONTENT:
+ return 1;
+ case SYMLINK:
+ return 2;
+ case DIRECTORY:
+ return 3;
+ case REVISION:
+ return 4;
+ }
+ throw new IllegalArgumentException("Unknown node type: " + type);
+ }
+
+ public static Type fromOct(int octType) {
+ switch (octType) {
+ case 0100644:
+ return CONTENT;
+ case 0100755:
+ return EXECUTABLE_CONTENT;
+ case 0120000:
+ return SYMLINK;
+ case 0040000:
+ return DIRECTORY;
+ case 0160000:
+ return REVISION;
+ }
+ throw new IllegalArgumentException("Unknown node type: " + octType);
+ }
+
+ public static int toOct(Type type) {
+ switch (type) {
+ case CONTENT:
+ return 0100644;
+ case EXECUTABLE_CONTENT:
+ return 0100755;
+ case SYMLINK:
+ return 0120000;
+ case DIRECTORY:
+ return 0040000;
+ case REVISION:
+ return 0160000;
+ }
+ throw new IllegalArgumentException("Unknown node type: " + type);
+ }
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
--- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java
@@ -4,8 +4,6 @@
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
-import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntLabel;
-import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
@@ -20,6 +18,9 @@
import it.unimi.dsi.big.webgraph.NodeIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.softwareheritage.graph.SwhLabel;
+import org.softwareheritage.graph.SwhPerm;
+import org.softwareheritage.graph.webgraph.FixedWidthSwhLabelList;
import java.io.*;
import java.nio.charset.StandardCharsets;
@@ -106,14 +107,17 @@
long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph));
BinIO.loadLongs(graphPath + ".order", orderMap);
- Object2LongFunction<String> labelMPH = loadMPH(graphPath + "-labels");
- long numLabels = getMPHSize(labelMPH);
- int labelWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2));
- if (labelWidth > 30) {
- logger.error("FIXME: Too many labels, we can't handle more than 2^30 for now.");
+ // TODO: change the path to be explicit it is only filenames
+ Object2LongFunction<String> filenameMph = loadMPH(graphPath + "-labels");
+ long numFilenames = getMPHSize(filenameMph);
+ int filenameIdWidth = (int) Math.ceil(Math.log(numFilenames) / Math.log(2));
+ if (filenameIdWidth > 30) {
+ logger.error("FIXME: Too many filenames, we can't handle more than 2^30 for now.");
System.exit(2);
}
+ int totalLabelWidth = filenameIdWidth + SwhPerm.TOTAL_NB_BITS;
+
ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
plInter.itemsName = "edges";
plInter.expectedUpdates = graph.numArcs();
@@ -123,7 +127,7 @@
ProcessBuilder processBuilder = new ProcessBuilder();
processBuilder.command(
"sort",
- "-k1,1n", "-k2,2n", "-k3,3n", // Numerical sort on all fields
+ "-k1,1n", "-k2,2n", "-k3,3n", "-k4,4n", // Numerical sort on all fields
"--numeric-sort",
"--buffer-size", SORT_BUFFER_SIZE,
"--temporary-directory", tmpDir
@@ -139,14 +143,15 @@
plInter.start("Piping intermediate representation to sort(1)");
while (edgeIterator.hasNext()) {
String[] edge = edgeIterator.next().toString().split(" ");
- if (edge.length < 3)
+ if (edge.length < 4)
continue;
long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap);
long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap);
- long labelId = labelMPH.getLong(edge[2]);
+ long filenameId = filenameMph.getLong(edge[2]);
+ int permission = Integer.parseInt(edge[3]);
- sort_stdin.write((srcNode + "\t" + dstNode + "\t" + labelId + "\n")
+ sort_stdin.write((srcNode + "\t" + dstNode + "\t" + filenameId + "\t" + permission + "\n")
.getBytes(StandardCharsets.US_ASCII));
plInter.lightUpdate();
}
@@ -176,22 +181,23 @@
NodeIterator it = graph.nodeIterator();
long labelSrcNode = -1;
long labelDstNode = -1;
- long labelId = -1;
+ long labelFilenameId = -1;
+ int labelPermission = -1;
while (it.hasNext()) {
long srcNode = it.nextLong();
// Fill a hashmap with the labels of each edge starting from this node
- HashMap<Long, List<Long>> successorsLabels = new HashMap<>();
+ HashMap<Long, List<SwhLabel>> successorsLabels = new HashMap<>();
while (labelSrcNode <= srcNode) {
if (labelSrcNode == srcNode) {
successorsLabels
.computeIfAbsent(
labelDstNode,
k -> new ArrayList<>()
- ).add(labelId);
+ ).add(new SwhLabel(labelFilenameId, labelPermission));
if (debugFile != null) {
- debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelId + "\n");
+ debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelFilenameId + " " + labelPermission + "\n");
}
}
@@ -202,17 +208,20 @@
String[] parts = line.split("\\t");
labelSrcNode = Long.parseLong(parts[0]);
labelDstNode = Long.parseLong(parts[1]);
- labelId = Long.parseLong(parts[2]);
+ labelFilenameId = Long.parseLong(parts[2]);
+ SwhPerm.Type permissionType = SwhPerm.Type.fromOct(Integer.parseInt(parts[3]));
+ labelPermission = SwhPerm.Type.toInt(permissionType);
}
int bits = 0;
LazyLongIterator s = it.successors();
long dstNode;
while ((dstNode = s.nextLong()) >= 0) {
- List<Long> edgeLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList());
- bits += labels.writeGamma(edgeLabels.size());
- for (Long label : edgeLabels) {
- bits += labels.writeLong(label, labelWidth);
+ List<SwhLabel> currentLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList());
+ bits += labels.writeGamma(currentLabels.size());
+ for (SwhLabel label : currentLabels) {
+ long labelEncoded = SwhLabel.toEncoded(label);
+ bits += labels.writeLong(labelEncoded, totalLabelWidth);
}
}
offsets.writeGamma(bits);
@@ -228,7 +237,7 @@
PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties" ));
pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName());
- pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthIntListLabel.class.getName() + "(TEST," + labelWidth + ")" );
+ pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthSwhLabelList.class.getName() + "(SwhLabel," + totalLabelWidth + ")" );
pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath);
pw.close();
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
--- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
@@ -5,6 +5,8 @@
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.util.PermutedFrontCodedStringList;
+import org.softwareheritage.graph.SwhLabel;
+import org.softwareheritage.graph.SwhPerm;
import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.IOException;
@@ -15,7 +17,8 @@
ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.loadOffline(graphPath + "-labelled");
NodeIdMap nodeMap = new NodeIdMap(graphPath, graph.numNodes());
- PermutedFrontCodedStringList labelMap = (PermutedFrontCodedStringList) BinIO.loadObject(graphPath + "-labels.fcl");
+ // TODO: change the path to be explicit it is only filenames
+ PermutedFrontCodedStringList filenameMap = (PermutedFrontCodedStringList) BinIO.loadObject(graphPath + "-labels.fcl");
ArcLabelledNodeIterator it = graph.nodeIterator();
while (it.hasNext()) {
@@ -24,14 +27,16 @@
ArcLabelledNodeIterator.LabelledArcIterator s = it.successors();
long dstNode;
while ((dstNode = s.nextLong()) >= 0) {
- int[] labels = (int[]) s.label().get();
+ SwhLabel[] labels = (SwhLabel[]) s.label().get();
if (labels.length > 0) {
- for (int label : labels) {
+ for (SwhLabel label : labels) {
+ SwhPerm.Type permissionType = SwhPerm.Type.fromInt(label.permission);
System.out.format(
- "%s %s %s\n",
+ "%s %s %s %d\n",
nodeMap.getSWHID(srcNode),
nodeMap.getSWHID(dstNode),
- labelMap.get(label)
+ filenameMap.get((int) label.filenameId),
+ SwhPerm.Type.toOct(permissionType)
);
}
} else {
diff --git a/java/src/main/java/org/softwareheritage/graph/webgraph/FixedWidthSwhLabelList.java b/java/src/main/java/org/softwareheritage/graph/webgraph/FixedWidthSwhLabelList.java
new file mode 100644
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/webgraph/FixedWidthSwhLabelList.java
@@ -0,0 +1,109 @@
+package org.softwareheritage.graph.webgraph;
+
+import it.unimi.dsi.big.webgraph.labelling.AbstractLabel;
+import it.unimi.dsi.big.webgraph.labelling.FixedWidthLongListLabel;
+import it.unimi.dsi.big.webgraph.labelling.Label;
+import it.unimi.dsi.io.InputBitStream;
+import it.unimi.dsi.io.OutputBitStream;
+import org.softwareheritage.graph.SwhLabel;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * List of {@link SwhLabel} represented in fixed width and following Webgraph labels convention.
+ *
+ * @author The Software Heritage developers
+ */
+
+public class FixedWidthSwhLabelList extends AbstractLabel {
+ private final String key;
+ private final int width;
+ public SwhLabel[] value;
+ // Use existing Webgraph class to represent a list of SwhLabel as a list of encoded long
+ private final FixedWidthLongListLabel longList;
+
+ private static final SwhLabel[] EMPTY_ARRAY = {};
+
+ public FixedWidthSwhLabelList(String key, int width, SwhLabel[] value) {
+ this.key = key;
+ this.width = width;
+ this.value = value;
+
+ long[] valueEncoded = new long[value.length];
+ for (int i = 0; i < value.length; i++)
+ valueEncoded[i] = SwhLabel.toEncoded(value[i]);
+ this.longList = new FixedWidthLongListLabel(key, width, valueEncoded);
+ }
+
+ public FixedWidthSwhLabelList(String key, int width) {
+ this(key, width, EMPTY_ARRAY);
+ }
+
+ public FixedWidthSwhLabelList(String... arg) {
+ this(arg[0], Integer.parseInt(arg[1]));
+ }
+
+ @Override
+ public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException {
+ int ret = longList.fromBitStream(inputBitStream, sourceUnused);
+ // Decode values from their internal long representation
+ value = new SwhLabel[longList.value.length];
+ for (int i = 0; i < value.length; i++)
+ value[i] = SwhLabel.fromEncoded(longList.value[i]);
+ return ret;
+ }
+
+ @Override
+ public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException {
+ // Values have already been encoded in the FixedWidthSwhLabelList constructor
+ return longList.toBitStream(outputBitStream, sourceUnused);
+ }
+
+ @Override
+ public String wellKnownAttributeKey() {
+ return key;
+ }
+
+ @Override
+ public String[] attributeKeys() {
+ return new String[]{key};
+ }
+
+ @Override
+ public Class<?>[] attributeTypes() {
+ return new Class[]{SwhLabel[].class};
+ }
+
+ @Override
+ public Object get(String s) {
+ if (this.key.equals(key))
+ return value;
+ throw new IllegalArgumentException();
+ }
+
+ @Override
+ public Object get() {
+ return value;
+ }
+
+ @Override
+ public Label copy() {
+ return new FixedWidthSwhLabelList(key, width, value.clone());
+ }
+
+ @Override
+ public int fixedWidth() {
+ return -1;
+ }
+
+ @Override
+ public String toString() {
+ return key + ":" + Arrays.toString(value) + " (width:" + width + ")";
+ }
+
+ @Override
+ public String toSpec() {
+ return this.getClass().getName() + "(" + key + "," + width + ")";
+ }
+}

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 2:53 AM (1 d, 21 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230862

Event Timeline