diff --git a/java/src/main/java/org/softwareheritage/graph/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/SwhLabel.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/SwhLabel.java @@ -0,0 +1,30 @@ +package org.softwareheritage.graph; + +/** + * Wrapper class to store the edge labels of the graph. Labels can be encoded in a single long type. + * + * @author The Software Heritage developers + */ +public class SwhLabel { + public long filenameId; + public int permission; + + public SwhLabel(long filenameId, int permission) + { + this.filenameId = filenameId; + this.permission = permission; + } + + public static SwhLabel fromEncoded(long labelEncoded) + { + long filenameId = labelEncoded >> SwhPerm.TOTAL_NB_BITS; + int permission = (int) (labelEncoded & ((1 << SwhPerm.TOTAL_NB_BITS) - 1)); + return new SwhLabel(filenameId, permission); + } + + public static long toEncoded(SwhLabel label) + { + long labelEncoded = (label.filenameId << SwhPerm.TOTAL_NB_BITS) + label.permission; + return labelEncoded; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/SwhPerm.java b/java/src/main/java/org/softwareheritage/graph/SwhPerm.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/SwhPerm.java @@ -0,0 +1,84 @@ +package org.softwareheritage.graph; + +/** + * Permission types present in the Software Heritage graph. + * + * @author The Software Heritage developers + */ + +public class SwhPerm { + /** Software Heritage archives 5 types of permissions (listed below), so we only need 3 bits to compress them. */ + public static final int TOTAL_NB_BITS = 3; + + public enum Type { + CONTENT, + EXECUTABLE_CONTENT, + SYMLINK, + DIRECTORY, + REVISION; + + public static Type fromInt(int intType) { + switch (intType) { + case 0: + return CONTENT; + case 1: + return EXECUTABLE_CONTENT; + case 2: + return SYMLINK; + case 3: + return DIRECTORY; + case 4: + return REVISION; + } + throw new IllegalArgumentException("Unknown node type: " + intType); + } + + public static int toInt(Type type) { + switch (type) { + case CONTENT: + return 0; + case EXECUTABLE_CONTENT: + return 1; + case SYMLINK: + return 2; + case DIRECTORY: + return 3; + case REVISION: + return 4; + } + throw new IllegalArgumentException("Unknown node type: " + type); + } + + public static Type fromOct(int octType) { + switch (octType) { + case 0100644: + return CONTENT; + case 0100755: + return EXECUTABLE_CONTENT; + case 0120000: + return SYMLINK; + case 0040000: + return DIRECTORY; + case 0160000: + return REVISION; + } + throw new IllegalArgumentException("Unknown node type: " + octType); + } + + public static int toOct(Type type) { + switch (type) { + case CONTENT: + return 0100644; + case EXECUTABLE_CONTENT: + return 0100755; + case SYMLINK: + return 0120000; + case DIRECTORY: + return 0040000; + case REVISION: + return 0160000; + } + throw new IllegalArgumentException("Unknown node type: " + type); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java --- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java @@ -4,8 +4,6 @@ import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; -import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntLabel; -import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; @@ -20,6 +18,9 @@ import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.SwhLabel; +import org.softwareheritage.graph.SwhPerm; +import org.softwareheritage.graph.webgraph.FixedWidthSwhLabelList; import java.io.*; import java.nio.charset.StandardCharsets; @@ -106,14 +107,17 @@ long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); BinIO.loadLongs(graphPath + ".order", orderMap); - Object2LongFunction labelMPH = loadMPH(graphPath + "-labels"); - long numLabels = getMPHSize(labelMPH); - int labelWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); - if (labelWidth > 30) { - logger.error("FIXME: Too many labels, we can't handle more than 2^30 for now."); + // TODO: change the path to be explicit it is only filenames + Object2LongFunction filenameMph = loadMPH(graphPath + "-labels"); + long numFilenames = getMPHSize(filenameMph); + int filenameIdWidth = (int) Math.ceil(Math.log(numFilenames) / Math.log(2)); + if (filenameIdWidth > 30) { + logger.error("FIXME: Too many filenames, we can't handle more than 2^30 for now."); System.exit(2); } + int totalLabelWidth = filenameIdWidth + SwhPerm.TOTAL_NB_BITS; + ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plInter.itemsName = "edges"; plInter.expectedUpdates = graph.numArcs(); @@ -123,7 +127,7 @@ ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command( "sort", - "-k1,1n", "-k2,2n", "-k3,3n", // Numerical sort on all fields + "-k1,1n", "-k2,2n", "-k3,3n", "-k4,4n", // Numerical sort on all fields "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir @@ -139,14 +143,15 @@ plInter.start("Piping intermediate representation to sort(1)"); while (edgeIterator.hasNext()) { String[] edge = edgeIterator.next().toString().split(" "); - if (edge.length < 3) + if (edge.length < 4) continue; long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); - long labelId = labelMPH.getLong(edge[2]); + long filenameId = filenameMph.getLong(edge[2]); + int permission = Integer.parseInt(edge[3]); - sort_stdin.write((srcNode + "\t" + dstNode + "\t" + labelId + "\n") + sort_stdin.write((srcNode + "\t" + dstNode + "\t" + filenameId + "\t" + permission + "\n") .getBytes(StandardCharsets.US_ASCII)); plInter.lightUpdate(); } @@ -176,22 +181,23 @@ NodeIterator it = graph.nodeIterator(); long labelSrcNode = -1; long labelDstNode = -1; - long labelId = -1; + long labelFilenameId = -1; + int labelPermission = -1; while (it.hasNext()) { long srcNode = it.nextLong(); // Fill a hashmap with the labels of each edge starting from this node - HashMap> successorsLabels = new HashMap<>(); + HashMap> successorsLabels = new HashMap<>(); while (labelSrcNode <= srcNode) { if (labelSrcNode == srcNode) { successorsLabels .computeIfAbsent( labelDstNode, k -> new ArrayList<>() - ).add(labelId); + ).add(new SwhLabel(labelFilenameId, labelPermission)); if (debugFile != null) { - debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelId + "\n"); + debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelFilenameId + " " + labelPermission + "\n"); } } @@ -202,17 +208,20 @@ String[] parts = line.split("\\t"); labelSrcNode = Long.parseLong(parts[0]); labelDstNode = Long.parseLong(parts[1]); - labelId = Long.parseLong(parts[2]); + labelFilenameId = Long.parseLong(parts[2]); + SwhPerm.Type permissionType = SwhPerm.Type.fromOct(Integer.parseInt(parts[3])); + labelPermission = SwhPerm.Type.toInt(permissionType); } int bits = 0; LazyLongIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { - List edgeLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); - bits += labels.writeGamma(edgeLabels.size()); - for (Long label : edgeLabels) { - bits += labels.writeLong(label, labelWidth); + List currentLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); + bits += labels.writeGamma(currentLabels.size()); + for (SwhLabel label : currentLabels) { + long labelEncoded = SwhLabel.toEncoded(label); + bits += labels.writeLong(labelEncoded, totalLabelWidth); } } offsets.writeGamma(bits); @@ -228,7 +237,7 @@ PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties" )); pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); - pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthIntListLabel.class.getName() + "(TEST," + labelWidth + ")" ); + pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthSwhLabelList.class.getName() + "(SwhLabel," + totalLabelWidth + ")" ); pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); pw.close(); } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java --- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java @@ -5,6 +5,8 @@ import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.util.PermutedFrontCodedStringList; +import org.softwareheritage.graph.SwhLabel; +import org.softwareheritage.graph.SwhPerm; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; @@ -15,7 +17,8 @@ ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.loadOffline(graphPath + "-labelled"); NodeIdMap nodeMap = new NodeIdMap(graphPath, graph.numNodes()); - PermutedFrontCodedStringList labelMap = (PermutedFrontCodedStringList) BinIO.loadObject(graphPath + "-labels.fcl"); + // TODO: change the path to be explicit it is only filenames + PermutedFrontCodedStringList filenameMap = (PermutedFrontCodedStringList) BinIO.loadObject(graphPath + "-labels.fcl"); ArcLabelledNodeIterator it = graph.nodeIterator(); while (it.hasNext()) { @@ -24,14 +27,16 @@ ArcLabelledNodeIterator.LabelledArcIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { - int[] labels = (int[]) s.label().get(); + SwhLabel[] labels = (SwhLabel[]) s.label().get(); if (labels.length > 0) { - for (int label : labels) { + for (SwhLabel label : labels) { + SwhPerm.Type permissionType = SwhPerm.Type.fromInt(label.permission); System.out.format( - "%s %s %s\n", + "%s %s %s %d\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode), - labelMap.get(label) + filenameMap.get((int) label.filenameId), + SwhPerm.Type.toOct(permissionType) ); } } else { diff --git a/java/src/main/java/org/softwareheritage/graph/webgraph/FixedWidthSwhLabelList.java b/java/src/main/java/org/softwareheritage/graph/webgraph/FixedWidthSwhLabelList.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/webgraph/FixedWidthSwhLabelList.java @@ -0,0 +1,109 @@ +package org.softwareheritage.graph.webgraph; + +import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.FixedWidthLongListLabel; +import it.unimi.dsi.big.webgraph.labelling.Label; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; +import org.softwareheritage.graph.SwhLabel; + +import java.io.IOException; +import java.util.Arrays; + +/** + * List of {@link SwhLabel} represented in fixed width and following Webgraph labels convention. + * + * @author The Software Heritage developers + */ + +public class FixedWidthSwhLabelList extends AbstractLabel { + private final String key; + private final int width; + public SwhLabel[] value; + // Use existing Webgraph class to represent a list of SwhLabel as a list of encoded long + private final FixedWidthLongListLabel longList; + + private static final SwhLabel[] EMPTY_ARRAY = {}; + + public FixedWidthSwhLabelList(String key, int width, SwhLabel[] value) { + this.key = key; + this.width = width; + this.value = value; + + long[] valueEncoded = new long[value.length]; + for (int i = 0; i < value.length; i++) + valueEncoded[i] = SwhLabel.toEncoded(value[i]); + this.longList = new FixedWidthLongListLabel(key, width, valueEncoded); + } + + public FixedWidthSwhLabelList(String key, int width) { + this(key, width, EMPTY_ARRAY); + } + + public FixedWidthSwhLabelList(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } + + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + int ret = longList.fromBitStream(inputBitStream, sourceUnused); + // Decode values from their internal long representation + value = new SwhLabel[longList.value.length]; + for (int i = 0; i < value.length; i++) + value[i] = SwhLabel.fromEncoded(longList.value[i]); + return ret; + } + + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + // Values have already been encoded in the FixedWidthSwhLabelList constructor + return longList.toBitStream(outputBitStream, sourceUnused); + } + + @Override + public String wellKnownAttributeKey() { + return key; + } + + @Override + public String[] attributeKeys() { + return new String[]{key}; + } + + @Override + public Class[] attributeTypes() { + return new Class[]{SwhLabel[].class}; + } + + @Override + public Object get(String s) { + if (this.key.equals(key)) + return value; + throw new IllegalArgumentException(); + } + + @Override + public Object get() { + return value; + } + + @Override + public Label copy() { + return new FixedWidthSwhLabelList(key, width, value.clone()); + } + + @Override + public int fixedWidth() { + return -1; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } + + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } +}