diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java @@ -0,0 +1,114 @@ +package org.softwareheritage.graph.labels; + +/** + * Directory entries metadata are stored as edge labels on the graph. Labels can be encoded in a single long type, to + * re-use Webgraph interface. + * + * @author The Software Heritage developers + */ +public class DirEntry { + public long filenameId; + public int permission; + + public DirEntry(long filenameId, int permission) + { + this.filenameId = filenameId; + this.permission = permission; + } + + public static DirEntry fromEncoded(long labelEncoded) + { + long filenameId = labelEncoded >> Permission.NB_BITS_PER_TYPE; + int permission = (int) (labelEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1)); + return new DirEntry(filenameId, permission); + } + + public static long toEncoded(DirEntry label) + { + long labelEncoded = (label.filenameId << Permission.NB_BITS_PER_TYPE) + label.permission; + return labelEncoded; + } + + /** + * Permission types present in the Software Heritage graph. + * + * @author The Software Heritage developers + */ + public static class Permission { + public static final int NB_BITS_PER_TYPE = + (int) Math.ceil(Math.log(Permission.Type.values().length) / Math.log(2)); + + public enum Type { + CONTENT, + EXECUTABLE_CONTENT, + SYMLINK, + DIRECTORY, + REVISION; + + public static Permission.Type fromIntCode(int intCode) { + switch (intCode) { + case 0: + return CONTENT; + case 1: + return EXECUTABLE_CONTENT; + case 2: + return SYMLINK; + case 3: + return DIRECTORY; + case 4: + return REVISION; + } + throw new IllegalArgumentException("Unknown node permission code: " + intCode); + } + + public static int toIntCode(Permission.Type type) { + switch (type) { + case CONTENT: + return 0; + case EXECUTABLE_CONTENT: + return 1; + case SYMLINK: + return 2; + case DIRECTORY: + return 3; + case REVISION: + return 4; + } + throw new IllegalArgumentException("Unknown node permission type: " + type); + } + + public static Permission.Type fromIntPerm(int intPerm) { + switch (intPerm) { + case 0100644: + return CONTENT; + case 0100755: + return EXECUTABLE_CONTENT; + case 0120000: + return SYMLINK; + case 0040000: + return DIRECTORY; + case 0160000: + return REVISION; + } + throw new IllegalArgumentException("Unknown node permission: " + intPerm); + } + + public static int toIntPerm(Permission.Type type) { + switch (type) { + case CONTENT: + return 0100644; + case EXECUTABLE_CONTENT: + return 0100755; + case SYMLINK: + return 0120000; + case DIRECTORY: + return 0040000; + case REVISION: + return 0160000; + } + throw new IllegalArgumentException("Unknown node permission type: " + type); + } + } + } + +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthDirEntryList.java b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthDirEntryList.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthDirEntryList.java @@ -0,0 +1,108 @@ +package org.softwareheritage.graph.labels; + +import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.FixedWidthLongListLabel; +import it.unimi.dsi.big.webgraph.labelling.Label; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; + +import java.io.IOException; +import java.util.Arrays; + +/** + * List of {@link DirEntry} represented in fixed width and following Webgraph labels convention. + * + * @author The Software Heritage developers + */ + +public class FixedWidthDirEntryList extends AbstractLabel { + private final String key; + private final int width; + public DirEntry[] value; + // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long + private final FixedWidthLongListLabel longList; + + private static final DirEntry[] EMPTY_ARRAY = {}; + + public FixedWidthDirEntryList(String key, int width, DirEntry[] value) { + this.key = key; + this.width = width; + this.value = value; + + long[] valueEncoded = new long[value.length]; + for (int i = 0; i < value.length; i++) + valueEncoded[i] = DirEntry.toEncoded(value[i]); + this.longList = new FixedWidthLongListLabel(key, width, valueEncoded); + } + + public FixedWidthDirEntryList(String key, int width) { + this(key, width, EMPTY_ARRAY); + } + + public FixedWidthDirEntryList(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } + + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + int ret = longList.fromBitStream(inputBitStream, sourceUnused); + // Decode values from their internal long representation + value = new DirEntry[longList.value.length]; + for (int i = 0; i < value.length; i++) + value[i] = DirEntry.fromEncoded(longList.value[i]); + return ret; + } + + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + // Values have already been encoded in the FixedWidthDirEntryList constructor + return longList.toBitStream(outputBitStream, sourceUnused); + } + + @Override + public String wellKnownAttributeKey() { + return key; + } + + @Override + public String[] attributeKeys() { + return new String[]{key}; + } + + @Override + public Class[] attributeTypes() { + return new Class[]{DirEntry[].class}; + } + + @Override + public Object get(String s) { + if (this.key.equals(key)) + return value; + throw new IllegalArgumentException(); + } + + @Override + public Object get() { + return value; + } + + @Override + public Label copy() { + return new FixedWidthDirEntryList(key, width, value.clone()); + } + + @Override + public int fixedWidth() { + return -1; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } + + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java --- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java @@ -4,8 +4,6 @@ import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; -import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntLabel; -import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; @@ -20,6 +18,9 @@ import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.labels.DirEntry; +import org.softwareheritage.graph.labels.DirEntry.Permission; +import org.softwareheritage.graph.labels.FixedWidthDirEntryList; import java.io.*; import java.nio.charset.StandardCharsets; @@ -106,13 +107,11 @@ long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); BinIO.loadLongs(graphPath + ".order", orderMap); - Object2LongFunction labelMPH = loadMPH(graphPath + "-labels"); - long numLabels = getMPHSize(labelMPH); - int labelWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); - if (labelWidth > 30) { - logger.error("FIXME: Too many labels, we can't handle more than 2^30 for now."); - System.exit(2); - } + Object2LongFunction filenameMph = loadMPH(graphPath + "-filename-labels"); + long numFilenames = getMPHSize(filenameMph); + int filenameIdWidth = (int) Math.ceil(Math.log(numFilenames) / Math.log(2)); + + int totalLabelWidth = filenameIdWidth + Permission.NB_BITS_PER_TYPE; ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plInter.itemsName = "edges"; @@ -123,7 +122,7 @@ ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command( "sort", - "-k1,1n", "-k2,2n", "-k3,3n", // Numerical sort on all fields + "-k1,1n", "-k2,2n", // Numerical sort "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir @@ -139,14 +138,15 @@ plInter.start("Piping intermediate representation to sort(1)"); while (edgeIterator.hasNext()) { String[] edge = edgeIterator.next().toString().split(" "); - if (edge.length < 3) + if (edge.length < 4) continue; long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); - long labelId = labelMPH.getLong(edge[2]); + long filenameId = filenameMph.getLong(edge[2]); + int permission = Integer.parseInt(edge[3]); - sort_stdin.write((srcNode + "\t" + dstNode + "\t" + labelId + "\n") + sort_stdin.write((srcNode + "\t" + dstNode + "\t" + filenameId + "\t" + permission + "\n") .getBytes(StandardCharsets.US_ASCII)); plInter.lightUpdate(); } @@ -176,22 +176,23 @@ NodeIterator it = graph.nodeIterator(); long labelSrcNode = -1; long labelDstNode = -1; - long labelId = -1; + long labelFilenameId = -1; + int labelPermission = -1; while (it.hasNext()) { long srcNode = it.nextLong(); // Fill a hashmap with the labels of each edge starting from this node - HashMap> successorsLabels = new HashMap<>(); + HashMap> successorsLabels = new HashMap<>(); while (labelSrcNode <= srcNode) { if (labelSrcNode == srcNode) { successorsLabels .computeIfAbsent( labelDstNode, k -> new ArrayList<>() - ).add(labelId); + ).add(new DirEntry(labelFilenameId, labelPermission)); if (debugFile != null) { - debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelId + "\n"); + debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelFilenameId + " " + labelPermission + "\n"); } } @@ -202,18 +203,19 @@ String[] parts = line.split("\\t"); labelSrcNode = Long.parseLong(parts[0]); labelDstNode = Long.parseLong(parts[1]); - labelId = Long.parseLong(parts[2]); + labelFilenameId = Long.parseLong(parts[2]); + Permission.Type permissionType = Permission.Type.fromIntPerm(Integer.parseInt(parts[3])); + labelPermission = Permission.Type.toIntCode(permissionType); } int bits = 0; LazyLongIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { - List edgeLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); - bits += labels.writeGamma(edgeLabels.size()); - for (Long label : edgeLabels) { - bits += labels.writeLong(label, labelWidth); - } + List currentLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); + FixedWidthDirEntryList l = new FixedWidthDirEntryList( + "edgelabel", totalLabelWidth, currentLabels.toArray(new DirEntry[0])); + bits += l.toBitStream(labels, -1); } offsets.writeGamma(bits); @@ -228,7 +230,7 @@ PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties" )); pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); - pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthIntListLabel.class.getName() + "(TEST," + labelWidth + ")" ); + pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + FixedWidthDirEntryList.class.getName() + "(DirEntry," + totalLabelWidth + ")" ); pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); pw.close(); } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java --- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java @@ -5,6 +5,8 @@ import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.util.PermutedFrontCodedStringList; +import org.softwareheritage.graph.labels.DirEntry; +import org.softwareheritage.graph.labels.DirEntry.Permission; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; @@ -15,7 +17,7 @@ ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.loadOffline(graphPath + "-labelled"); NodeIdMap nodeMap = new NodeIdMap(graphPath, graph.numNodes()); - PermutedFrontCodedStringList labelMap = (PermutedFrontCodedStringList) BinIO.loadObject(graphPath + "-labels.fcl"); + PermutedFrontCodedStringList filenameMap = (PermutedFrontCodedStringList) BinIO.loadObject(graphPath + "-filename-labels.fcl"); ArcLabelledNodeIterator it = graph.nodeIterator(); while (it.hasNext()) { @@ -24,14 +26,16 @@ ArcLabelledNodeIterator.LabelledArcIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { - int[] labels = (int[]) s.label().get(); + DirEntry[] labels = (DirEntry[]) s.label().get(); if (labels.length > 0) { - for (int label : labels) { + for (DirEntry label : labels) { + Permission.Type permissionType = Permission.Type.fromIntCode(label.permission); System.out.format( - "%s %s %s\n", + "%s %s %s %d\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode), - labelMap.get(label) + filenameMap.get((int) label.filenameId), + Permission.Type.toIntPerm(permissionType) ); } } else {