diff --git a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java @@ -0,0 +1,95 @@ +// TODO: should be in webgraph upstream +// package it.unimi.dsi.big.webgraph.labelling; +package org.softwareheritage.graph.labels; + +/* + * Copyright (C) 2020 TODO + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.Label; + +import java.util.Arrays; + +/** An abstract (single-attribute) list-of-longs label. +* +*

This class provides basic methods for a label holding a list of longs. +* Concrete implementations may impose further requirements on the long. +* +*

Implementing subclasses must provide constructors, {@link Label#copy()}, +* {@link Label#fromBitStream(it.unimi.dsi.io.InputBitStream, int)}, {@link Label#toBitStream(it.unimi.dsi.io.OutputBitStream, int)} +* and possibly override {@link #toString()}. +*/ + +public abstract class AbstractLongListLabel extends AbstractLabel implements Label { + /** The key of the attribute represented by this label. */ + protected final String key; + /** The values of the attribute represented by this label. */ + public long[] value; + + /** Creates an long label with given key and value. + * + * @param key the (only) key of this label. + * @param value the value of this label. + */ + public AbstractLongListLabel(String key, long[] value) { + this.key = key; + this.value = value; + } + + @Override + public String wellKnownAttributeKey() { + return key; + } + + @Override + public String[] attributeKeys() { + return new String[] { key }; + } + + @Override + public Class[] attributeTypes() { + return new Class[] { long[].class }; + } + + @Override + public Object get(String key) { + if (this.key.equals(key)) return value; + throw new IllegalArgumentException(); + } + + @Override + public Object get() { + return value; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value); + } + + @Override + public boolean equals(Object x) { + if (x instanceof AbstractLongListLabel) return Arrays.equals(value, ((AbstractLongListLabel)x).value); + else return false; + } + + @Override + public int hashCode() { + return Arrays.hashCode(value); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java @@ -0,0 +1,131 @@ +package org.softwareheritage.graph.labels; + +/** + * Directory entries metadata are stored as edge labels on the graph. {@link DirEntry} can be encoded in a single long + * type, to re-use Webgraph interface. + * + * @author The Software Heritage developers + */ +public class DirEntry { + public long filenameId; + public int permission; + + public DirEntry(long filenameId, int permission) + { + this.filenameId = filenameId; + this.permission = permission; + } + + public DirEntry(long dirEntryEncoded) + { + this.filenameId = dirEntryEncoded >> Permission.NB_BITS_PER_TYPE; + int dirBytes = (int) (dirEntryEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1)); + this.permission = Permission.Type.fromEncoded(dirBytes); + } + + public long toEncoded() + { + return (filenameId << Permission.NB_BITS_PER_TYPE) + Permission.Type.toEncoded(permission); + } + + public static int labelWidth(long numLabels) + { + int filenameIdWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); + if (filenameIdWidth > Long.SIZE - Permission.NB_BITS_PER_TYPE) { + System.err.println("FIXME: Too many filenames, we can't handle more than 2^" + (Long.SIZE - Permission.NB_BITS_PER_TYPE) + " for now."); + System.exit(2); + } + return filenameIdWidth + Permission.NB_BITS_PER_TYPE; + } + + /** + * Permission types present in the Software Heritage graph. + * + * @author The Software Heritage developers + */ + private static class Permission { + public static final int NB_BITS_PER_TYPE = + (int) Math.ceil(Math.log(Permission.Type.values().length) / Math.log(2)); + + public enum Type { + CONTENT, + EXECUTABLE_CONTENT, + SYMLINK, + DIRECTORY, + REVISION; + + public static Permission.Type fromIntCode(int intCode) { + switch (intCode) { + case 0: + return CONTENT; + case 1: + return EXECUTABLE_CONTENT; + case 2: + return SYMLINK; + case 3: + return DIRECTORY; + case 4: + return REVISION; + } + throw new IllegalArgumentException("Unknown node permission code: " + intCode); + } + + public static int toIntCode(Permission.Type type) { + switch (type) { + case CONTENT: + return 0; + case EXECUTABLE_CONTENT: + return 1; + case SYMLINK: + return 2; + case DIRECTORY: + return 3; + case REVISION: + return 4; + } + throw new IllegalArgumentException("Unknown node permission type: " + type); + } + + public static Permission.Type fromIntPerm(int intPerm) { + switch (intPerm) { + case 0100644: + return CONTENT; + case 0100755: + return EXECUTABLE_CONTENT; + case 0120000: + return SYMLINK; + case 0040000: + return DIRECTORY; + case 0160000: + return REVISION; + } + throw new IllegalArgumentException("Unknown node permission: " + intPerm); + } + + public static int toIntPerm(Permission.Type type) { + switch (type) { + case CONTENT: + return 0100644; + case EXECUTABLE_CONTENT: + return 0100755; + case SYMLINK: + return 0120000; + case DIRECTORY: + return 0040000; + case REVISION: + return 0160000; + } + throw new IllegalArgumentException("Unknown node permission type: " + type); + } + + public static int fromEncoded(int encoded) { + return toIntPerm(fromIntCode(encoded)); + } + + public static int toEncoded(int permission) { + return toIntCode(fromIntPerm(permission)); + } + } + } + +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java @@ -0,0 +1,108 @@ +// TODO: should be in webgraph upstream +// package it.unimi.dsi.big.webgraph.labelling; +package org.softwareheritage.graph.labels; + +/* + * Copyright (C) 2020 TODO + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + + +import it.unimi.dsi.big.webgraph.labelling.Label; +import it.unimi.dsi.fastutil.longs.LongArrays; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; + +import java.io.IOException; +import java.util.Arrays; + +/** A list of longs represented in fixed width. Each list is prefixed by its + * length written in {@linkplain OutputBitStream#writeGamma(int) γ + * coding}. + */ + +public class FixedWidthLongListLabel extends org.softwareheritage.graph.labels.AbstractLongListLabel { + /** The bit width used to represent the value of this label. */ + private final int width; + + /** Creates a new fixed-width long label. + * + * @param key the (only) key of this label. + * @param width the label width (in bits). + * @param value the value of this label. + */ + public FixedWidthLongListLabel(String key, int width, long[] value) { + super(key, value); + for(int i = value.length; i-- != 0;) if (value[i] < 0 || value[i] >= 1L << width) throw new IllegalArgumentException("Value out of range: " + Long.toString(value[i])); + this.width = width; + } + + /** Creates a new fixed-width label with an empty list. + * + * @param key the (only) key of this label. + * @param width the label width (in bits). + */ + public FixedWidthLongListLabel(String key, int width) { + this(key, width, LongArrays.EMPTY_ARRAY); + } + + /** Creates a new fixed-width long label using the given key and width + * with an empty list. + * + * @param arg two strings containing the key and the width of this label. + */ + public FixedWidthLongListLabel(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } + + @Override + public Label copy() { + return new FixedWidthLongListLabel(key, width, value.clone()); + } + + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + long readBits = inputBitStream.readBits(); + value = new long[inputBitStream.readGamma()]; + for(int i = 0; i < value.length; i++) value[i] = inputBitStream.readLong(width); + return (int)(inputBitStream.readBits() - readBits); + } + + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + int bits = outputBitStream.writeGamma(value.length); + for(int i = 0; i < value.length; i++) bits += outputBitStream.writeLong(value[i], width); + return bits; + } + + /** Returns -1 (the fixed width refers to a single long, not to the entire list). + * @return -1; + */ + @Override + public int fixedWidth() { + return -1; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } + + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java new file mode 100644 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java @@ -0,0 +1,108 @@ +package org.softwareheritage.graph.labels; + +import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.Label; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Software Heritage graph edge labels following Webgraph labels convention. + * + * @author The Software Heritage developers + */ + +public class SwhLabel extends AbstractLabel { + private final String key; + private final int width; + // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry to store branch names) + public DirEntry[] value; + // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long + private final FixedWidthLongListLabel longList; + + private static final DirEntry[] EMPTY_ARRAY = {}; + + public SwhLabel(String key, int width, DirEntry[] value) { + this.key = key; + this.width = width; + this.value = value; + + long[] valueEncoded = new long[value.length]; + for (int i = 0; i < value.length; i++) + valueEncoded[i] = value[i].toEncoded(); + this.longList = new FixedWidthLongListLabel(key, width, valueEncoded); + } + + public SwhLabel(String key, int width) { + this(key, width, EMPTY_ARRAY); + } + + public SwhLabel(String... arg) { + this(arg[0], Integer.parseInt(arg[1])); + } + + @Override + public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { + int ret = longList.fromBitStream(inputBitStream, sourceUnused); + // Decode values from their internal long representation + value = new DirEntry[longList.value.length]; + for (int i = 0; i < value.length; i++) + value[i] = new DirEntry(longList.value[i]); + return ret; + } + + @Override + public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { + // Values have already been encoded in the SwhLabel constructor + return longList.toBitStream(outputBitStream, sourceUnused); + } + + @Override + public String wellKnownAttributeKey() { + return key; + } + + @Override + public String[] attributeKeys() { + return new String[]{key}; + } + + @Override + public Class[] attributeTypes() { + return new Class[]{DirEntry[].class}; + } + + @Override + public Object get(String s) { + if (this.key.equals(key)) + return value; + throw new IllegalArgumentException(); + } + + @Override + public Object get() { + return value; + } + + @Override + public Label copy() { + return new SwhLabel(key, width, value.clone()); + } + + @Override + public int fixedWidth() { + return -1; + } + + @Override + public String toString() { + return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; + } + + @Override + public String toSpec() { + return this.getClass().getName() + "(" + key + "," + width + ")"; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java --- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java @@ -4,7 +4,6 @@ import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; -import it.unimi.dsi.big.webgraph.labelling.FixedWidthIntListLabel; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; @@ -19,6 +18,8 @@ import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.labels.DirEntry; +import org.softwareheritage.graph.labels.SwhLabel; import java.io.*; import java.nio.charset.StandardCharsets; @@ -96,13 +97,10 @@ long[][] orderMap = LongBigArrays.newBigArray(getMPHSize(swhIdMph)); BinIO.loadLongs(graphPath + ".order", orderMap); - Object2LongFunction labelMPH = loadMPH(graphPath + "-labels"); - long numLabels = getMPHSize(labelMPH); - int labelWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); - if (labelWidth > 30) { - logger.error("FIXME: Too many labels, we can't handle more than 2^30 for now."); - System.exit(2); - } + Object2LongFunction filenameMph = loadMPH(graphPath + "-filename-labels"); + long numFilenames = getMPHSize(filenameMph); + + int totalLabelWidth = DirEntry.labelWidth(numFilenames); ProgressLogger plInter = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plInter.itemsName = "edges"; @@ -113,8 +111,13 @@ * appear in the label file. */ ProcessBuilder processBuilder = new ProcessBuilder(); - processBuilder.command("sort", "-k1,1n", "-k2,2n", "-k3,3n", // Numerical sort on all fields - "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); + processBuilder.command( + "sort", + "-k1,1n", "-k2,2n", // Numerical sort + "--numeric-sort", + "--buffer-size", SORT_BUFFER_SIZE, + "--temporary-directory", tmpDir + ); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); @@ -125,14 +128,16 @@ plInter.start("Piping intermediate representation to sort(1)"); while (edgeIterator.hasNext()) { String[] edge = edgeIterator.next().toString().split(" "); - if (edge.length < 3) + if (edge.length < 4) continue; long srcNode = SwhIDToNode(edge[0], swhIdMph, orderMap); long dstNode = SwhIDToNode(edge[1], swhIdMph, orderMap); - long labelId = labelMPH.getLong(edge[2]); + long filenameId = filenameMph.getLong(edge[2]); + int permission = Integer.parseInt(edge[3]); - sort_stdin.write((srcNode + "\t" + dstNode + "\t" + labelId + "\n").getBytes(StandardCharsets.US_ASCII)); + sort_stdin.write((srcNode + "\t" + dstNode + "\t" + filenameId + "\t" + permission + "\n") + .getBytes(StandardCharsets.US_ASCII)); plInter.lightUpdate(); } plInter.done(); @@ -159,18 +164,23 @@ NodeIterator it = graph.nodeIterator(); long labelSrcNode = -1; long labelDstNode = -1; - long labelId = -1; + long labelFilenameId = -1; + int labelPermission = -1; while (it.hasNext()) { long srcNode = it.nextLong(); // Fill a hashmap with the labels of each edge starting from this node - HashMap> successorsLabels = new HashMap<>(); + HashMap> successorsLabels = new HashMap<>(); while (labelSrcNode <= srcNode) { if (labelSrcNode == srcNode) { - successorsLabels.computeIfAbsent(labelDstNode, k -> new ArrayList<>()).add(labelId); + successorsLabels + .computeIfAbsent( + labelDstNode, + k -> new ArrayList<>() + ).add(new DirEntry(labelFilenameId, labelPermission)); if (debugFile != null) { - debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelId + "\n"); + debugFile.write(labelSrcNode + " " + labelDstNode + " " + labelFilenameId + " " + labelPermission + "\n"); } } @@ -181,18 +191,17 @@ String[] parts = line.split("\\t"); labelSrcNode = Long.parseLong(parts[0]); labelDstNode = Long.parseLong(parts[1]); - labelId = Long.parseLong(parts[2]); + labelFilenameId = Long.parseLong(parts[2]); + labelPermission = Integer.parseInt(parts[3]); } int bits = 0; LazyLongIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { - List edgeLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); - bits += labels.writeGamma(edgeLabels.size()); - for (Long label : edgeLabels) { - bits += labels.writeLong(label, labelWidth); - } + List currentLabels = successorsLabels.getOrDefault(dstNode, Collections.emptyList()); + SwhLabel l = new SwhLabel("edgelabel", totalLabelWidth, currentLabels.toArray(new DirEntry[0])); + bits += l.toBitStream(labels, -1); } offsets.writeGamma(bits); @@ -207,8 +216,7 @@ PrintWriter pw = new PrintWriter(new FileWriter((new File(graphPath)).getName() + "-labelled.properties")); pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); - pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " - + FixedWidthIntListLabel.class.getName() + "(TEST," + labelWidth + ")"); + pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + SwhLabel.class.getName() + "(DirEntry," + totalLabelWidth + ")" ); pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + graphPath); pw.close(); } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java --- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java @@ -1,10 +1,11 @@ package org.softwareheritage.graph.utils; +import it.unimi.dsi.big.util.PermutedFrontCodedStringBigList; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.fastutil.io.BinIO; -import it.unimi.dsi.util.PermutedFrontCodedStringList; +import org.softwareheritage.graph.labels.DirEntry; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; @@ -15,8 +16,7 @@ ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.loadOffline(graphPath + "-labelled"); NodeIdMap nodeMap = new NodeIdMap(graphPath, graph.numNodes()); - PermutedFrontCodedStringList labelMap = (PermutedFrontCodedStringList) BinIO - .loadObject(graphPath + "-labels.fcl"); + PermutedFrontCodedStringBigList filenameMap = (PermutedFrontCodedStringBigList) BinIO.loadObject(graphPath + "-filename-labels.fcl"); ArcLabelledNodeIterator it = graph.nodeIterator(); while (it.hasNext()) { @@ -25,11 +25,16 @@ ArcLabelledNodeIterator.LabelledArcIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { - int[] labels = (int[]) s.label().get(); + DirEntry[] labels = (DirEntry[]) s.label().get(); if (labels.length > 0) { - for (int label : labels) { - System.out.format("%s %s %s\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode), - labelMap.get(label)); + for (DirEntry label : labels) { + System.out.format( + "%s %s %s %d\n", + nodeMap.getSWHID(srcNode), + nodeMap.getSWHID(dstNode), + filenameMap.get(label.filenameId), + label.permission + ); } } else { System.out.format("%s %s\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode));