diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java index 770d150..637daee 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java @@ -1,324 +1,323 @@ package org.softwareheritage.graph; import it.unimi.dsi.big.util.MappedFrontCodedStringBigList; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.bytes.ByteBigList; import it.unimi.dsi.fastutil.bytes.ByteMappedBigList; import it.unimi.dsi.fastutil.ints.IntBigList; import it.unimi.dsi.fastutil.ints.IntMappedBigList; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.longs.LongMappedBigList; import it.unimi.dsi.fastutil.shorts.ShortBigList; import it.unimi.dsi.fastutil.shorts.ShortMappedBigList; import it.unimi.dsi.sux4j.util.EliasFanoLongBigList; import org.apache.commons.configuration2.ex.ConfigurationException; import org.softwareheritage.graph.maps.NodeIdMap; import org.softwareheritage.graph.maps.NodeTypesMap; import java.io.IOException; import java.io.RandomAccessFile; import java.util.Base64; /** * This objects contains SWH graph properties such as node labels. * * Some property mappings are necessary because Software Heritage uses string based persistent * identifiers (SWHID) while WebGraph uses integers internally. * * The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph * using SWHID) and the output (convert back to SWHID for users results). * * Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a * long id → node type map is stored as well to avoid a full SWHID lookup. * * @see NodeIdMap * @see NodeTypesMap */ public class SwhGraphProperties { private final String path; private final NodeIdMap nodeIdMap; private final NodeTypesMap nodeTypesMap; private LongBigList authorTimestamp; private ShortBigList authorTimestampOffset; private LongBigList committerTimestamp; private ShortBigList committerTimestampOffset; private LongBigList contentLength; private LongArrayBitVector contentIsSkipped; private IntBigList authorId; private IntBigList committerId; private ByteBigList messageBuffer; private LongBigList messageOffsets; private ByteBigList tagNameBuffer; private LongBigList tagNameOffsets; private MappedFrontCodedStringBigList edgeLabelNames; protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) { this.path = path; this.nodeIdMap = nodeIdMap; this.nodeTypesMap = nodeTypesMap; } public static SwhGraphProperties load(String path) throws IOException { return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path)); } /** * Cleans up resources after use. */ public void close() throws IOException { - nodeIdMap.close(); edgeLabelNames.close(); } /** Return the basename of the compressed graph */ public String getPath() { return path; } /** * Converts {@link SWHID} node to long. * * @param swhid node specified as a {@link SWHID} * @return internal long node id * @see SWHID */ public long getNodeId(SWHID swhid) { return nodeIdMap.getNodeId(swhid); } /** * Converts long id node to {@link SWHID}. * * @param nodeId node specified as a long id * @return external SWHID * @see SWHID */ public SWHID getSWHID(long nodeId) { return nodeIdMap.getSWHID(nodeId); } /** * Returns node type. * * @param nodeId node specified as a long id * @return corresponding node type * @see Node.Type */ public Node.Type getNodeType(long nodeId) { return nodeTypesMap.getType(nodeId); } private static LongBigList loadMappedLongs(String path) throws IOException { try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { return LongMappedBigList.map(raf.getChannel()); } } private static IntBigList loadMappedInts(String path) throws IOException { try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { return IntMappedBigList.map(raf.getChannel()); } } private static ShortBigList loadMappedShorts(String path) throws IOException { try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { return ShortMappedBigList.map(raf.getChannel()); } } private static ByteBigList loadMappedBytes(String path) throws IOException { try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { return ByteMappedBigList.map(raf.getChannel()); } } private static LongBigList loadEFLongs(String path) throws IOException { try { return (EliasFanoLongBigList) BinIO.loadObject(path); } catch (ClassNotFoundException e) { throw new IOException(e); } } private static byte[] getLine(ByteBigList byteArray, long start) { long end = start; while (end < byteArray.size64() && byteArray.getByte(end) != '\n') { end++; } int length = (int) (end - start); byte[] buffer = new byte[length]; byteArray.getElements(start, buffer, 0, length); return buffer; } /** Load the sizes of the content nodes */ public void loadContentLength() throws IOException { contentLength = loadMappedLongs(path + ".property.content.length.bin"); } /** Get the size (in bytes) of the given content node */ public Long getContentLength(long nodeId) { if (contentLength == null) { throw new IllegalStateException("Content lengths not loaded"); } long res = contentLength.getLong(nodeId); return (res >= 0) ? res : null; } /** Load the IDs of the persons (authors and committers) */ public void loadPersonIds() throws IOException { authorId = loadMappedInts(path + ".property.author_id.bin"); committerId = loadMappedInts(path + ".property.committer_id.bin"); } /** Get a unique integer ID representing the author of the given revision or release node */ public Long getAuthorId(long nodeId) { if (authorId == null) { throw new IllegalStateException("Author IDs not loaded"); } long res = authorId.getInt(nodeId); return (res >= 0) ? res : null; } /** Get a unique integer ID representing the committer of the given revision node */ public Long getCommitterId(long nodeId) { if (committerId == null) { throw new IllegalStateException("Committer IDs not loaded"); } long res = committerId.getInt(nodeId); return (res >= 0) ? res : null; } /** * Loads a boolean array indicating whether the given content node was skipped during archive * ingestion */ public void loadContentIsSkipped() throws IOException { try { contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin"); } catch (ClassNotFoundException e) { throw new IOException(e); } } /** Returns whether the given content node was skipped during archive ingestion */ public boolean isContentSkipped(long nodeId) { if (contentIsSkipped == null) { throw new IllegalStateException("Skipped content array not loaded"); } return contentIsSkipped.getBoolean(nodeId); } /** Load the timestamps at which the releases and revisions were authored */ public void loadAuthorTimestamps() throws IOException { authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin"); authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin"); } /** Return the timestamp at which the given revision or release was authored */ public Long getAuthorTimestamp(long nodeId) { if (authorTimestamp == null) { throw new IllegalStateException("Author timestamps not loaded"); } long res = authorTimestamp.getLong(nodeId); return (res > Long.MIN_VALUE) ? res : null; } /** Return the timestamp offset at which the given revision or release was authored */ public Short getAuthorTimestampOffset(long nodeId) { if (authorTimestampOffset == null) { throw new IllegalStateException("Author timestamp offsets not loaded"); } short res = authorTimestampOffset.getShort(nodeId); return (res > Short.MIN_VALUE) ? res : null; } /** Load the timestamps at which the releases and revisions were committed */ public void loadCommitterTimestamps() throws IOException { committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin"); committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin"); } /** Return the timestamp at which the given revision was committed */ public Long getCommitterTimestamp(long nodeId) { if (committerTimestamp == null) { throw new IllegalStateException("Committer timestamps not loaded"); } long res = committerTimestamp.getLong(nodeId); return (res > Long.MIN_VALUE) ? res : null; } /** Return the timestamp offset at which the given revision was committed */ public Short getCommitterTimestampOffset(long nodeId) { if (committerTimestampOffset == null) { throw new IllegalStateException("Committer timestamp offsets not loaded"); } short res = committerTimestampOffset.getShort(nodeId); return (res > Short.MIN_VALUE) ? res : null; } /** Load the revision messages, the release messages and the origin URLs */ public void loadMessages() throws IOException { messageBuffer = loadMappedBytes(path + ".property.message.bin"); messageOffsets = loadMappedLongs(path + ".property.message.offset.bin"); } /** Get the message of the given revision or release node */ public byte[] getMessage(long nodeId) { if (messageBuffer == null || messageOffsets == null) { throw new IllegalStateException("Messages not loaded"); } long startOffset = messageOffsets.getLong(nodeId); if (startOffset == -1) { return null; } return Base64.getDecoder().decode(getLine(messageBuffer, startOffset)); } /** Get the URL of the given origin node */ public String getUrl(long nodeId) { byte[] url = getMessage(nodeId); return (url != null) ? new String(url) : null; } /** Load the release names */ public void loadTagNames() throws IOException { tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin"); tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin"); } /** Get the name of the given release node */ public byte[] getTagName(long nodeId) { if (tagNameBuffer == null || tagNameOffsets == null) { throw new IllegalStateException("Tag names not loaded"); } long startOffset = tagNameOffsets.getLong(nodeId); if (startOffset == -1) { return null; } return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset)); } /** Load the arc label names (directory entry names and snapshot branch names) */ public void loadLabelNames() throws IOException { try { edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl"); } catch (ConfigurationException e) { throw new IOException(e); } } /** * Get the arc label name (either a directory entry name or snapshot branch name) associated with * the given label ID */ public byte[] getLabelName(long labelId) { if (edgeLabelNames == null) { throw new IllegalStateException("Label names not loaded"); } return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId)); } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/MapFile.java b/java/src/main/java/org/softwareheritage/graph/maps/MapFile.java deleted file mode 100644 index fe4dec6..0000000 --- a/java/src/main/java/org/softwareheritage/graph/maps/MapFile.java +++ /dev/null @@ -1,66 +0,0 @@ -package org.softwareheritage.graph.maps; - -import it.unimi.dsi.io.ByteBufferInputStream; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; - -/** - * Wrapper class around very big mmap()-ed file. - *

- * Java has a limit for mmap()-ed files because of unsupported 64-bit indexing. The - * dsiutils ByteBufferInputStream is used to overcome - * this Java limit. - * - * @author The Software Heritage developers - */ - -public class MapFile { - /** Memory-mapped file buffer */ - ByteBufferInputStream bufferMap; - /** Fixed line length of the mmap()-ed file */ - int lineLength; - - /** - * Constructor. - * - * @param path file path to mmap() - * @param lineLength fixed length of a line in the file - */ - public MapFile(String path, int lineLength) throws IOException { - this.bufferMap = null; - this.lineLength = lineLength; - - try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "r")) { - FileChannel fileChannel = mapFile.getChannel(); - bufferMap = ByteBufferInputStream.map(fileChannel, FileChannel.MapMode.READ_ONLY); - } - } - - /** - * Returns a specific line in the file. - * - * @param lineIndex line number in the file - * @return the line at the specified position - */ - public byte[] readAtLine(long lineIndex) { - byte[] buffer = new byte[lineLength]; - long position = lineIndex * (long) lineLength; - bufferMap.position(position); - bufferMap.read(buffer, 0, lineLength); - return buffer; - } - - public long size() { - return bufferMap.length() / (long) lineLength; - } - - /** - * Closes the mmap()-ed file. - */ - public void close() throws IOException { - bufferMap.close(); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java index 93e1ebb..7ca8c77 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java @@ -1,191 +1,189 @@ package org.softwareheritage.graph.maps; import it.unimi.dsi.fastutil.Size64; +import it.unimi.dsi.fastutil.bytes.ByteBigList; +import it.unimi.dsi.fastutil.bytes.ByteMappedBigList; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; +import it.unimi.dsi.fastutil.longs.LongMappedBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; -import it.unimi.dsi.util.ByteBufferLongBigList; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.compress.NodeMapBuilder; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; import java.nio.charset.StandardCharsets; /** * Mapping between internal long node id and external SWHID. *

* The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using * an mmap()-ed .order file containing the graph permutation. * * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the * {@link NodeMapBuilder} class, then it is loaded here using mmap(). * * @author The Software Heritage developers * @see NodeMapBuilder */ public class NodeIdMap implements Size64 { /** Fixed length of binary SWHID buffer */ public static final int SWHID_BIN_SIZE = 22; /** File extension for the long node id to SWHID map */ public static final String NODE_TO_SWHID = ".node2swhid.bin"; /** Graph path and basename */ String graphPath; /** mmap()-ed NODE_TO_SWHID file */ - MapFile nodeToSwhMap; + ByteBigList nodeToSwhMap; /** Minimal perfect hash (MPH) function SWHID -> initial order */ Object2LongFunction mph; /** mmap()-ed long list with the permutation initial order -> graph order */ LongBigList orderMap; /** * Constructor. * * @param graphPath full graph path */ public NodeIdMap(String graphPath) throws IOException { this.graphPath = graphPath; // node -> SWHID - this.nodeToSwhMap = new MapFile(graphPath + NODE_TO_SWHID, SWHID_BIN_SIZE); + try (RandomAccessFile raf = new RandomAccessFile(graphPath + NODE_TO_SWHID, "r")) { + this.nodeToSwhMap = ByteMappedBigList.map(raf.getChannel()); + } // SWHID -> node this.mph = loadMph(graphPath + ".mph"); - try (RandomAccessFile mapFile = new RandomAccessFile(new File(graphPath + ".order"), "r")) { - FileChannel fileChannel = mapFile.getChannel(); - this.orderMap = ByteBufferLongBigList.map(fileChannel); + this.orderMap = LongMappedBigList.map(mapFile.getChannel()); } } @SuppressWarnings("unchecked") public static Object2LongFunction loadMph(String path) throws IOException { Object obj; try { obj = BinIO.loadObject(path); } catch (ClassNotFoundException e) { throw new IOException(e.getMessage()); } Object2LongFunction res = (Object2LongFunction) obj; // Backward-compatibility for old maps parametrized with . // New maps should be parametrized with , which is faster. try { // Try to call it with bytes, will fail if it's a O2LF. res.getLong("42".getBytes(StandardCharsets.UTF_8)); } catch (ClassCastException e) { class StringCompatibleByteFunction implements Object2LongFunction, Size64 { private final Object2LongFunction legacyFunction; public StringCompatibleByteFunction(Object2LongFunction legacyFunction) { this.legacyFunction = legacyFunction; } @Override public long getLong(Object o) { byte[] bi = (byte[]) o; return legacyFunction.getLong(new String(bi, StandardCharsets.UTF_8)); } + @SuppressWarnings("deprecation") @Override public int size() { return legacyFunction.size(); } @Override public long size64() { return (legacyFunction instanceof Size64) ? ((Size64) legacyFunction).size64() : legacyFunction.size(); } } Object2LongFunction mphLegacy = (Object2LongFunction) obj; return new StringCompatibleByteFunction(mphLegacy); } // End of backward-compatibility block return res; } /** * Converts byte-form SWHID to corresponding long node id. Low-level function, does not check if the * SWHID is valid. * * @param swhid node represented as bytes * @return corresponding node as a long id */ public long getNodeId(byte[] swhid) { // 1. Hash the SWHID with the MPH to get its original ID long origNodeId = mph.getLong(swhid); // 2. Use the order permutation to get the position in the permuted graph return this.orderMap.getLong(origNodeId); } /** * Converts SWHID to corresponding long node id. * * @param swhid node represented as a {@link SWHID} * @param checkExists if true, error if the SWHID is not present in the graph, if false the check * will be skipped and invalid data will be returned for non-existing SWHIDs. * @return corresponding node as a long id * @see SWHID */ public long getNodeId(SWHID swhid, boolean checkExists) { // Convert the SWHID to bytes and call getNodeId() long nodeId = getNodeId(swhid.toString().getBytes(StandardCharsets.US_ASCII)); // Check that the position effectively corresponds to a real node using the reverse map. // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid. if (!checkExists || getSWHID(nodeId).equals(swhid)) { return nodeId; } else { throw new IllegalArgumentException("Unknown SWHID: " + swhid); } } public long getNodeId(SWHID swhid) { return getNodeId(swhid, true); } /** * Converts a node long id to corresponding SWHID. * * @param nodeId node as a long id * @return corresponding node as a {@link SWHID} * @see SWHID */ public SWHID getSWHID(long nodeId) { /* * Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's * swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid */ - if (nodeId < 0 || nodeId >= nodeToSwhMap.size()) { - throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nodeToSwhMap.size()); + if (nodeId < 0 || nodeId >= nodeToSwhMap.size64()) { + throw new IllegalArgumentException( + "Node id " + nodeId + " should be between 0 and " + nodeToSwhMap.size64()); } - return SWHID.fromBytes(nodeToSwhMap.readAtLine(nodeId)); - } - - /** - * Closes the mapping files. - */ - public void close() throws IOException { - nodeToSwhMap.close(); + byte[] swhid = new byte[SWHID_BIN_SIZE]; + nodeToSwhMap.getElements(nodeId * SWHID_BIN_SIZE, swhid, 0, SWHID_BIN_SIZE); + return SWHID.fromBytes(swhid); } /** Return the number of nodes in the map. */ @Override public long size64() { - return nodeToSwhMap.size(); + return nodeToSwhMap.size64(); } }