// ============================================================================
// File: java/src/main/java/org/softwareheritage/graph/SwhPID.java
// ============================================================================
package org.softwareheritage.graph;

import java.lang.System;
import java.util.Locale;
import java.util.regex.Pattern;

import com.fasterxml.jackson.annotation.JsonValue;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.DecoderException;

import org.softwareheritage.graph.Node;

/**
 * A Software Heritage PID, see persistent identifier documentation.
 *
 * A PID has the textual form {@code swh:1:type:hash} and a compact binary form
 * produced by {@link #toBytes()} and parsed by {@link #fromBytes(byte[])}.
 *
 * @author The Software Heritage developers
 */
public class SwhPID {
    /** Fixed hash length of the PID */
    public static final int HASH_LENGTH = 40;

    /** Size in bytes of the binary form: 1 version byte + 1 type byte + raw hash bytes. */
    private static final int BIN_SIZE = 2 + HASH_LENGTH / 2;

    /** Pre-compiled hash validator; compiled once because PIDs are built on hot lookup paths. */
    private static final Pattern HASH_PATTERN = Pattern.compile("[0-9a-f]{" + HASH_LENGTH + "}");

    /** Full PID as a string */
    String swhPID;
    /** PID node type */
    Node.Type type;

    /**
     * Constructor.
     *
     * @param swhPID full PID as a string
     * @throws IllegalArgumentException if the string is not of the form
     *     {@code swh:1:type:hash} with a {@value #HASH_LENGTH}-character lowercase hex hash
     */
    public SwhPID(String swhPID) {
        this.swhPID = swhPID;

        // PID format: 'swh:1:type:hash'
        // A negative limit keeps trailing empty fields, so that an input such as
        // "swh:1:cnt:<hash>:" is rejected instead of silently passing as four parts.
        String[] parts = swhPID.split(":", -1);
        if (parts.length != 4 || !parts[0].equals("swh") || !parts[1].equals("1")) {
            throw new IllegalArgumentException("malformed SWH PID: " + swhPID);
        }
        // NOTE(review): Node.Type.fromStr is defined elsewhere; presumably it rejects
        // unknown type tags -- confirm.
        this.type = Node.Type.fromStr(parts[2]);
        if (!HASH_PATTERN.matcher(parts[3]).matches()) {
            throw new IllegalArgumentException("malformed SWH PID: " + swhPID);
        }
    }

    @Override
    public boolean equals(Object otherObj) {
        if (otherObj == this) {
            return true;
        }
        if (!(otherObj instanceof SwhPID)) {
            return false;
        }

        SwhPID other = (SwhPID) otherObj;
        return swhPID.equals(other.getSwhPID());
    }

    @Override
    public int hashCode() {
        return swhPID.hashCode();
    }

    @Override
    public String toString() {
        return swhPID;
    }

    /**
     * Converts PID to a compact binary representation.
     *
     * The binary format is specified in the Python module
     * swh.graph.pid:str_to_bytes .
     *
     * As written here: byte 0 is the namespace version (1), byte 1 is the integer
     * code of the node type, and the remaining bytes are the decoded hash.
     *
     * @return a newly allocated array holding the binary PID
     * @throws IllegalArgumentException if the hash part is not valid hexadecimal
     */
    public byte[] toBytes() {
        byte[] bytes = new byte[BIN_SIZE];

        bytes[0] = (byte) 1; // namespace version
        bytes[1] = (byte) Node.Type.toInt(this.type); // PID type

        // The hash is the last ':'-separated field; locating it by separator avoids
        // depending on the length of the type tag.
        String hexHash = this.swhPID.substring(this.swhPID.lastIndexOf(':') + 1);
        try {
            byte[] digest = Hex.decodeHex(hexHash); // SHA1 hash
            System.arraycopy(digest, 0, bytes, 2, digest.length);
        } catch (DecoderException e) {
            throw new IllegalArgumentException("invalid hex sequence in PID: " + this.swhPID, e);
        }

        return bytes;
    }

    /**
     * Creates a SwhPID from a compact binary representation.
     *
     * The binary format is specified in the Python module
     * swh.graph.pid:str_to_bytes .
     *
     * Only the leading version byte, type byte and hash bytes are read; any extra
     * trailing bytes in {@code input} are ignored.
     *
     * @param input binary PID as produced by {@link #toBytes()}
     * @return the corresponding {@link SwhPID}
     * @throws IllegalArgumentException if {@code input} is null or too short, or if
     *     the decoded PID string is malformed
     */
    public static SwhPID fromBytes(byte[] input) {
        if (input == null || input.length < BIN_SIZE) {
            throw new IllegalArgumentException(
                "binary SWH PID must be at least " + BIN_SIZE + " bytes, got "
                    + (input == null ? "null" : String.valueOf(input.length)));
        }

        byte[] digest = new byte[BIN_SIZE - 2];
        System.arraycopy(input, 2, digest, 0, digest.length);

        // Plain concatenation and Locale.ROOT keep the result independent of the JVM
        // default locale (localized digits for the version, Turkish dotless i for the
        // lower-cased type name).
        String typeTag = Node.Type.fromInt(input[1]).toString().toLowerCase(Locale.ROOT);
        String pidStr = "swh:" + input[0] + ":" + typeTag + ":" + Hex.encodeHexString(digest);
        return new SwhPID(pidStr);
    }

    /**
     * Returns full PID as a string.
     *
     * @return full PID string
     */
    @JsonValue
    public String getSwhPID() {
        return swhPID;
    }

    /**
     * Returns PID node type.
     *
     * @return PID corresponding {@link Node.Type}
     * @see org.softwareheritage.graph.Node.Type
     */
    public Node.Type getType() {
        return type;
    }
}

// ============================================================================
// File: java/src/main/java/org/softwareheritage/graph/backend/MapFile.java
// ============================================================================
package org.softwareheritage.graph.backend;

import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UncheckedIOException;
import java.nio.channels.FileChannel;

import it.unimi.dsi.io.ByteBufferInputStream;

/**
 * Wrapper class around very big mmap()-ed file.
 *
 * Java has a limit for mmap()-ed files because of unsupported 64-bit indexing. The dsiutils
 * ByteBufferInputStream is used to overcome this Java limit.
 *
 * The file is viewed as a sequence of fixed-size records ("lines") of
 * {@code lineLength} bytes each.
 *
 * @author The Software Heritage developers
 */
public class MapFile {
    /** Memory-mapped file buffer */
    ByteBufferInputStream bufferMap;
    /** Fixed line length of the mmap()-ed file */
    int lineLength;

    /**
     * Constructor.
     *
     * @param path file path to mmap()
     * @param lineLength fixed length of a line in the file
     * @throws IOException if the file cannot be opened or mapped
     */
    public MapFile(String path, int lineLength) throws IOException {
        this.bufferMap = null;
        this.lineLength = lineLength;

        // The RandomAccessFile (and its channel) is only needed to create the mapping,
        // so it is closed as soon as the mapping exists.
        try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "r")) {
            FileChannel fileChannel = mapFile.getChannel();
            bufferMap = ByteBufferInputStream.map(fileChannel, FileChannel.MapMode.READ_ONLY);
        }
    }

    /**
     * Returns a specific line in the file.
     *
     * NOTE(review): this repositions the shared stream and then reads from it, so
     * concurrent callers on the same instance could interleave -- confirm how callers
     * use this class before sharing it across threads.
     *
     * @param lineIndex line number in the file
     * @return the line at the specified position, as a newly allocated array of
     *     {@code lineLength} bytes
     * @throws IllegalArgumentException if {@code lineIndex} is negative
     * @throws UncheckedIOException if the file ends before a full line could be read
     */
    public byte[] readAtLine(long lineIndex) {
        if (lineIndex < 0) {
            throw new IllegalArgumentException("negative line index: " + lineIndex);
        }

        byte[] buffer = new byte[lineLength];
        long position = lineIndex * (long) lineLength;
        bufferMap.position(position);

        // read() may return fewer bytes than requested (or a non-positive value at end of
        // stream); keep reading until the record is complete rather than handing back a
        // partially zero-filled buffer.
        int filled = 0;
        while (filled < lineLength) {
            int count = bufferMap.read(buffer, filled, lineLength - filled);
            if (count <= 0) {
                throw new UncheckedIOException(new EOFException(
                    "truncated map file: line " + lineIndex + " has only " + filled
                        + " of " + lineLength + " bytes"));
            }
            filled += count;
        }
        return buffer;
    }

    /**
     * Closes the mmap()-ed file.
     *
     * @throws IOException if closing the underlying stream fails
     */
    public void close() throws IOException {
        bufferMap.close();
    }
}

// ============================================================================
// File: java/src/main/java/org/softwareheritage/graph/backend/NodeIdMap.java
// ============================================================================
package org.softwareheritage.graph.backend;

import java.io.IOException;
import java.nio.ByteBuffer;

import org.softwareheritage.graph.Graph;
import org.softwareheritage.graph.SwhPID;
import org.softwareheritage.graph.backend.MapFile;

/**
 * Mapping between internal long node id and external SWH PID.
 *
 * Mappings in both directions are pre-computed and dumped on disk in the
 * {@link MapBuilder} class, then they are loaded here using mmap().
 *
 * @author The Software Heritage developers
 * @see org.softwareheritage.graph.backend.MapBuilder
 */
public class NodeIdMap {
    /** Fixed length of full SWH PID (no longer used by this class; kept for other users) */
    public static final int SWH_ID_LENGTH = 50;
    /** Fixed length of long node id (no longer used by this class; kept for other users) */
    public static final int NODE_ID_LENGTH = 20;

    /** Fixed length of binary SWH PID buffer */
    public static final int SWH_ID_BIN_SIZE = 22;
    /** Fixed length of binary node id buffer */
    public static final int NODE_ID_BIN_SIZE = 8;

    /** Graph path and basename */
    String graphPath;
    /** Number of ids to map */
    long nbIds;
    /** mmap()-ed PID_TO_NODE file */
    MapFile swhToNodeMap;
    /** mmap()-ed NODE_TO_PID file */
    MapFile nodeToSwhMap;

    /**
     * Constructor.
     *
     * @param graphPath full graph path
     * @param nbNodes number of nodes in the graph
     * @throws IOException if either mapping file cannot be opened or mapped
     */
    public NodeIdMap(String graphPath, long nbNodes) throws IOException {
        this.graphPath = graphPath;
        this.nbIds = nbNodes;

        // PID_TO_NODE records are a binary PID followed by a binary node id;
        // NODE_TO_PID records are a binary PID only.
        this.swhToNodeMap = new MapFile(graphPath + Graph.PID_TO_NODE, SWH_ID_BIN_SIZE + NODE_ID_BIN_SIZE);
        this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_PID, SWH_ID_BIN_SIZE);
    }

    /**
     * Converts SWH PID to corresponding long node id.
     *
     * @param swhPID node represented as a {@link SwhPID}
     * @return corresponding node as a long id
     * @throws IllegalArgumentException if the PID is not present in the mapping
     * @see org.softwareheritage.graph.SwhPID
     */
    public long getNodeId(SwhPID swhPID) {
        // The file is sorted by swhPID, hence we can binary search on swhPID to get corresponding
        // nodeId
        long start = 0;
        long end = nbIds - 1;

        byte[] pidBuffer = new byte[SWH_ID_BIN_SIZE];
        while (start <= end) {
            // Overflow-safe midpoint.
            long lineNumber = start + (end - start) / 2L;
            byte[] buffer = swhToNodeMap.readAtLine(lineNumber);
            System.arraycopy(buffer, 0, pidBuffer, 0, SWH_ID_BIN_SIZE);

            // Comparison is done on the textual PID, as the ordering of the file is
            // expressed in terms of swhPID.
            String currentSwhPID = SwhPID.fromBytes(pidBuffer).getSwhPID();

            int cmp = currentSwhPID.compareTo(swhPID.toString());
            if (cmp == 0) {
                // The node id occupies the bytes right after the PID; ByteBuffer reads
                // it in its default (big-endian) byte order. Decoded only on a match.
                return ByteBuffer.wrap(buffer, SWH_ID_BIN_SIZE, NODE_ID_BIN_SIZE).getLong();
            } else if (cmp < 0) {
                start = lineNumber + 1;
            } else {
                end = lineNumber - 1;
            }
        }

        throw new IllegalArgumentException("Unknown SWH PID: " + swhPID);
    }

    /**
     * Converts a node long id to corresponding SWH PID.
     *
     * @param nodeId node as a long id
     * @return corresponding node as a {@link SwhPID}
     * @throws IllegalArgumentException if {@code nodeId} is negative or not smaller
     *     than the number of ids
     * @see org.softwareheritage.graph.SwhPID
     */
    public SwhPID getSwhPID(long nodeId) {
        // Each record in NODE_TO_PID is one binary swhPID
        // The file is ordered by nodeId, meaning node0's swhPID is at record 0, hence we can read
        // the nodeId-th record to get corresponding swhPID
        if (nodeId < 0 || nodeId >= nbIds) {
            throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds);
        }

        return SwhPID.fromBytes(nodeToSwhMap.readAtLine(nodeId));
    }

    /**
     * Closes the mapping files.
     *
     * @throws IOException if closing either mapping fails
     */
    public void close() throws IOException {
        // Make sure the second mapping is closed even if closing the first one fails.
        try {
            swhToNodeMap.close();
        } finally {
            nodeToSwhMap.close();
        }
    }
}