diff --git a/java/server/pom.xml b/java/server/pom.xml index 8d70888..2fed083 100644 --- a/java/server/pom.xml +++ b/java/server/pom.xml @@ -1,209 +1,214 @@ 4.0.0 org.softwareheritage.graph swh-graph ${git.closest.tag.name} swh-graph https://forge.softwareheritage.org/source/swh-graph/ UTF-8 11 ch.qos.logback logback-classic 1.2.3 junit junit 4.11 test org.hamcrest hamcrest 2.1 test io.javalin javalin 3.0.0 org.slf4j slf4j-simple 1.7.26 com.fasterxml.jackson.core jackson-databind 2.9.8 it.unimi.dsi webgraph-big 3.5.1 it.unimi.dsi fastutil 8.3.0 it.unimi.dsi law-library 2.6.0 org.apache.hadoop hadoop-common org.umlgraph umlgraph org.eclipse.jetty.aggregate jetty-all com.martiansoftware jsap 2.1 net.sf.py4j py4j 0.10.8.1 + + commons-codec + commons-codec + 1.11 + maven-clean-plugin 3.1.0 maven-resources-plugin 3.0.2 maven-compiler-plugin 3.8.0 -verbose -Xlint:all maven-surefire-plugin 2.22.1 maven-jar-plugin 3.0.2 maven-install-plugin 2.5.2 maven-deploy-plugin 2.8.2 maven-site-plugin 3.7.1 maven-project-info-reports-plugin 3.0.0 maven-assembly-plugin org.softwareheritage.graph.App jar-with-dependencies false make-assembly package single pl.project13.maven git-commit-id-plugin 3.0.1 get-the-git-infos revision initialize true true true true v* git.closest.tag.name ^v true org.apache.maven.plugins maven-javadoc-plugin 3.1.1 diff --git a/java/server/src/main/java/org/softwareheritage/graph/Graph.java b/java/server/src/main/java/org/softwareheritage/graph/Graph.java index 1221984..2c86c48 100644 --- a/java/server/src/main/java/org/softwareheritage/graph/Graph.java +++ b/java/server/src/main/java/org/softwareheritage/graph/Graph.java @@ -1,227 +1,227 @@ package org.softwareheritage.graph; import java.io.IOException; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.lang.FlyweightPrototype; import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SwhPID; import org.softwareheritage.graph.backend.NodeIdMap; import org.softwareheritage.graph.backend.NodeTypesMap; /** * Main class storing the compressed graph and node id mappings. *

* The compressed graph is stored using the WebGraph * ecosystem. Additional mappings are necessary because Software Heritage uses string based persistent * identifiers (PID) while WebGraph uses integers internally. These two mappings (long id ↔ * PID) are used for the input (users refer to the graph using PID) and the output (convert back to * PID for users results). However, since graph traversal can be restricted depending on the node * type (see {@link AllowedEdges}), a long id → node type map is stored as well to avoid a full * PID lookup. * * @author The Software Heritage developers * @see org.softwareheritage.graph.AllowedEdges * @see org.softwareheritage.graph.backend.NodeIdMap * @see org.softwareheritage.graph.backend.NodeTypesMap */ public class Graph implements FlyweightPrototype { /** File extension for the SWH PID to long node id map */ - public static final String PID_TO_NODE = ".pid2node.csv"; + public static final String PID_TO_NODE = ".pid2node.bin"; /** File extension for the long node id to SWH PID map */ - public static final String NODE_TO_PID = ".node2pid.csv"; - /** File extension for the long node id to node typ map */ + public static final String NODE_TO_PID = ".node2pid.bin"; + /** File extension for the long node id to node type map */ public static final String NODE_TO_TYPE = ".node2type.map"; /** Compressed graph stored as a {@link it.unimi.dsi.big.webgraph.BVGraph} */ BVGraph graph; /** Transposed compressed graph (used for backward traversals) */ BVGraph graphTransposed; /** Path and basename of the compressed graph */ String path; /** Mapping long id ↔ SWH PIDs */ NodeIdMap nodeIdMap; /** Mapping long id → node types */ NodeTypesMap nodeTypesMap; /** * Constructor. * * @param path path and basename of the compressed graph to load */ public Graph(String path) throws IOException { this.graph = BVGraph.load(path); this.graphTransposed = BVGraph.load(path + "-transposed"); this.path = path; this.nodeTypesMap = new NodeTypesMap(path); // TODO: we don't need to load the nodeIdMap now that all the // translation between ints and PIDs is happening on the Python side. // However, some parts of the code still depend on this, so it's // commented out while we decide on what to do with it. this.nodeIdMap = null; // new NodeIdMap(path, getNbNodes()); } // Protected empty constructor to implement copy() protected Graph() { } /** * Return a flyweight copy of the graph. */ @Override public Graph copy() { final Graph ng = new Graph(); ng.graph = this.graph.copy(); ng.graphTransposed = this.graphTransposed.copy(); ng.path = path; ng.nodeIdMap = this.nodeIdMap; ng.nodeTypesMap = this.nodeTypesMap; return ng; } /** * Cleans up graph resources after use. */ public void cleanUp() throws IOException { nodeIdMap.close(); } /** * Returns the graph full path. * * @return graph full path */ public String getPath() { return path; } /** * Converts {@link SwhPID} node to long. * * @param swhPID node specified as a {@link SwhPID} * @return internal long node id * @see org.softwareheritage.graph.SwhPID */ public long getNodeId(SwhPID swhPID) { return nodeIdMap.getNodeId(swhPID); } /** * Converts long id node to {@link SwhPID}. * * @param nodeId node specified as a long id * @return external SWH PID * @see org.softwareheritage.graph.SwhPID */ public SwhPID getSwhPID(long nodeId) { return nodeIdMap.getSwhPID(nodeId); } /** * Returns node type. * * @param nodeId node specified as a long id * @return corresponding node type * @see org.softwareheritage.graph.Node.Type */ public Node.Type getNodeType(long nodeId) { return nodeTypesMap.getType(nodeId); } /** * Returns number of nodes in the graph. * * @return number of nodes in the graph */ public long getNbNodes() { return graph.numNodes(); } /** * Returns number of edges in the graph. * * @return number of edges in the graph */ public long getNbEdges() { return graph.numArcs(); } /** * Returns lazy iterator of successors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of successors of the node, specified as a WebGraph LazyLongIterator */ public LazyLongIterator successors(long nodeId) { return graph.successors(nodeId); } /** * Returns the outdegree of a node. * * @param nodeId node specified as a long id * @return outdegree of a node */ public long outdegree(long nodeId) { return graph.outdegree(nodeId); } /** * Returns lazy iterator of predecessors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of predecessors of the node, specified as a WebGraph LazyLongIterator */ public LazyLongIterator predecessors(long nodeId) { return graphTransposed.successors(nodeId); } /** * Returns the indegree of a node. * * @param nodeId node specified as a long id * @return indegree of a node */ public long indegree(long nodeId) { return graphTransposed.outdegree(nodeId); } /** * Returns the degree of a node, depending on graph orientation. * * @param nodeId node specified as a long id * @param useTransposed boolean value to use transposed graph * @return degree of a node */ public long degree(long nodeId, boolean useTransposed) { return (useTransposed) ? indegree(nodeId) : outdegree(nodeId); } /** * Returns the neighbors of a node (as a lazy iterator), depending on graph orientation. * * @param nodeId node specified as a long id * @param useTransposed boolean value to use transposed graph * @return lazy iterator of neighbors of the node, specified as a WebGraph LazyLongIterator */ public LazyLongIterator neighbors(long nodeId, boolean useTransposed) { return (useTransposed) ? predecessors(nodeId) : successors(nodeId); } /** * Returns the underlying BVGraph. * * @param useTransposed boolean value to use transposed graph * @return WebGraph BVGraph */ public BVGraph getBVGraph(boolean useTransposed) { return (useTransposed) ? this.graphTransposed : this.graph; } } diff --git a/java/server/src/main/java/org/softwareheritage/graph/Node.java b/java/server/src/main/java/org/softwareheritage/graph/Node.java index 04fd073..5c04ac1 100644 --- a/java/server/src/main/java/org/softwareheritage/graph/Node.java +++ b/java/server/src/main/java/org/softwareheritage/graph/Node.java @@ -1,93 +1,105 @@ package org.softwareheritage.graph; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * A node in the Software Heritage graph. * * @author The Software Heritage developers */ public class Node { /** * Software Heritage graph node types, as described in the * data model. */ public enum Type { /** Content node */ CNT, /** Directory node */ DIR, /** Origin node */ ORI, /** Release node */ REL, /** Revision node */ REV, /** Snapshot node */ SNP; /** * Converts integer to corresponding SWH node type. * * @param intType node type represented as an integer * @return the corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public static Node.Type fromInt(int intType) { switch (intType) { - case 0: - return CNT; - case 1: - return DIR; - case 2: - return ORI; - case 3: - return REL; - case 4: - return REV; - case 5: - return SNP; + case 0: return CNT; + case 1: return DIR; + case 2: return ORI; + case 3: return REL; + case 4: return REV; + case 5: return SNP; } return null; } + /** + * Converts node types to the corresponding int value + * + * @param type node type as an enum + * @return the corresponding int value + */ + public static int toInt(Node.Type type) { + switch (type) { + case CNT: return 0; + case DIR: return 1; + case ORI: return 2; + case REL: return 3; + case REV: return 4; + case SNP: return 5; + } + throw new IllegalArgumentException("Unknown node type: " + type); + } + /** * Converts string to corresponding SWH node type. * * @param strType node type represented as a string * @return the corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public static Node.Type fromStr(String strType) { if (!strType.matches("cnt|dir|ori|rel|rev|snp")) { throw new IllegalArgumentException("Unknown node type: " + strType); } return Node.Type.valueOf(strType.toUpperCase()); } /** * Parses SWH node type possible values from formatted string (see the API * syntax). * * @param strFmtType node types represented as a formatted string * @return a list containing the {@link Node.Type} values * @see org.softwareheritage.graph.Node.Type */ public static ArrayList parse(String strFmtType) { ArrayList types = new ArrayList<>(); if (strFmtType.equals("*")) { List nodeTypes = Arrays.asList(Node.Type.values()); types.addAll(nodeTypes); } else { types.add(Node.Type.fromStr(strFmtType)); } return types; } } } diff --git a/java/server/src/main/java/org/softwareheritage/graph/SwhPID.java b/java/server/src/main/java/org/softwareheritage/graph/SwhPID.java index 34f44e1..75c7aa8 100644 --- a/java/server/src/main/java/org/softwareheritage/graph/SwhPID.java +++ b/java/server/src/main/java/org/softwareheritage/graph/SwhPID.java @@ -1,98 +1,108 @@ package org.softwareheritage.graph; +import java.lang.System; + import com.fasterxml.jackson.annotation.JsonValue; +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.codec.DecoderException; import org.softwareheritage.graph.Node; /** * A Software Heritage PID, see persistent * identifier documentation. * * @author The Software Heritage developers */ public class SwhPID { /** Fixed hash length of the PID */ public static final int HASH_LENGTH = 40; /** Full PID as a string */ String swhPID; /** PID node type */ Node.Type type; - /** PID hex-encoded SHA1 hash */ - String hash; /** * Constructor. * * @param swhPID full PID as a string */ public SwhPID(String swhPID) { this.swhPID = swhPID; // PID format: 'swh:1:type:hash' String[] parts = swhPID.split(":"); if (parts.length != 4 || !parts[0].equals("swh") || !parts[1].equals("1")) { - throw new IllegalArgumentException( - "Expected SWH PID format to be 'swh:1:type:hash', got: " + swhPID); + throw new IllegalArgumentException("malformed SWH PID: " + swhPID); } - this.type = Node.Type.fromStr(parts[2]); - - this.hash = parts[3]; - if (!hash.matches("[0-9a-f]{" + HASH_LENGTH + "}")) { - throw new IllegalArgumentException("Wrong SWH PID hash format in: " + swhPID); + if (!parts[3].matches("[0-9a-f]{" + HASH_LENGTH + "}")) { + throw new IllegalArgumentException("malformed SWH PID: " + swhPID); } } @Override public boolean equals(Object otherObj) { if (otherObj == this) return true; if (!(otherObj instanceof SwhPID)) return false; SwhPID other = (SwhPID) otherObj; return swhPID.equals(other.getSwhPID()); } @Override public int hashCode() { return swhPID.hashCode(); } @Override public String toString() { return swhPID; } + /** Converts PID to a compact binary representation. + * + * The binary format is specified in the Python module + * swh.graph.pid:str_to_bytes . + */ + public byte[] toBytes() { + byte[] bytes = new byte[22]; + byte[] digest; + + bytes[0] = (byte) 1; // namespace version + bytes[1] = (byte) Node.Type.toInt(this.type); // PID type + try { + digest = Hex.decodeHex(this.swhPID.substring(10)); // SHA1 hash + System.arraycopy(digest, 0, bytes, 2, digest.length); + } catch (DecoderException e) { + throw new IllegalArgumentException("invalid hex sequence in PID: " + this.swhPID); + } + + return bytes; + } + /** * Returns full PID as a string. * * @return full PID string */ @JsonValue public String getSwhPID() { return swhPID; } /** * Returns PID node type. * * @return PID corresponding {@link Node.Type} * @see org.softwareheritage.graph.Node.Type */ public Node.Type getType() { return type; } - - /** - * Returns PID hex-encoded SHA1 hash. - * - * @return PID string hash - */ - public String getHash() { - return hash; - } } diff --git a/java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java b/java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java index 3a8d19a..35ca81f 100644 --- a/java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java +++ b/java/server/src/main/java/org/softwareheritage/graph/backend/Setup.java @@ -1,123 +1,124 @@ package org.softwareheritage.graph.backend; -import java.io.BufferedWriter; +import java.io.BufferedOutputStream; +import java.io.DataOutputStream; import java.io.FileInputStream; -import java.io.FileWriter; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.Writer; import java.util.zip.GZIPInputStream; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectBigArrays; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SwhPID; import org.softwareheritage.graph.backend.NodeTypesMap; /** * Pre-processing steps (such as dumping mapping files on disk) before running the graph service. * * @author The Software Heritage developers */ public class Setup { /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException { if (args.length != 2) { System.err.println("Usage: NODES_CSV_GZ COMPRESSED_GRAPH_BASE_NAME"); System.exit(1); } String nodesPath = args[0]; String graphPath = args[1]; System.out.println("Pre-computing node id maps..."); long startTime = System.nanoTime(); precomputeNodeIdMap(nodesPath, graphPath); long endTime = System.nanoTime(); double duration = (endTime - startTime) / 1_000_000_000; System.out.println("Done in: " + duration + " seconds"); } /** * Computes and dumps on disk mapping files. * * @param nodesPath path of the compressed csv nodes file * @param graphPath path of the compressed graph */ // Suppress warning for Object2LongFunction cast @SuppressWarnings("unchecked") static void precomputeNodeIdMap(String nodesPath, String graphPath) throws IOException { // First internal mapping: SWH PID (string) -> WebGraph MPH (long) Object2LongFunction mphMap = null; try { mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("The .mph file contains unknown class object: " + e); } long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); // Second internal mapping: WebGraph MPH (long) -> BFS ordering (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); if (loaded != nbIds) { throw new IllegalArgumentException("Graph contains " + nbIds + " nodes, but read " + loaded); } // Dump complete mapping for all nodes: SWH PID (string) <=> WebGraph node id (long) InputStream nodesStream = new GZIPInputStream(new FileInputStream(nodesPath)); FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(nodesStream, "UTF-8")); LineIterator swhPIDIterator = new LineIterator(buffer); - try (Writer swhToNodeMap = new BufferedWriter(new FileWriter(graphPath + Graph.PID_TO_NODE)); - Writer nodeToSwhMap = new BufferedWriter(new FileWriter(graphPath + Graph.NODE_TO_PID))) { - // nodeToSwhMap needs to write SWH PID in order of node id, so use a temporary array + // for the binary format of pidToNodeMap, see Python module swh.graph.pid:PidToIntMap + // for the binary format of nodeToPidMap, see Python module swh.graph.pid:IntToPidMap + try (DataOutputStream pidToNodeMap = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(graphPath + Graph.PID_TO_NODE))); + BufferedOutputStream nodeToPidMap = new BufferedOutputStream(new FileOutputStream(graphPath + Graph.NODE_TO_PID))) { + // nodeToPidMap needs to write SWH PID in order of node id, so use a temporary array Object[][] nodeToSwhPID = ObjectBigArrays.newBigArray(nbIds); // To effectively run edge restriction during graph traversals, we store node id (long) -> SWH // type map. This is represented as a bitmap using minimum number of bits per Node.Type. final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); final int nbBitsPerNodeType = log2NbTypes; LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); for (long iNode = 0; iNode < nbIds && swhPIDIterator.hasNext(); iNode++) { String strSwhPID = swhPIDIterator.next().toString(); + SwhPID swhPID = new SwhPID(strSwhPID); + byte[] swhPIDBin = swhPID.toBytes(); + long mphId = mphMap.getLong(strSwhPID); long nodeId = LongBigArrays.get(bfsMap, mphId); - String paddedNodeId = String.format("%0" + NodeIdMap.NODE_ID_LENGTH + "d", nodeId); - String line = strSwhPID + " " + paddedNodeId + "\n"; - swhToNodeMap.write(line); + pidToNodeMap.write(swhPIDBin, 0, swhPIDBin.length); + pidToNodeMap.writeLong(nodeId); - ObjectBigArrays.set(nodeToSwhPID, nodeId, strSwhPID); - - SwhPID swhPID = new SwhPID(strSwhPID); + ObjectBigArrays.set(nodeToSwhPID, nodeId, swhPIDBin); nodeTypesMap.set(nodeId, swhPID.getType().ordinal()); } BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); for (long iNode = 0; iNode < nbIds; iNode++) { - String line = ObjectBigArrays.get(nodeToSwhPID, iNode).toString() + "\n"; - nodeToSwhMap.write(line); + nodeToPidMap.write((byte[]) ObjectBigArrays.get(nodeToSwhPID, iNode)); } } } } diff --git a/swh/graph/tests/dataset/output/example.node2pid.bin b/swh/graph/tests/dataset/output/example.node2pid.bin index 7755043..c0e13f5 100644 Binary files a/swh/graph/tests/dataset/output/example.node2pid.bin and b/swh/graph/tests/dataset/output/example.node2pid.bin differ diff --git a/swh/graph/tests/dataset/output/example.pid2node.bin b/swh/graph/tests/dataset/output/example.pid2node.bin index 753264c..70fa0aa 100644 Binary files a/swh/graph/tests/dataset/output/example.pid2node.bin and b/swh/graph/tests/dataset/output/example.pid2node.bin differ