diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java index c266dbc..0c382f7 100644 --- a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java +++ b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java @@ -1,91 +1,98 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import java.util.ArrayList; /** * Edge restriction based on node types, used when visiting the graph. *
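 * <p>
 * For example (a sketch of the restriction format accepted by the constructor below),
 * {@code new AllowedEdges("rev:rev,rev:dir")} only allows arcs from revisions to revisions or to
 * directories, while {@code "*"} lifts the restriction entirely.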

* The Software Heritage * graph contains multiple node types (contents, directories, revisions, ...) and restricting * the traversal to specific node types is necessary for many querying operations (see the use cases * in the swh-graph API documentation). * * @author The Software Heritage developers */ public class AllowedEdges { /** * 2D boolean matrix storing access rights for all combinations of src/dst node types (first * dimension is source, second dimension is destination); when edge restriction is not enforced, this * array is set to null for early bypass. */ public boolean[][] restrictedTo; /** * Constructor. * * @param edgesFmt a formatted string describing allowed * edges */ public AllowedEdges(String edgesFmt) { int nbNodeTypes = Node.Type.values().length; this.restrictedTo = new boolean[nbNodeTypes][nbNodeTypes]; // Special values (null, empty, "*") if (edgesFmt == null || edgesFmt.isEmpty()) { return; } if (edgesFmt.equals("*")) { // Allows for quick bypass (with simple null check) when no edge restriction restrictedTo = null; return; } // Format: "src1:dst1,src2:dst2,[...]" String[] edgeTypes = edgesFmt.split(","); for (String edgeType : edgeTypes) { String[] nodeTypes = edgeType.split(":"); if (nodeTypes.length != 2) { throw new IllegalArgumentException("Cannot parse edge type: " + edgeType); } ArrayList<Node.Type> srcTypes = Node.Type.parse(nodeTypes[0]); ArrayList<Node.Type> dstTypes = Node.Type.parse(nodeTypes[1]); for (Node.Type srcType : srcTypes) { for (Node.Type dstType : dstTypes) { restrictedTo[srcType.ordinal()][dstType.ordinal()] = true; } } } } /** * Checks if a given edge can be followed during graph traversal. * * @param srcType edge source type * @param dstType edge destination type * @return true if allowed and false otherwise */ public boolean isAllowed(Node.Type srcType, Node.Type dstType) { if (restrictedTo == null) return true; return restrictedTo[srcType.ordinal()][dstType.ordinal()]; } /** * Return a new AllowedEdges instance with reversed edge restrictions, e.g. "src1:dst1,src2:dst2" * becomes "dst1:src1,dst2:src2". * * @return a new AllowedEdges instance with reversed edge restrictions */ public AllowedEdges reverse() { AllowedEdges reversed = new AllowedEdges(null); reversed.restrictedTo = new boolean[restrictedTo.length][restrictedTo[0].length]; for (int i = 0; i < restrictedTo.length; i++) { for (int j = 0; j < restrictedTo[0].length; j++) { reversed.restrictedTo[i][j] = restrictedTo[j][i]; } } return reversed; } } diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java b/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java index ddecfd4..b63edf2 100644 --- a/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java +++ b/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java @@ -1,50 +1,57 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; /** * Node type restriction, useful to implement filtering of returned nodes during traversal. * * @author The Software Heritage developers */ public class AllowedNodes { public boolean[] restrictedTo; /** * Constructor.
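 * <p>
 * A minimal usage sketch:
 * <pre>{@code
 * AllowedNodes nodes = new AllowedNodes("dir,cnt");
 * nodes.isAllowed(Node.Type.DIR); // true
 * nodes.isAllowed(Node.Type.REV); // false
 * }</pre>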
* * @param nodesFmt a formatted string describing allowed nodes */ public AllowedNodes(String nodesFmt) { int nbNodeTypes = Node.Type.values().length; this.restrictedTo = new boolean[nbNodeTypes]; // Special values (null, empty, "*") if (nodesFmt == null || nodesFmt.isEmpty()) { return; } if (nodesFmt.equals("*")) { // Allows for quick bypass (with simple null check) when no node restriction restrictedTo = null; return; } // Format: "nodeType1,nodeType2,[...]" String[] nodeTypesStr = nodesFmt.split(","); for (String nodeTypeStr : nodeTypesStr) { for (Node.Type nodeType : Node.Type.parse(nodeTypeStr)) { this.restrictedTo[Node.Type.toInt(nodeType)] = true; } } } /** * Checks if a given node type is allowed. * * @param nodeType node type to check * @return true if allowed and false otherwise */ public boolean isAllowed(Node.Type nodeType) { if (restrictedTo == null) return true; return restrictedTo[Node.Type.toInt(nodeType)]; } } diff --git a/java/src/main/java/org/softwareheritage/graph/Node.java b/java/src/main/java/org/softwareheritage/graph/Node.java index be3efde..9d46a76 100644 --- a/java/src/main/java/org/softwareheritage/graph/Node.java +++ b/java/src/main/java/org/softwareheritage/graph/Node.java @@ -1,139 +1,146 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import java.util.*; /** * A node in the Software Heritage graph. * * @author The Software Heritage developers */ public class Node { /** * Software Heritage graph node types, as described in the * data model. */ public enum Type { /** Content node */ CNT, /** Directory node */ DIR, /** Origin node */ ORI, /** Release node */ REL, /** Revision node */ REV, /** Snapshot node */ SNP; /** * Converts integer to corresponding SWH node type. * * @param intType node type represented as an integer * @return the corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public static Node.Type fromInt(int intType) { switch (intType) { case 0: return CNT; case 1: return DIR; case 2: return ORI; case 3: return REL; case 4: return REV; case 5: return SNP; } return null; } /** * Converts node types to the corresponding int value * * @param type node type as an enum * @return the corresponding int value */ public static int toInt(Node.Type type) { switch (type) { case CNT: return 0; case DIR: return 1; case ORI: return 2; case REL: return 3; case REV: return 4; case SNP: return 5; } throw new IllegalArgumentException("Unknown node type: " + type); } /** * Converts string to corresponding SWH node type. * * @param strType node type represented as a string * @return the corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public static Node.Type fromStr(String strType) { if (!strType.matches("cnt|dir|ori|rel|rev|snp")) { throw new IllegalArgumentException("Unknown node type: " + strType); } return Node.Type.valueOf(strType.toUpperCase()); } /** * Converts byte array name to the int code of the corresponding SWH node type. Used for * performance-critical deserialization. * * @param name node type represented as a byte array (e.g. 
b"cnt") * @return the ordinal value of the corresponding {@link Node.Type} * @see org.softwareheritage.graph.Node.Type */ public static int byteNameToInt(byte[] name) { if (Arrays.equals(name, "cnt".getBytes())) { return 0; } else if (Arrays.equals(name, "dir".getBytes())) { return 1; } else if (Arrays.equals(name, "ori".getBytes())) { return 2; } else if (Arrays.equals(name, "rel".getBytes())) { return 3; } else if (Arrays.equals(name, "rev".getBytes())) { return 4; } else if (Arrays.equals(name, "snp".getBytes())) { return 5; } else return -1; } /** * Parses SWH node type possible values from formatted string (see the * API syntax). * * @param strFmtType node types represented as a formatted string * @return a list containing the {@link Node.Type} values * @see org.softwareheritage.graph.Node.Type */ public static ArrayList parse(String strFmtType) { ArrayList types = new ArrayList<>(); if (strFmtType.equals("*")) { List nodeTypes = Arrays.asList(Node.Type.values()); types.addAll(nodeTypes); } else { types.add(Node.Type.fromStr(strFmtType)); } return types; } } } diff --git a/java/src/main/java/org/softwareheritage/graph/SWHID.java b/java/src/main/java/org/softwareheritage/graph/SWHID.java index 16aff83..18951fc 100644 --- a/java/src/main/java/org/softwareheritage/graph/SWHID.java +++ b/java/src/main/java/org/softwareheritage/graph/SWHID.java @@ -1,118 +1,125 @@ +/* + * Copyright (c) 2019 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import com.fasterxml.jackson.annotation.JsonValue; import org.apache.commons.codec.DecoderException; import org.apache.commons.codec.binary.Hex; /** * A Software Heritage persistent identifier (SWHID), see persistent * identifier documentation. * * @author The Software Heritage developers */ public class SWHID { /** Fixed hash length of the SWHID */ public static final int HASH_LENGTH = 40; /** Full SWHID as a string */ String swhid; /** SWHID node type */ Node.Type type; /** * Constructor. * * @param swhid full SWHID as a string */ public SWHID(String swhid) { this.swhid = swhid; // SWHID format: 'swh:1:type:hash' String[] parts = swhid.split(":"); if (parts.length != 4 || !parts[0].equals("swh") || !parts[1].equals("1")) { throw new IllegalArgumentException("malformed SWHID: " + swhid); } this.type = Node.Type.fromStr(parts[2]); if (!parts[3].matches("[0-9a-f]{" + HASH_LENGTH + "}")) { throw new IllegalArgumentException("malformed SWHID: " + swhid); } } /** * Creates a SWHID from a compact binary representation. *

* The binary format is specified in the Python module swh.graph.swhid:str_to_bytes . */ public static SWHID fromBytes(byte[] input) { byte[] digest = new byte[20]; System.arraycopy(input, 2, digest, 0, digest.length); String swhidStr = String.format("swh:%d:%s:%s", input[0], Node.Type.fromInt(input[1]).toString().toLowerCase(), Hex.encodeHexString(digest)); return new SWHID(swhidStr); } @Override public boolean equals(Object otherObj) { if (otherObj == this) return true; if (!(otherObj instanceof SWHID)) return false; SWHID other = (SWHID) otherObj; return swhid.equals(other.getSWHID()); } @Override public int hashCode() { return swhid.hashCode(); } @Override public String toString() { return swhid; } /** * Converts SWHID to a compact binary representation. *
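 * <p>
 * Layout sketch, as read from the implementation below (22 bytes total): byte 0 holds the SWHID
 * namespace version (1), byte 1 the integer code of the {@link Node.Type}, and bytes 2-21 the
 * 20-byte SHA1 digest decoded from the hash part of the SWHID string.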

* The binary format is specified in the Python module swh.graph.swhid:str_to_bytes . */ public byte[] toBytes() { byte[] bytes = new byte[22]; byte[] digest; bytes[0] = (byte) 1; // namespace version bytes[1] = (byte) Node.Type.toInt(this.type); // SWHID type try { digest = Hex.decodeHex(this.swhid.substring(10)); // SHA1 hash System.arraycopy(digest, 0, bytes, 2, digest.length); } catch (DecoderException e) { throw new IllegalArgumentException("invalid hex sequence in SWHID: " + this.swhid); } return bytes; } /** * Returns full SWHID as a string. * * @return full SWHID string */ @JsonValue public String getSWHID() { return swhid; } /** * Returns SWHID node type. * * @return SWHID corresponding {@link Node.Type} * @see org.softwareheritage.graph.Node.Type */ public Node.Type getType() { return type; } } diff --git a/java/src/main/java/org/softwareheritage/graph/Subgraph.java b/java/src/main/java/org/softwareheritage/graph/Subgraph.java index 53ef937..591279c 100644 --- a/java/src/main/java/org/softwareheritage/graph/Subgraph.java +++ b/java/src/main/java/org/softwareheritage/graph/Subgraph.java @@ -1,224 +1,231 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.NodeIterator; import java.util.NoSuchElementException; public class Subgraph extends ImmutableGraph { private final SwhBidirectionalGraph underlyingGraph; public final AllowedNodes allowedNodeTypes; private long nodeCount = -1; /** * Constructor. * */ public Subgraph(SwhBidirectionalGraph underlyingGraph, AllowedNodes allowedNodeTypes) { this.underlyingGraph = underlyingGraph.copy(); this.allowedNodeTypes = allowedNodeTypes; } /** * Return a flyweight copy of the graph. */ @Override public Subgraph copy() { return new Subgraph(this.underlyingGraph.copy(), allowedNodeTypes); } @Override public boolean randomAccess() { return underlyingGraph.randomAccess(); } /** * Return a transposed version of the graph. */ public Subgraph transpose() { return new Subgraph(underlyingGraph.transpose(), allowedNodeTypes); } /** * Return a symmetric version of the graph. */ public Subgraph symmetrize() { return new Subgraph(underlyingGraph.symmetrize(), allowedNodeTypes); } /** * Returns number of nodes in the graph. * * @return number of nodes in the graph */ @Override public long numNodes() { if (nodeCount == -1) { for (long i = 0; i < underlyingGraph.numNodes(); ++i) { if (nodeExists(i)) ++nodeCount; } } return nodeCount; } /** * Returns number of edges in the graph. * * @return number of edges in the graph */ @Override public long numArcs() { throw new UnsupportedOperationException("Cannot determine the number of arcs in a subgraph"); } public long maxNodeNumber() { return underlyingGraph.numNodes(); } public boolean nodeExists(long node) { return allowedNodeTypes.isAllowed(underlyingGraph.getNodeType(node)); } /** * Returns lazy iterator of successors of a node. 
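 * <p>
 * A minimal traversal sketch (variable names illustrative); only successors whose type is allowed
 * by the {@link AllowedNodes} filter are returned:
 * <pre>{@code
 * LazyLongIterator it = subgraph.successors(nodeId);
 * for (long succ; (succ = it.nextLong()) != -1;) {
 *     // each succ satisfies subgraph.nodeExists(succ)
 * }
 * }</pre>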
* * @param nodeId node specified as a long id * @return lazy iterator of successors of the node, specified as a * WebGraph LazyLongIterator */ @Override public LazyLongIterator successors(long nodeId) { if (!nodeExists(nodeId)) { throw new IllegalArgumentException("Node " + nodeId + " not in subgraph"); } LazyLongIterator allSuccessors = underlyingGraph.successors(nodeId); return new LazyLongIterator() { @Override public long nextLong() { long neighbor; while ((neighbor = allSuccessors.nextLong()) != -1) { if (nodeExists(neighbor)) { return neighbor; } } return -1; } @Override public long skip(final long n) { long i; for (i = 0; i < n && nextLong() != -1; i++) ; return i; } }; } /** * Returns the outdegree of a node. * * @param nodeId node specified as a long id * @return outdegree of a node */ @Override public long outdegree(long nodeId) { long deg = 0; for (LazyLongIterator allSuccessors = successors(nodeId); allSuccessors.nextLong() != -1; ++deg) ; return deg; } @Override public NodeIterator nodeIterator() { return new NodeIterator() { final long n = numNodes(); long i = -1; long done = 0; @Override public boolean hasNext() { return done <= n; } @Override public long nextLong() { if (!hasNext()) throw new NoSuchElementException(); do { ++i; if (i >= underlyingGraph.numNodes()) throw new NoSuchElementException(); } while (!nodeExists(i)); ++done; return i; } @Override public long outdegree() { return Subgraph.this.outdegree(i); } @Override public LazyLongIterator successors() { return Subgraph.this.successors(i); } }; } /** * Returns lazy iterator of predecessors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of predecessors of the node, specified as a * WebGraph LazyLongIterator */ public LazyLongIterator predecessors(long nodeId) { return this.transpose().successors(nodeId); } /** * Returns the indegree of a node. * * @param nodeId node specified as a long id * @return indegree of a node */ public long indegree(long nodeId) { return this.transpose().outdegree(nodeId); } /** * Converts {@link SWHID} node to long. * * @param swhid node specified as a {@link SWHID} * @return internal long node id * @see SWHID */ public long getNodeId(SWHID swhid) { return underlyingGraph.getNodeId(swhid); } /** * Converts long id node to {@link SWHID}. * * @param nodeId node specified as a long id * @return external SWHID * @see SWHID */ public SWHID getSWHID(long nodeId) { return underlyingGraph.getSWHID(nodeId); } /** * Returns node type. 
* * @param nodeId node specified as a long id * @return corresponding node type * @see Node.Type */ public Node.Type getNodeType(long nodeId) { return underlyingGraph.getNodeType(nodeId); } } diff --git a/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java index 446dd65..04b2a8c 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java @@ -1,180 +1,187 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import it.unimi.dsi.big.webgraph.BidirectionalImmutableGraph; import it.unimi.dsi.logging.ProgressLogger; import java.io.IOException; import java.io.InputStream; /** * Class representing the compressed Software Heritage graph in both directions (forward and * backward). * * This class uses the {@link BidirectionalImmutableGraph} class internally to implement the * backward equivalent of graph operations ({@link SwhBidirectionalGraph#indegree(long)}, * {@link SwhBidirectionalGraph#predecessors(long)}, etc.) by holding a reference to two * {@link SwhUnidirectionalGraph} (a forward graph and a backward graph). * * Both graphs share their graph properties in memory by storing references to the same * {@link SwhGraphProperties} object. * *

  *                 ┌──────────────┐
  *                 │ImmutableGraph◄────────┐
  *                 └────▲─────────┘        │extends
  *                      │                  │
  *                      │       ┌──────────┴────────────────┐
  *               extends│       │BidirectionalImmutableGraph│
  *                      │       └────────────▲──────────────┘
  *                      │                    │extends
  *       ┌──────────────┴───────┐     ┌──────┴──────────────┐
  *       │SwhUnidirectionalGraph│◄────┤SwhBidirectionalGraph│
  *       └──┬──────────────┬────┘     └────────┬───────────┬┘
  *          │              │    contains x2    │           │
  *          │              │                   │           │
  *          │    implements│                   │implements │
  *          │             ┌▼──────────┐        │           │
  *          │             │SwhGraph(I)◄────────┘           │
  * contains │             └───────────┘                    │contains
  *          │                                              │
  *          │            ┌──────────────────┐              │
  *          └────────────►SwhGraphProperties◄──────────────┘
  *                       └──────────────────┘
  * 
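 *
 * <p>
 * A hypothetical loading sketch (basename and node id illustrative):
 * <pre>{@code
 * SwhBidirectionalGraph g = SwhBidirectionalGraph.loadMapped("/path/to/graph/basename");
 * long out = g.outdegree(node);  // forward direction
 * long in = g.indegree(node);    // backward direction, served by the transposed graph
 * }</pre>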
* * @author The Software Heritage developers * @see SwhUnidirectionalGraph */ public class SwhBidirectionalGraph extends BidirectionalImmutableGraph implements SwhGraph { /** Property data of the graph (id/type mappings etc.) */ public final SwhGraphProperties properties; private final SwhUnidirectionalGraph forwardGraph; private final SwhUnidirectionalGraph backwardGraph; public SwhBidirectionalGraph(SwhUnidirectionalGraph forwardGraph, SwhUnidirectionalGraph backwardGraph, SwhGraphProperties properties) { super(forwardGraph, backwardGraph); this.forwardGraph = forwardGraph; this.backwardGraph = backwardGraph; this.properties = properties; } private SwhBidirectionalGraph(BidirectionalImmutableGraph graph, SwhGraphProperties properties) { super(graph.forward, graph.backward); this.forwardGraph = new SwhUnidirectionalGraph(graph.forward, properties); this.backwardGraph = new SwhUnidirectionalGraph(graph.backward, properties); this.properties = properties; } public static SwhBidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadGraphOnly(method, path, is, pl); SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadGraphOnly(method, path + "-transposed", is, pl); SwhGraphProperties properties = SwhGraphProperties.load(path); forward.setProperties(properties); backward.setProperties(properties); return new SwhBidirectionalGraph(forward, backward, properties); } public static SwhBidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path, is, pl); SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path + "-transposed", is, pl); SwhGraphProperties properties = SwhGraphProperties.load(path); forward.setProperties(properties); backward.setProperties(properties); return new SwhBidirectionalGraph(forward, backward, properties); } // loadXXX methods from ImmutableGraph public static SwhBidirectionalGraph load(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.STANDARD, path, null, pl); } public static SwhBidirectionalGraph load(String path) throws IOException { return load(LoadMethod.STANDARD, path, null, null); } public static SwhBidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.MAPPED, path, null, pl); } public static SwhBidirectionalGraph loadMapped(String path) throws IOException { return load(LoadMethod.MAPPED, path, null, null); } public static SwhBidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.OFFLINE, path, null, pl); } public static SwhBidirectionalGraph loadOffline(String path) throws IOException { return load(LoadMethod.OFFLINE, path, null, null); } // Labelled versions of the loadXXX methods from ImmutableGraph public static SwhBidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.STANDARD, path, null, pl); } public static SwhBidirectionalGraph loadLabelled(String path) throws IOException { return loadLabelled(LoadMethod.STANDARD, path, null, null); } public static SwhBidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.MAPPED, path, null, pl); } public static SwhBidirectionalGraph loadLabelledMapped(String path) 
throws IOException { return loadLabelled(LoadMethod.MAPPED, path, null, null); } public static SwhBidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.OFFLINE, path, null, pl); } public static SwhBidirectionalGraph loadLabelledOffline(String path) throws IOException { return loadLabelled(LoadMethod.OFFLINE, path, null, null); } @Override public SwhBidirectionalGraph copy() { return new SwhBidirectionalGraph(forwardGraph.copy(), backwardGraph.copy(), this.properties); } @Override public SwhBidirectionalGraph transpose() { return new SwhBidirectionalGraph(super.transpose(), this.properties); } @Override public SwhBidirectionalGraph symmetrize() { return new SwhBidirectionalGraph(super.symmetrize(), this.properties); } public SwhUnidirectionalGraph getForwardGraph() { return this.forwardGraph; } public SwhUnidirectionalGraph getBackwardGraph() { return this.backwardGraph; } /** * Returns a *labelled* lazy iterator over the successors of a given node. The iteration terminates * when -1 is returned. */ public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) { return forwardGraph.labelledSuccessors(x); } /** * Returns a *labelled* lazy iterator over the predecessors of a given node. The iteration * terminates when -1 is returned. */ public ArcLabelledNodeIterator.LabelledArcIterator labelledPredecessors(long x) { return backwardGraph.labelledSuccessors(x); } public void close() throws IOException { this.properties.close(); } @Override public SwhGraphProperties getProperties() { return properties; } } diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java index aa92536..432de35 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java @@ -1,144 +1,151 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import java.io.IOException; /** * Common interface for SWH graph classes. * * This interface forwards all property loading/access methods to the SwhGraphProperties object * returned by the getProperties() method of the implementing class. This allows API users to write * graph.getNodeType() instead of graph.getProperties().getNodeType(). */ public interface SwhGraph { /** * Cleans up graph resources after use. */ void close() throws IOException; /** * Returns the SWH graph properties object of this graph. 
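 * <p>
 * A hypothetical sketch of the load-then-access pattern shared by the property helpers below:
 * <pre>{@code
 * graph.loadMessages();                  // map the property file first
 * byte[] msg = graph.getMessage(nodeId); // then read per-node values
 * }</pre>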
* * @return graph properties */ SwhGraphProperties getProperties(); /** @see SwhGraphProperties#getPath() */ default String getPath() { return getProperties().getPath(); } /** @see SwhGraphProperties#getNodeId(SWHID) */ default long getNodeId(SWHID swhid) { return getProperties().getNodeId(swhid); } /** @see SwhGraphProperties#getSWHID(long) */ default SWHID getSWHID(long nodeId) { return getProperties().getSWHID(nodeId); } /** @see SwhGraphProperties#getNodeType(long) */ default Node.Type getNodeType(long nodeId) { return getProperties().getNodeType(nodeId); } /** @see SwhGraphProperties#loadContentLength() */ default void loadContentLength() throws IOException { getProperties().loadContentLength(); } /** @see SwhGraphProperties#getContentLength(long) */ default Long getContentLength(long nodeId) { return getProperties().getContentLength(nodeId); } /** @see SwhGraphProperties#loadPersonIds() */ default void loadPersonIds() throws IOException { getProperties().loadPersonIds(); } /** @see SwhGraphProperties#getAuthorId(long) */ default Long getAuthorId(long nodeId) { return getProperties().getAuthorId(nodeId); } /** @see SwhGraphProperties#getCommitterId(long) */ default Long getCommitterId(long nodeId) { return getProperties().getCommitterId(nodeId); } /** @see SwhGraphProperties#loadContentIsSkipped() */ default void loadContentIsSkipped() throws IOException { getProperties().loadContentIsSkipped(); } /** @see SwhGraphProperties#isContentSkipped(long) */ default boolean isContentSkipped(long nodeId) { return getProperties().isContentSkipped(nodeId); } /** @see SwhGraphProperties#loadAuthorTimestamps() */ default void loadAuthorTimestamps() throws IOException { getProperties().loadAuthorTimestamps(); } /** @see SwhGraphProperties#getAuthorTimestamp(long) */ default Long getAuthorTimestamp(long nodeId) { return getProperties().getAuthorTimestamp(nodeId); } /** @see SwhGraphProperties#getAuthorTimestampOffset(long) */ default Short getAuthorTimestampOffset(long nodeId) { return getProperties().getAuthorTimestampOffset(nodeId); } /** @see SwhGraphProperties#loadCommitterTimestamps() */ default void loadCommitterTimestamps() throws IOException { getProperties().loadCommitterTimestamps(); } /** @see SwhGraphProperties#getCommitterTimestamp(long) */ default Long getCommitterTimestamp(long nodeId) { return getProperties().getCommitterTimestamp(nodeId); } /** @see SwhGraphProperties#getCommitterTimestampOffset(long) */ default Short getCommitterTimestampOffset(long nodeId) { return getProperties().getCommitterTimestampOffset(nodeId); } /** @see SwhGraphProperties#loadMessages() */ default void loadMessages() throws IOException { getProperties().loadMessages(); } /** @see SwhGraphProperties#getMessage(long) */ default byte[] getMessage(long nodeId) { return getProperties().getMessage(nodeId); } /** @see SwhGraphProperties#getUrl(long) */ default String getUrl(long nodeId) { return getProperties().getUrl(nodeId); } /** @see SwhGraphProperties#loadTagNames() */ default void loadTagNames() throws IOException { getProperties().loadTagNames(); } /** @see SwhGraphProperties#getTagName(long) */ default byte[] getTagName(long nodeId) { return getProperties().getTagName(nodeId); } /** @see SwhGraphProperties#loadLabelNames() */ default void loadLabelNames() throws IOException { getProperties().loadLabelNames(); } /** @see SwhGraphProperties#getLabelName(long) */ default byte[] getLabelName(long labelId) { return getProperties().getLabelName(labelId); } } diff --git 
a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java index 637daee..9de9762 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java @@ -1,323 +1,330 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import it.unimi.dsi.big.util.MappedFrontCodedStringBigList; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.bytes.ByteBigList; import it.unimi.dsi.fastutil.bytes.ByteMappedBigList; import it.unimi.dsi.fastutil.ints.IntBigList; import it.unimi.dsi.fastutil.ints.IntMappedBigList; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.longs.LongMappedBigList; import it.unimi.dsi.fastutil.shorts.ShortBigList; import it.unimi.dsi.fastutil.shorts.ShortMappedBigList; import it.unimi.dsi.sux4j.util.EliasFanoLongBigList; import org.apache.commons.configuration2.ex.ConfigurationException; import org.softwareheritage.graph.maps.NodeIdMap; import org.softwareheritage.graph.maps.NodeTypesMap; import java.io.IOException; import java.io.RandomAccessFile; import java.util.Base64; /** * This objects contains SWH graph properties such as node labels. * * Some property mappings are necessary because Software Heritage uses string based persistent * identifiers (SWHID) while WebGraph uses integers internally. * * The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph * using SWHID) and the output (convert back to SWHID for users results). * * Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a * long id → node type map is stored as well to avoid a full SWHID lookup. * * @see NodeIdMap * @see NodeTypesMap */ public class SwhGraphProperties { private final String path; private final NodeIdMap nodeIdMap; private final NodeTypesMap nodeTypesMap; private LongBigList authorTimestamp; private ShortBigList authorTimestampOffset; private LongBigList committerTimestamp; private ShortBigList committerTimestampOffset; private LongBigList contentLength; private LongArrayBitVector contentIsSkipped; private IntBigList authorId; private IntBigList committerId; private ByteBigList messageBuffer; private LongBigList messageOffsets; private ByteBigList tagNameBuffer; private LongBigList tagNameOffsets; private MappedFrontCodedStringBigList edgeLabelNames; protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) { this.path = path; this.nodeIdMap = nodeIdMap; this.nodeTypesMap = nodeTypesMap; } public static SwhGraphProperties load(String path) throws IOException { return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path)); } /** * Cleans up resources after use. */ public void close() throws IOException { edgeLabelNames.close(); } /** Return the basename of the compressed graph */ public String getPath() { return path; } /** * Converts {@link SWHID} node to long. 
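 * <p>
 * Round-trip sketch (the SWHID below is the usual documentation example):
 * <pre>{@code
 * long id = props.getNodeId(new SWHID("swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2"));
 * SWHID back = props.getSWHID(id); // inverse mapping
 * }</pre>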
* * @param swhid node specified as a {@link SWHID} * @return internal long node id * @see SWHID */ public long getNodeId(SWHID swhid) { return nodeIdMap.getNodeId(swhid); } /** * Converts long id node to {@link SWHID}. * * @param nodeId node specified as a long id * @return external SWHID * @see SWHID */ public SWHID getSWHID(long nodeId) { return nodeIdMap.getSWHID(nodeId); } /** * Returns node type. * * @param nodeId node specified as a long id * @return corresponding node type * @see Node.Type */ public Node.Type getNodeType(long nodeId) { return nodeTypesMap.getType(nodeId); } private static LongBigList loadMappedLongs(String path) throws IOException { try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { return LongMappedBigList.map(raf.getChannel()); } } private static IntBigList loadMappedInts(String path) throws IOException { try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { return IntMappedBigList.map(raf.getChannel()); } } private static ShortBigList loadMappedShorts(String path) throws IOException { try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { return ShortMappedBigList.map(raf.getChannel()); } } private static ByteBigList loadMappedBytes(String path) throws IOException { try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { return ByteMappedBigList.map(raf.getChannel()); } } private static LongBigList loadEFLongs(String path) throws IOException { try { return (EliasFanoLongBigList) BinIO.loadObject(path); } catch (ClassNotFoundException e) { throw new IOException(e); } } private static byte[] getLine(ByteBigList byteArray, long start) { long end = start; while (end < byteArray.size64() && byteArray.getByte(end) != '\n') { end++; } int length = (int) (end - start); byte[] buffer = new byte[length]; byteArray.getElements(start, buffer, 0, length); return buffer; } /** Load the sizes of the content nodes */ public void loadContentLength() throws IOException { contentLength = loadMappedLongs(path + ".property.content.length.bin"); } /** Get the size (in bytes) of the given content node */ public Long getContentLength(long nodeId) { if (contentLength == null) { throw new IllegalStateException("Content lengths not loaded"); } long res = contentLength.getLong(nodeId); return (res >= 0) ? res : null; } /** Load the IDs of the persons (authors and committers) */ public void loadPersonIds() throws IOException { authorId = loadMappedInts(path + ".property.author_id.bin"); committerId = loadMappedInts(path + ".property.committer_id.bin"); } /** Get a unique integer ID representing the author of the given revision or release node */ public Long getAuthorId(long nodeId) { if (authorId == null) { throw new IllegalStateException("Author IDs not loaded"); } long res = authorId.getInt(nodeId); return (res >= 0) ? res : null; } /** Get a unique integer ID representing the committer of the given revision node */ public Long getCommitterId(long nodeId) { if (committerId == null) { throw new IllegalStateException("Committer IDs not loaded"); } long res = committerId.getInt(nodeId); return (res >= 0) ? 
res : null; } /** * Loads a boolean array indicating whether the given content node was skipped during archive * ingestion */ public void loadContentIsSkipped() throws IOException { try { contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin"); } catch (ClassNotFoundException e) { throw new IOException(e); } } /** Returns whether the given content node was skipped during archive ingestion */ public boolean isContentSkipped(long nodeId) { if (contentIsSkipped == null) { throw new IllegalStateException("Skipped content array not loaded"); } return contentIsSkipped.getBoolean(nodeId); } /** Load the timestamps at which the releases and revisions were authored */ public void loadAuthorTimestamps() throws IOException { authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin"); authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin"); } /** Return the timestamp at which the given revision or release was authored */ public Long getAuthorTimestamp(long nodeId) { if (authorTimestamp == null) { throw new IllegalStateException("Author timestamps not loaded"); } long res = authorTimestamp.getLong(nodeId); return (res > Long.MIN_VALUE) ? res : null; } /** Return the timestamp offset at which the given revision or release was authored */ public Short getAuthorTimestampOffset(long nodeId) { if (authorTimestampOffset == null) { throw new IllegalStateException("Author timestamp offsets not loaded"); } short res = authorTimestampOffset.getShort(nodeId); return (res > Short.MIN_VALUE) ? res : null; } /** Load the timestamps at which the releases and revisions were committed */ public void loadCommitterTimestamps() throws IOException { committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin"); committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin"); } /** Return the timestamp at which the given revision was committed */ public Long getCommitterTimestamp(long nodeId) { if (committerTimestamp == null) { throw new IllegalStateException("Committer timestamps not loaded"); } long res = committerTimestamp.getLong(nodeId); return (res > Long.MIN_VALUE) ? res : null; } /** Return the timestamp offset at which the given revision was committed */ public Short getCommitterTimestampOffset(long nodeId) { if (committerTimestampOffset == null) { throw new IllegalStateException("Committer timestamp offsets not loaded"); } short res = committerTimestampOffset.getShort(nodeId); return (res > Short.MIN_VALUE) ? res : null; } /** Load the revision messages, the release messages and the origin URLs */ public void loadMessages() throws IOException { messageBuffer = loadMappedBytes(path + ".property.message.bin"); messageOffsets = loadMappedLongs(path + ".property.message.offset.bin"); } /** Get the message of the given revision or release node */ public byte[] getMessage(long nodeId) { if (messageBuffer == null || messageOffsets == null) { throw new IllegalStateException("Messages not loaded"); } long startOffset = messageOffsets.getLong(nodeId); if (startOffset == -1) { return null; } return Base64.getDecoder().decode(getLine(messageBuffer, startOffset)); } /** Get the URL of the given origin node */ public String getUrl(long nodeId) { byte[] url = getMessage(nodeId); return (url != null) ? 
new String(url) : null; } /** Load the release names */ public void loadTagNames() throws IOException { tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin"); tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin"); } /** Get the name of the given release node */ public byte[] getTagName(long nodeId) { if (tagNameBuffer == null || tagNameOffsets == null) { throw new IllegalStateException("Tag names not loaded"); } long startOffset = tagNameOffsets.getLong(nodeId); if (startOffset == -1) { return null; } return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset)); } /** Load the arc label names (directory entry names and snapshot branch names) */ public void loadLabelNames() throws IOException { try { edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl"); } catch (ConfigurationException e) { throw new IOException(e); } } /** * Get the arc label name (either a directory entry name or snapshot branch name) associated with * the given label ID */ public byte[] getLabelName(long labelId) { if (edgeLabelNames == null) { throw new IllegalStateException("Label names not loaded"); } return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId)); } } diff --git a/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java index 40610c1..3f865d0 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java @@ -1,223 +1,230 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import java.io.IOException; import java.io.InputStream; /** * Class representing the compressed Software Heritage graph in a single direction. *

* The compressed graph is stored using the WebGraph * framework. This class contains an {@link ImmutableGraph} representing the graph itself, as well * as a reference to the object containing the graph properties (e.g. node labels). Optionally, * arc labels (properties stored on the graph edges) can also be loaded with the * loadLabelled...() function family. * * @author The Software Heritage developers * @see SwhGraphProperties * @see SwhUnidirectionalGraph */ public class SwhUnidirectionalGraph extends ImmutableGraph implements SwhGraph { /** Underlying ImmutableGraph */ private final ImmutableGraph graph; /** Labelled ImmutableGraph, null if labels are not loaded */ private ArcLabelledImmutableGraph labelledGraph; /** Property data of the graph (id/type mappings etc.) */ public SwhGraphProperties properties; public SwhUnidirectionalGraph(ImmutableGraph graph, SwhGraphProperties properties) { this.graph = graph; this.properties = properties; } protected SwhUnidirectionalGraph(ImmutableGraph graph, ArcLabelledImmutableGraph labelledGraph, SwhGraphProperties properties) { this.graph = graph; this.labelledGraph = labelledGraph; this.properties = properties; } /** * Load the (unlabelled) graph only, without the SWH properties. */ public static SwhUnidirectionalGraph loadGraphOnly(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { return new SwhUnidirectionalGraph(ImmutableGraph.load(method, path, is, pl), null); } /** * Load the labelled graph only, without the SWH properties. */ public static SwhUnidirectionalGraph loadLabelledGraphOnly(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { BitStreamArcLabelledImmutableGraph g = (BitStreamArcLabelledImmutableGraph) BitStreamArcLabelledImmutableGraph .load(method, path + "-labelled", is, pl); return new SwhUnidirectionalGraph(g.g, g, null); } /** * Load the SWH properties of the graph from a given path. */ public void loadProperties(String path) throws IOException { properties = SwhGraphProperties.load(path); } /** * Setter for the SWH graph properties. * * @param properties The {@link SwhGraphProperties} object containing the graph properties */ public void setProperties(SwhGraphProperties properties) { this.properties = properties; } /** * Load the unlabelled graph and its SWH properties. */ public static SwhUnidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { SwhUnidirectionalGraph g = loadGraphOnly(method, path, is, pl); g.loadProperties(path); return g; } /** * Load the labelled graph and its SWH properties. 
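 * <p>
 * A hypothetical sketch (basename and node id illustrative):
 * <pre>{@code
 * SwhUnidirectionalGraph g = SwhUnidirectionalGraph.loadLabelledMapped("/path/to/graph/basename");
 * ArcLabelledNodeIterator.LabelledArcIterator it = g.labelledSuccessors(node);
 * }</pre>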
*/ public static SwhUnidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { SwhUnidirectionalGraph g = loadLabelledGraphOnly(method, path, is, pl); g.loadProperties(path); return g; } // loadXXX methods of ImmutableGraph public static SwhUnidirectionalGraph load(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.STANDARD, path, null, pl); } public static SwhUnidirectionalGraph load(String path) throws IOException { return load(LoadMethod.STANDARD, path, null, null); } public static SwhUnidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.MAPPED, path, null, pl); } public static SwhUnidirectionalGraph loadMapped(String path) throws IOException { return load(LoadMethod.MAPPED, path, null, null); } public static SwhUnidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.OFFLINE, path, null, pl); } public static SwhUnidirectionalGraph loadOffline(String path) throws IOException { return load(LoadMethod.OFFLINE, path, null, null); } // Labelled versions of the loadXXX methods from ImmutableGraph public static SwhUnidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.STANDARD, path, null, pl); } public static SwhUnidirectionalGraph loadLabelled(String path) throws IOException { return loadLabelled(LoadMethod.STANDARD, path, null, null); } public static SwhUnidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.MAPPED, path, null, pl); } public static SwhUnidirectionalGraph loadLabelledMapped(String path) throws IOException { return loadLabelled(LoadMethod.MAPPED, path, null, null); } public static SwhUnidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.OFFLINE, path, null, pl); } public static SwhUnidirectionalGraph loadLabelledOffline(String path) throws IOException { return loadLabelled(LoadMethod.OFFLINE, path, null, null); } @Override public SwhUnidirectionalGraph copy() { return new SwhUnidirectionalGraph(this.graph.copy(), this.labelledGraph != null ? this.labelledGraph.copy() : null, this.properties); } @Override public boolean randomAccess() { return graph.randomAccess(); } public void close() throws IOException { this.properties.close(); } @Override public long numNodes() { return graph.numNodes(); } @Override public long numArcs() { return graph.numArcs(); } @Override public LazyLongIterator successors(long nodeId) { return graph.successors(nodeId); } /** * Returns a labelled node iterator for scanning the graph sequentially, starting from the * first node. */ public ArcLabelledNodeIterator labelledNodeIterator() { if (labelledGraph == null) { throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded."); } return labelledGraph.nodeIterator(); } /** * Returns a labelled node iterator for scanning the graph sequentially, starting from a * given node. */ public ArcLabelledNodeIterator labelledNodeIterator(long from) { if (labelledGraph == null) { throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded."); } return labelledGraph.nodeIterator(from); } /** * Returns a labelled lazy iterator over the successors of a given node. The iteration * terminates when -1 is returned. 
*/ public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) { if (labelledGraph == null) { throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded."); } return labelledGraph.successors(x); } @Override public long outdegree(long nodeId) { return graph.outdegree(nodeId); } @Override public SwhGraphProperties getProperties() { return properties; } public ImmutableGraph underlyingGraph() { return graph; } public ArcLabelledImmutableGraph underlyingLabelledGraph() { return labelledGraph; } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java index ee71713..1f12744 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java @@ -1,185 +1,192 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import com.github.luben.zstd.ZstdInputStream; import it.unimi.dsi.fastutil.bytes.ByteArrays; import it.unimi.dsi.fastutil.io.FastBufferedInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Arrays; /** * A graph dataset in (zstd-compressed) CSV format. * * This format does not contain any properties apart from the SWHIDs of the nodes, and optionally * the labels of the edges and the permissions of the directory entries. * * The structure of the dataset is as follows: one directory per object type, each containing: * *
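 * <ul>
 * <li>a number of {@code *.nodes.csv.zst} files, one node SWHID per line;</li>
 * <li>a number of {@code *.edges.csv.zst} files, one edge per line, formatted as
 * {@code SRC_SWHID DST_SWHID [LABEL [PERMISSION]]}, where the label and the integer permission
 * are optional (file layout reconstructed from the reader implementation below).</li>
 * </ul>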

* */ public class CSVEdgeDataset implements GraphDataset { final static Logger logger = LoggerFactory.getLogger(CSVEdgeDataset.class); final private File datasetDir; public CSVEdgeDataset(String datasetPath) { this(new File(datasetPath)); } public CSVEdgeDataset(File datasetDir) { if (!datasetDir.exists()) { throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist"); } this.datasetDir = datasetDir; } public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { File[] allTables = datasetDir.listFiles(); if (allTables == null) { return; } for (File tableFile : allTables) { File[] allCsvFiles = tableFile.listFiles(); if (allCsvFiles == null) { continue; } for (File csvFile : allCsvFiles) { if (csvFile.getName().endsWith(".edges.csv.zst")) { readEdgesCsvZst(csvFile.getPath(), edgeCb); } else if (csvFile.getName().endsWith(".nodes.csv.zst")) { readNodesCsvZst(csvFile.getPath(), nodeCb); } } } } public static void readEdgesCsvZst(String csvZstPath, GraphDataset.EdgeCallback cb) throws IOException { InputStream csvInputStream = new ZstdInputStream(new BufferedInputStream(new FileInputStream(csvZstPath))); readEdgesCsv(csvInputStream, cb); } public static void readEdgesCsv(InputStream csvInputStream, GraphDataset.EdgeCallback cb) throws IOException { FastBufferedInputStream csvReader = new FastBufferedInputStream(csvInputStream); Charset charset = StandardCharsets.US_ASCII; byte[] array = new byte[1024]; for (long line = 0;; line++) { int start = 0, len; while ((len = csvReader.readLine(array, start, array.length - start, FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) { start += len; array = ByteArrays.grow(array, array.length + 1); } if (len == -1) break; // EOF final int lineLength = start + len; // Skip whitespace at the start of the line. int offset = 0; while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; if (offset == lineLength) { continue; } if (array[0] == '#') continue; // Scan source id. start = offset; while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; final byte[] ss = Arrays.copyOfRange(array, start, offset); // Skip whitespace between identifiers. while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; if (offset == lineLength) { logger.error("Error at line " + line + ": no target"); continue; } // Scan target ID start = offset; while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; final byte[] ts = Arrays.copyOfRange(array, start, offset); // Skip whitespace between identifiers. while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; // Scan label byte[] ls = null; if (offset < lineLength) { start = offset; while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; ls = Arrays.copyOfRange(array, start, offset); } // Skip whitespace between identifiers. 
while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; // Scan permission int permission = 0; if (offset < lineLength) { start = offset; while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; permission = Integer.parseInt(new String(array, start, offset - start, charset)); } cb.onEdge(ss, ts, ls, permission); } } public static void readNodesCsvZst(String csvZstPath, GraphDataset.NodeCallback cb) throws IOException { InputStream csvInputStream = new ZstdInputStream(new BufferedInputStream(new FileInputStream(csvZstPath))); readNodesCsv(csvInputStream, cb); } public static void readNodesCsv(InputStream csvInputStream, GraphDataset.NodeCallback cb) throws IOException { FastBufferedInputStream csvReader = new FastBufferedInputStream(csvInputStream); byte[] array = new byte[1024]; for (long line = 0;; line++) { int start = 0, len; while ((len = csvReader.readLine(array, start, array.length - start, FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) { start += len; array = ByteArrays.grow(array, array.length + 1); } if (len == -1) break; // EOF final int lineLength = start + len; // Skip whitespace at the start of the line. int offset = 0; while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; if (offset == lineLength) { continue; } if (array[0] == '#') continue; // Scan source id. start = offset; while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; final byte[] ss = Arrays.copyOfRange(array, start, offset); cb.onNode(ss); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java b/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java index ef13166..62d3460 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java @@ -1,51 +1,58 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; import it.unimi.dsi.fastutil.io.BinIO; import java.io.File; import java.io.IOException; /** * CLI program used to compose two on-disk permutations. * * It takes two on-disk permutations as parameters, p1 and p2, and writes on disk (p1 o p2) at the * given location. This is useful for multi-step compression (e.g., Unordered -> BFS -> LLP), as it * can be used to merge all the intermediate permutations. 
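 * <p>
 * Hypothetical invocation (file names illustrative):
 * <pre>{@code
 * java org.softwareheritage.graph.compress.ComposePermutations bfs.order llp.order combined.order
 * }</pre>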
*/ public class ComposePermutations { private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ new UnflaggedOption("firstPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "The first permutation"), new UnflaggedOption("secondPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "The second permutation"), new UnflaggedOption("outputPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "The output permutation"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException, ClassNotFoundException { JSAPResult config = parse_args(args); String firstPermFilename = config.getString("firstPermutation"); String secondPermFilename = config.getString("secondPermutation"); String outputPermFilename = config.getString("outputPermutation"); long[][] firstPerm = BinIO.loadLongsBig(new File(firstPermFilename)); long[][] secondPerm = BinIO.loadLongsBig(new File(secondPermFilename)); long[][] outputPerm = Util.composePermutationsInPlace(firstPerm, secondPerm); BinIO.storeLongs(outputPerm, outputPermFilename); } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java index e055f7d..9d58fff 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java @@ -1,404 +1,411 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import com.github.luben.zstd.ZstdOutputStream; import com.martiansoftware.jsap.*; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.utils.Sort; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.ExecutionException; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLongArray; /** * Read a graph dataset and extract all the unique node SWHIDs it contains, including the ones that * are not stored as actual objects in the graph, but only referred to by the edges. * Additionally, extract the set of all unique edge labels in the graph. * * * *

 * <p>
 * Rationale: Because the graph can contain holes, loose objects and dangling objects, some nodes
 * that are referred to as destinations in the edge relationships might not actually be stored in
 * the graph itself. However, to compress the graph using a graph compression library, it is
 * necessary to have a list of all the nodes in the graph, including the ones that are simply
 * referred to by the edges but not actually stored as concrete objects.
 * </p>
 *
 * <p>
 * This class reads the entire graph dataset, and uses <code>sort -u</code> to extract the set of
 * all the unique nodes and unique labels that will be needed as an input for the compression
 * process.
 * </p>
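 *
 * <p>
 * A minimal caller sketch (paths are hypothetical), mirroring what {@link #main} does:
 * </p>
 *
 * <pre>{@code
 * GraphDataset dataset = new ORCGraphDataset("/srv/dataset/orc");
 * ExtractNodes.extractNodes(dataset, "/srv/out/graph", "256M", new File("/tmp/sort"));
 * }</pre>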
*/ public class ExtractNodes { private final static Logger logger = LoggerFactory.getLogger(ExtractNodes.class); // Create one thread per processor. final static int numThreads = Runtime.getRuntime().availableProcessors(); // Allocate up to 20% of maximum memory for sorting subprocesses. final static long sortBufferSize = (long) (Runtime.getRuntime().maxMemory() * 0.2 / numThreads / 2); private static JSAPResult parseArgs(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the edges dataset"), new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output files"), new FlaggedOption("format", JSAP.STRING_PARSER, "orc", JSAP.NOT_REQUIRED, 'f', "format", "Format of the input dataset (orc, csv)"), new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, String.valueOf(sortBufferSize) + "b", JSAP.NOT_REQUIRED, 'S', "sort-buffer-size", "Size of the memory buffer used by each sort process"), new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir", "Path to the temporary directory used by sort")}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { System.err.println("Usage error: " + e.getMessage()); System.exit(1); } return config; } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult parsedArgs = parseArgs(args); String datasetPath = parsedArgs.getString("dataset"); String outputBasename = parsedArgs.getString("outputBasename"); String datasetFormat = parsedArgs.getString("format"); String sortBufferSize = parsedArgs.getString("sortBufferSize"); String sortTmpPath = parsedArgs.getString("sortTmpDir", null); File sortTmpDir = new File(sortTmpPath); sortTmpDir.mkdirs(); // Open edge dataset GraphDataset dataset; if (datasetFormat.equals("orc")) { dataset = new ORCGraphDataset(datasetPath); } else if (datasetFormat.equals("csv")) { dataset = new CSVEdgeDataset(datasetPath); } else { throw new IllegalArgumentException("Unknown dataset format: " + datasetFormat); } extractNodes(dataset, outputBasename, sortBufferSize, sortTmpDir); } public static void extractNodes(GraphDataset dataset, String outputBasename, String sortBufferSize, File sortTmpDir) throws IOException, InterruptedException { // Read the dataset and write the nodes and labels to the sorting processes AtomicLong edgeCount = new AtomicLong(0); AtomicLongArray edgeCountByType = new AtomicLongArray(Node.Type.values().length * Node.Type.values().length); int numThreads = Runtime.getRuntime().availableProcessors(); ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); Process[] nodeSorters = new Process[numThreads]; File[] nodeBatchPaths = new File[numThreads]; Process[] labelSorters = new Process[numThreads]; File[] labelBatches = new File[numThreads]; long[] progressCounts = new long[numThreads]; AtomicInteger nextThreadId = new AtomicInteger(0); ThreadLocal threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement); ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS); pl.itemsName = "edges"; pl.start("Reading node/edge files and writing sorted batches."); GraphDataset.NodeCallback nodeCallback = (node) -> { int threadId = threadLocalId.get(); if (nodeSorters[threadId] == null) { nodeBatchPaths[threadId] = File.createTempFile("nodes", ".txt", sortTmpDir); nodeSorters[threadId] = 
Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), List.of("-o", nodeBatchPaths[threadId].getPath())); } OutputStream nodeOutputStream = nodeSorters[threadId].getOutputStream(); nodeOutputStream.write(node); nodeOutputStream.write('\n'); }; GraphDataset.NodeCallback labelCallback = (label) -> { int threadId = threadLocalId.get(); if (labelSorters[threadId] == null) { labelBatches[threadId] = File.createTempFile("labels", ".txt", sortTmpDir); labelSorters[threadId] = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), List.of("-o", labelBatches[threadId].getPath())); } OutputStream labelOutputStream = labelSorters[threadId].getOutputStream(); labelOutputStream.write(label); labelOutputStream.write('\n'); }; try { forkJoinPool.submit(() -> { try { dataset.readEdges((node) -> { nodeCallback.onNode(node); }, (src, dst, label, perm) -> { nodeCallback.onNode(src); nodeCallback.onNode(dst); if (label != null) { labelCallback.onNode(label); } edgeCount.incrementAndGet(); // Extract type of src and dst from their SWHID: swh:1:XXX byte[] srcTypeBytes = Arrays.copyOfRange(src, 6, 6 + 3); byte[] dstTypeBytes = Arrays.copyOfRange(dst, 6, 6 + 3); int srcType = Node.Type.byteNameToInt(srcTypeBytes); int dstType = Node.Type.byteNameToInt(dstTypeBytes); if (srcType != -1 && dstType != -1) { edgeCountByType.incrementAndGet(srcType * Node.Type.values().length + dstType); } else { System.err.println("Invalid edge type: " + new String(srcTypeBytes) + " -> " + new String(dstTypeBytes)); System.exit(1); } int threadId = threadLocalId.get(); if (++progressCounts[threadId] > 1000) { synchronized (pl) { pl.update(progressCounts[threadId]); } progressCounts[threadId] = 0; } }); } catch (IOException e) { throw new RuntimeException(e); } }).get(); } catch (ExecutionException e) { throw new RuntimeException(e); } // Close all the sorters stdin for (int i = 0; i < numThreads; i++) { if (nodeSorters[i] != null) { nodeSorters[i].getOutputStream().close(); } if (labelSorters[i] != null) { labelSorters[i].getOutputStream().close(); } } // Wait for sorting processes to finish for (int i = 0; i < numThreads; i++) { if (nodeSorters[i] != null) { nodeSorters[i].waitFor(); } if (labelSorters[i] != null) { labelSorters[i].waitFor(); } } pl.done(); ArrayList nodeSortMergerOptions = new ArrayList<>(List.of("-m")); ArrayList labelSortMergerOptions = new ArrayList<>(List.of("-m")); for (int i = 0; i < numThreads; i++) { if (nodeBatchPaths[i] != null) { nodeSortMergerOptions.add(nodeBatchPaths[i].getPath()); } if (labelBatches[i] != null) { labelSortMergerOptions.add(labelBatches[i].getPath()); } } // Spawn node merge-sorting process Process nodeSortMerger = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), nodeSortMergerOptions); nodeSortMerger.getOutputStream().close(); OutputStream nodesFileOutputStream = new ZstdOutputStream( new BufferedOutputStream(new FileOutputStream(outputBasename + ".nodes.csv.zst"))); NodesOutputThread nodesOutputThread = new NodesOutputThread( new BufferedInputStream(nodeSortMerger.getInputStream()), nodesFileOutputStream); nodesOutputThread.start(); // Spawn label merge-sorting process Process labelSortMerger = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), labelSortMergerOptions); labelSortMerger.getOutputStream().close(); OutputStream labelsFileOutputStream = new ZstdOutputStream( new BufferedOutputStream(new FileOutputStream(outputBasename + ".labels.csv.zst"))); LabelsOutputThread labelsOutputThread = new LabelsOutputThread( new BufferedInputStream(labelSortMerger.getInputStream()), 
labelsFileOutputStream); labelsOutputThread.start(); pl.logger().info("Waiting for merge-sort and writing output files..."); nodeSortMerger.waitFor(); labelSortMerger.waitFor(); nodesOutputThread.join(); labelsOutputThread.join(); long[][] edgeCountByTypeArray = new long[Node.Type.values().length][Node.Type.values().length]; for (int i = 0; i < edgeCountByTypeArray.length; i++) { for (int j = 0; j < edgeCountByTypeArray[i].length; j++) { edgeCountByTypeArray[i][j] = edgeCountByType.get(i * Node.Type.values().length + j); } } // Write node, edge and label counts/statistics printEdgeCounts(outputBasename, edgeCount.get(), edgeCountByTypeArray); printNodeCounts(outputBasename, nodesOutputThread.getNodeCount(), nodesOutputThread.getNodeTypeCounts()); printLabelCounts(outputBasename, labelsOutputThread.getLabelCount()); // Clean up sorted batches for (int i = 0; i < numThreads; i++) { if (nodeBatchPaths[i] != null) { nodeBatchPaths[i].delete(); } if (labelBatches[i] != null) { labelBatches[i].delete(); } } } private static void printEdgeCounts(String basename, long edgeCount, long[][] edgeTypeCounts) throws IOException { PrintWriter nodeCountWriter = new PrintWriter(basename + ".edges.count.txt"); nodeCountWriter.println(edgeCount); nodeCountWriter.close(); PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".edges.stats.txt"); TreeMap edgeTypeCountsMap = new TreeMap<>(); for (Node.Type src : Node.Type.values()) { for (Node.Type dst : Node.Type.values()) { long cnt = edgeTypeCounts[Node.Type.toInt(src)][Node.Type.toInt(dst)]; if (cnt > 0) edgeTypeCountsMap.put(src.toString().toLowerCase() + ":" + dst.toString().toLowerCase(), cnt); } } for (Map.Entry entry : edgeTypeCountsMap.entrySet()) { nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue()); } nodeTypesCountWriter.close(); } private static void printNodeCounts(String basename, long nodeCount, long[] nodeTypeCounts) throws IOException { PrintWriter nodeCountWriter = new PrintWriter(basename + ".nodes.count.txt"); nodeCountWriter.println(nodeCount); nodeCountWriter.close(); PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".nodes.stats.txt"); TreeMap nodeTypeCountsMap = new TreeMap<>(); for (Node.Type v : Node.Type.values()) { nodeTypeCountsMap.put(v.toString().toLowerCase(), nodeTypeCounts[Node.Type.toInt(v)]); } for (Map.Entry entry : nodeTypeCountsMap.entrySet()) { nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue()); } nodeTypesCountWriter.close(); } private static void printLabelCounts(String basename, long labelCount) throws IOException { PrintWriter nodeCountWriter = new PrintWriter(basename + ".labels.count.txt"); nodeCountWriter.println(labelCount); nodeCountWriter.close(); } private static class NodesOutputThread extends Thread { private final InputStream sortedNodesStream; private final OutputStream nodesOutputStream; private long nodeCount = 0; private final long[] nodeTypeCounts = new long[Node.Type.values().length]; NodesOutputThread(InputStream sortedNodesStream, OutputStream nodesOutputStream) { this.sortedNodesStream = sortedNodesStream; this.nodesOutputStream = nodesOutputStream; } @Override public void run() { BufferedReader reader = new BufferedReader( new InputStreamReader(sortedNodesStream, StandardCharsets.UTF_8)); try { String line; while ((line = reader.readLine()) != null) { nodesOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); nodesOutputStream.write('\n'); nodeCount++; try { Node.Type nodeType = Node.Type.fromStr(line.split(":")[2]); 
nodeTypeCounts[Node.Type.toInt(nodeType)]++; } catch (ArrayIndexOutOfBoundsException e) { System.err.println("Error parsing SWHID: " + line); System.exit(1); } } nodesOutputStream.close(); } catch (IOException e) { throw new RuntimeException(e); } } public long getNodeCount() { return nodeCount; } public long[] getNodeTypeCounts() { return nodeTypeCounts; } } private static class LabelsOutputThread extends Thread { private final InputStream sortedLabelsStream; private final OutputStream labelsOutputStream; private long labelCount = 0; LabelsOutputThread(InputStream sortedLabelsStream, OutputStream labelsOutputStream) { this.labelsOutputStream = labelsOutputStream; this.sortedLabelsStream = sortedLabelsStream; } @Override public void run() { BufferedReader reader = new BufferedReader( new InputStreamReader(sortedLabelsStream, StandardCharsets.UTF_8)); try { String line; while ((line = reader.readLine()) != null) { labelsOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); labelsOutputStream.write('\n'); labelCount++; } labelsOutputStream.close(); } catch (IOException e) { throw new RuntimeException(e); } } public long getLabelCount() { return labelCount; } } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java index 6bf20e4..fc5cc5b 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java @@ -1,129 +1,136 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import com.github.luben.zstd.ZstdOutputStream; import com.martiansoftware.jsap.*; import org.softwareheritage.graph.utils.Sort; import java.io.*; import java.nio.charset.StandardCharsets; /** * Read a graph dataset and extract all the unique authors it contains. * *

 * <p>
 * This class reads the revision and release tables of the graph dataset, and uses
 * <code>sort -u</code> to extract the set of all the unique persons (name + email, potentially
 * pseudonymized) and store them in a file.
 * </p>
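 *
 * <p>
 * A minimal caller sketch (paths are hypothetical), mirroring what {@link #main} does; the
 * <code>"30%"</code> buffer size matches the CLI default:
 * </p>
 *
 * <pre>{@code
 * ORCGraphDataset dataset = new ORCGraphDataset("/srv/dataset/orc");
 * ExtractPersons.extractPersons(dataset, "/srv/out/graph", "30%", "/tmp/sort");
 * }</pre>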
*/ public class ExtractPersons { private static JSAPResult parseArgs(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC dataset"), new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output files"), new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, "30%", JSAP.NOT_REQUIRED, 'S', "sort-buffer-size", "Size of the memory buffer used by sort"), new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir", "Path to the temporary directory used by sort")}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { System.err.println("Usage error: " + e.getMessage()); System.exit(1); } return config; } private static void processAuthorColumn(ORCGraphDataset.SwhOrcTable table, String columnName, OutputStream stream) throws IOException { table.readBytes64Column(columnName, (swhid, personBase64) -> { stream.write(personBase64); stream.write('\n'); }); } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult parsedArgs = parseArgs(args); String datasetPath = parsedArgs.getString("dataset"); String outputBasename = parsedArgs.getString("outputBasename"); String sortBufferSize = parsedArgs.getString("sortBufferSize"); String sortTmpDir = parsedArgs.getString("sortTmpDir", null); ORCGraphDataset dataset = new ORCGraphDataset(datasetPath); extractPersons(dataset, outputBasename, sortBufferSize, sortTmpDir); } public static void extractPersons(ORCGraphDataset dataset, String outputBasename, String sortBufferSize, String sortTmpDir) throws IOException, InterruptedException { (new File(sortTmpDir)).mkdirs(); // Spawn person sorting process Process personSort = Sort.spawnSort(sortBufferSize, sortTmpDir); BufferedOutputStream personSortStdin = new BufferedOutputStream(personSort.getOutputStream()); BufferedInputStream personSortStdout = new BufferedInputStream(personSort.getInputStream()); OutputStream personsFileOutputStream = new ZstdOutputStream( new BufferedOutputStream(new FileOutputStream(outputBasename + ".persons.csv.zst"))); PersonsOutputThread personsOutputThread = new PersonsOutputThread(personSortStdout, personsFileOutputStream); personsOutputThread.start(); processAuthorColumn(dataset.getTable("release"), "author", personSortStdin); processAuthorColumn(dataset.getTable("revision"), "author", personSortStdin); processAuthorColumn(dataset.getTable("revision"), "committer", personSortStdin); // Wait for sorting processes to finish personSortStdin.close(); personSort.waitFor(); personsOutputThread.join(); // Write person count statistics printPersonsCounts(outputBasename, personsOutputThread.getPersonCount()); } private static void printPersonsCounts(String basename, long labelCount) throws IOException { PrintWriter nodeCountWriter = new PrintWriter(basename + ".persons.count.txt"); nodeCountWriter.println(labelCount); nodeCountWriter.close(); } private static class PersonsOutputThread extends Thread { private final InputStream sortedPersonsStream; private final OutputStream personsOutputStream; private long personCount = 0; PersonsOutputThread(InputStream sortedNodesStream, OutputStream nodesOutputStream) { this.sortedPersonsStream = sortedNodesStream; this.personsOutputStream = nodesOutputStream; } @Override public void run() { BufferedReader reader = new BufferedReader( new 
InputStreamReader(sortedPersonsStream, StandardCharsets.UTF_8)); try { String line; while ((line = reader.readLine()) != null) { personsOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); personsOutputStream.write('\n'); personCount++; } personsOutputStream.close(); } catch (IOException e) { throw new RuntimeException(e); } } public long getPersonCount() { return personCount; } } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java index ebd9adb..ae38cda 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java @@ -1,60 +1,67 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import java.io.IOException; /** * GraphDataset is a common interface to represent on-disk graph datasets in various formats, * usually extracted from the SWH archive with the swh-dataset tool. */ public interface GraphDataset { interface NodeCallback { void onNode(byte[] node) throws IOException; } interface EdgeCallback { void onEdge(byte[] src, byte[] dst, byte[] label, int permission) throws IOException; } /** * Read the graph dataset and call the callback methods for each node and edge encountered. * *
 * <ul>
 * <li>The node callback is called for each object stored in the graph.</li>
 * <li>The edge callback is called for each relationship (between two nodes) stored in the
 * graph.</li>
 * </ul>
 *
 * <p>
 * Note that because the graph can contain holes, loose objects and dangling objects, the edge
 * callback may be called with parameters representing nodes that are not stored in the graph. This
 * is because some nodes that are referred to as destinations in the dataset might not be present in
 * the archive (e.g., a revision entry in a directory pointing to a revision that we have not
 * crawled yet).
 * </p>
 *
 * <p>
 * In order to generate a complete set of all the nodes that are referred to in the graph
 * dataset, see the {@link ExtractNodes} class.
 * </p>
 *
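 * <p>
 * A minimal caller sketch: count nodes and edges with lambda callbacks (the counter array here is
 * illustrative only):
 * </p>
 *
 * <pre>{@code
 * long[] counts = new long[2];
 * dataset.readEdges(
 *     node -> counts[0]++,
 *     (src, dst, label, perm) -> counts[1]++);
 * }</pre>
 *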
* * @param nodeCb callback for each node * @param edgeCb callback for each edge */ void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException; interface TimestampCallback { void onTimestamp(byte[] swhid, long timestamp, short offset) throws IOException; } interface LongCallback { void onLong(byte[] swhid, long value) throws IOException; } interface BytesCallback { void onBytes(byte[] swhid, byte[] value) throws IOException; } interface HashedEdgeCallback { void onHashedEdge(long src, long dst, long label, int permission) throws IOException; } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java index 9279c08..31531ec 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java @@ -1,480 +1,487 @@ +/* + * Copyright (c) 2020-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.longs.LongHeapSemiIndirectPriorityQueue; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.io.InputBitStream; import it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.labels.DirEntry; import org.softwareheritage.graph.labels.SwhLabel; import org.softwareheritage.graph.maps.NodeIdMap; import org.softwareheritage.graph.utils.ForkJoinBigQuickSort2; import org.softwareheritage.graph.utils.ForkJoinQuickSort3; import java.io.*; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.ExecutionException; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; public class LabelMapBuilder { final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class); // Create one thread per processor. final static int numThreads = Runtime.getRuntime().availableProcessors(); // Allocate up to 40% of maximum memory. final static int DEFAULT_BATCH_SIZE = Math .min((int) (Runtime.getRuntime().maxMemory() * 0.4 / (numThreads * 8 * 3)), Arrays.MAX_ARRAY_SIZE); String orcDatasetPath; String graphPath; String outputGraphPath; String tmpDir; int batchSize; long numNodes; long numArcs; NodeIdMap nodeIdMap; Object2LongFunction filenameMph; long numFilenames; int totalLabelWidth; public LabelMapBuilder(String orcDatasetPath, String graphPath, String outputGraphPath, int batchSize, String tmpDir) throws IOException { this.orcDatasetPath = orcDatasetPath; this.graphPath = graphPath; this.outputGraphPath = (outputGraphPath == null) ? 
graphPath : outputGraphPath; this.batchSize = batchSize; this.tmpDir = tmpDir; ImmutableGraph graph = ImmutableGraph.loadOffline(graphPath); this.numArcs = graph.numArcs(); this.numNodes = graph.numNodes(); this.nodeIdMap = new NodeIdMap(graphPath); filenameMph = NodeIdMap.loadMph(graphPath + ".labels.mph"); numFilenames = getMPHSize(filenameMph); totalLabelWidth = DirEntry.labelWidth(numFilenames); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"), new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output graph"), new FlaggedOption("outputGraphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "output-graph", "Basename of the output graph, same as --graph if not specified"), new FlaggedOption("batchSize", JSAP.INTEGER_PARSER, String.valueOf(DEFAULT_BATCH_SIZE), JSAP.NOT_REQUIRED, 'b', "batch-size", "Number of triplets held in memory in each batch"), new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 'T', "temp-dir", "Temporary directory path"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); String orcDataset = config.getString("dataset"); String graphPath = config.getString("graphPath"); String outputGraphPath = config.getString("outputGraphPath"); int batchSize = config.getInt("batchSize"); String tmpDir = config.getString("tmpDir"); LabelMapBuilder builder = new LabelMapBuilder(orcDataset, graphPath, outputGraphPath, batchSize, tmpDir); builder.computeLabelMap(); } static long getMPHSize(Object2LongFunction mph) { return (mph instanceof Size64) ? 
((Size64) mph).size64() : mph.size(); } void computeLabelMap() throws IOException { File tempDirFile = new File(tmpDir); ObjectArrayList forwardBatches = new ObjectArrayList<>(); ObjectArrayList backwardBatches = new ObjectArrayList<>(); genSortedBatches(forwardBatches, backwardBatches, tempDirFile); BatchEdgeLabelLineIterator forwardBatchHeapIterator = new BatchEdgeLabelLineIterator(forwardBatches); writeLabels(forwardBatchHeapIterator, graphPath, outputGraphPath); for (File batch : forwardBatches) { batch.delete(); } BatchEdgeLabelLineIterator backwardBatchHeapIterator = new BatchEdgeLabelLineIterator(backwardBatches); writeLabels(backwardBatchHeapIterator, graphPath + "-transposed", outputGraphPath + "-transposed"); for (File batch : backwardBatches) { batch.delete(); } logger.info("Done"); } void genSortedBatches(ObjectArrayList forwardBatches, ObjectArrayList backwardBatches, File tempDirFile) throws IOException { logger.info("Initializing batch arrays."); long[][] srcArrays = new long[numThreads][batchSize]; long[][] dstArrays = new long[numThreads][batchSize]; long[][] labelArrays = new long[numThreads][batchSize]; int[] indexes = new int[numThreads]; long[] progressCounts = new long[numThreads]; ProgressLogger plSortingBatches = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plSortingBatches.itemsName = "edges"; plSortingBatches.expectedUpdates = this.numArcs; plSortingBatches.start("Reading edges and writing sorted batches."); AtomicInteger nextThreadId = new AtomicInteger(0); ThreadLocal threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement); readHashedEdgeLabels((src, dst, label, perms) -> { // System.err.println("0. Input " + src + " " + dst + " " + label + " " + perms); int threadId = threadLocalId.get(); int idx = indexes[threadId]++; srcArrays[threadId][idx] = src; dstArrays[threadId][idx] = dst; labelArrays[threadId][idx] = DirEntry.toEncoded(label, perms); if (++progressCounts[threadId] > 1000) { synchronized (plSortingBatches) { plSortingBatches.update(progressCounts[threadId]); } progressCounts[threadId] = 0; } if (idx == batchSize - 1) { processBidirectionalBatches(batchSize, srcArrays[threadId], dstArrays[threadId], labelArrays[threadId], tempDirFile, forwardBatches, backwardBatches); indexes[threadId] = 0; } }); IntStream.range(0, numThreads).parallel().forEach(t -> { int idx = indexes[t]; if (idx > 0) { try { processBidirectionalBatches(idx, srcArrays[t], dstArrays[t], labelArrays[t], tempDirFile, forwardBatches, backwardBatches); } catch (IOException e) { throw new RuntimeException(e); } } }); // Trigger the GC to free up the large arrays for (int i = 0; i < numThreads; i++) { srcArrays[i] = null; dstArrays[i] = null; labelArrays[i] = null; } logger.info("Created " + forwardBatches.size() + " forward batches and " + backwardBatches.size() + " backward batches."); } void readHashedEdgeLabels(GraphDataset.HashedEdgeCallback cb) throws IOException { ORCGraphDataset dataset = new ORCGraphDataset(orcDatasetPath); ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); try { forkJoinPool.submit(() -> { try { dataset.readEdges((node) -> { }, (src, dst, label, perms) -> { if (label == null) { return; } long srcNode = nodeIdMap.getNodeId(src); long dstNode = nodeIdMap.getNodeId(dst); long labelId = filenameMph.getLong(label); cb.onHashedEdge(srcNode, dstNode, labelId, perms); }); } catch (IOException e) { throw new RuntimeException(e); } }).get(); } catch (InterruptedException | ExecutionException e) { throw new RuntimeException(e); } } void 
processBidirectionalBatches(final int n, final long[] source, final long[] target, final long[] labels, final File tempDir, final List forwardBatches, final List backwardBatches) throws IOException { processBatch(n, source, target, labels, tempDir, forwardBatches); processBatch(n, target, source, labels, tempDir, backwardBatches); } void processBatch(final int n, final long[] source, final long[] target, final long[] labels, final File tempDir, final List batches) throws IOException { if (n == 0) { return; } ForkJoinQuickSort3.parallelQuickSort(source, target, labels, 0, n); final File batchFile = File.createTempFile("batch", ".bitstream", tempDir); batchFile.deleteOnExit(); batches.add(batchFile); final OutputBitStream batch = new OutputBitStream(batchFile); // Compute unique triplets int u = 1; for (int i = n - 1; i-- != 0;) { if (source[i] != source[i + 1] || target[i] != target[i + 1] || labels[i] != labels[i + 1]) { u++; } } batch.writeDelta(u); // Write batch long prevSource = source[0]; batch.writeLongDelta(prevSource); batch.writeLongDelta(target[0]); batch.writeLongDelta(labels[0]); // System.err.println("1. Wrote " + prevSource + " " + target[0] + " " + labels[0]); for (int i = 1; i < n; i++) { if (source[i] != prevSource) { // Default case, we write (source - prevsource, target, label) batch.writeLongDelta(source[i] - prevSource); batch.writeLongDelta(target[i]); batch.writeLongDelta(labels[i]); prevSource = source[i]; } else if (target[i] != target[i - 1] || labels[i] != labels[i - 1]) { // Case where source is identical with prevsource, but target or label differ. // We write (0, target - prevtarget, label) batch.writeLongDelta(0); batch.writeLongDelta(target[i] - target[i - 1]); batch.writeLongDelta(labels[i]); } else { continue; } // System.err.println("1. 
Wrote " + source[i] + " " + target[i] + " " + labels[i]); } batch.close(); } void writeLabels(EdgeLabelLineIterator mapLines, String graphBasename, String outputGraphBasename) throws IOException { // Loading the graph to iterate ImmutableGraph graph = ImmutableGraph.loadMapped(graphBasename); // Get the sorted output and write the labels and label offsets ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plLabels.itemsName = "edges"; plLabels.expectedUpdates = this.numArcs; plLabels.start("Writing the labels to the label file: " + outputGraphBasename + "-labelled.*"); OutputBitStream labels = new OutputBitStream( new File(outputGraphBasename + "-labelled" + BitStreamArcLabelledImmutableGraph.LABELS_EXTENSION)); OutputBitStream offsets = new OutputBitStream(new File( outputGraphBasename + "-labelled" + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION)); offsets.writeGamma(0); EdgeLabelLine line = new EdgeLabelLine(-1, -1, -1, -1); NodeIterator it = graph.nodeIterator(); boolean started = false; ArrayList labelBuffer = new ArrayList<>(128); while (it.hasNext()) { long srcNode = it.nextLong(); long bits = 0; LazyLongIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { while (line != null && line.srcNode <= srcNode && line.dstNode <= dstNode) { if (line.srcNode == srcNode && line.dstNode == dstNode) { labelBuffer.add(new DirEntry(line.filenameId, line.permission)); } if (!mapLines.hasNext()) break; line = mapLines.next(); if (!started) { plLabels.start("Writing label map to file..."); started = true; } } SwhLabel l = new SwhLabel("edgelabel", totalLabelWidth, labelBuffer.toArray(new DirEntry[0])); labelBuffer.clear(); bits += l.toBitStream(labels, -1); plLabels.lightUpdate(); } offsets.writeLongGamma(bits); } labels.close(); offsets.close(); plLabels.done(); graph = null; PrintWriter pw = new PrintWriter(new FileWriter(outputGraphBasename + "-labelled.properties")); pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + SwhLabel.class.getName() + "(DirEntry," + totalLabelWidth + ")"); pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + Paths.get(outputGraphBasename).getFileName()); pw.close(); } public static class EdgeLabelLine { public long srcNode; public long dstNode; public long filenameId; public int permission; public EdgeLabelLine(long labelSrcNode, long labelDstNode, long labelFilenameId, int labelPermission) { this.srcNode = labelSrcNode; this.dstNode = labelDstNode; this.filenameId = labelFilenameId; this.permission = labelPermission; } } public abstract static class EdgeLabelLineIterator implements Iterator { @Override public abstract boolean hasNext(); @Override public abstract EdgeLabelLine next(); } public static class BatchEdgeLabelLineIterator extends EdgeLabelLineIterator { private static final int STD_BUFFER_SIZE = 128 * 1024; private final InputBitStream[] batchIbs; private final int[] inputStreamLength; private final long[] refArray; private final LongHeapSemiIndirectPriorityQueue queue; private final long[] prevTarget; /** The last returned node (-1 if no node has been returned yet). 
*/ private long lastNode; private long[][] lastNodeSuccessors = LongBigArrays.EMPTY_BIG_ARRAY; private long[][] lastNodeLabels = LongBigArrays.EMPTY_BIG_ARRAY; private long lastNodeOutdegree; private long lastNodeCurrentSuccessor; public BatchEdgeLabelLineIterator(final List batches) throws IOException { this.batchIbs = new InputBitStream[batches.size()]; this.refArray = new long[batches.size()]; this.prevTarget = new long[batches.size()]; this.queue = new LongHeapSemiIndirectPriorityQueue(refArray); this.inputStreamLength = new int[batches.size()]; for (int i = 0; i < batches.size(); i++) { batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); this.inputStreamLength[i] = batchIbs[i].readDelta(); this.refArray[i] = batchIbs[i].readLongDelta(); queue.enqueue(i); } this.lastNode = -1; this.lastNodeOutdegree = 0; this.lastNodeCurrentSuccessor = 0; } public boolean hasNextNode() { return !queue.isEmpty(); } private void readNextNode() throws IOException { assert hasNext(); int i; lastNode++; lastNodeOutdegree = 0; lastNodeCurrentSuccessor = 0; /* * We extract elements from the queue as long as their target is equal to last. If during the * process we exhaust a batch, we close it. */ while (!queue.isEmpty() && refArray[i = queue.first()] == lastNode) { lastNodeSuccessors = BigArrays.grow(lastNodeSuccessors, lastNodeOutdegree + 1); lastNodeLabels = BigArrays.grow(lastNodeLabels, lastNodeOutdegree + 1); long target = prevTarget[i] += batchIbs[i].readLongDelta(); long label = batchIbs[i].readLongDelta(); BigArrays.set(lastNodeSuccessors, lastNodeOutdegree, target); BigArrays.set(lastNodeLabels, lastNodeOutdegree, label); // System.err.println("2. Read " + lastNode + " " + target + " " + label); if (--inputStreamLength[i] == 0) { queue.dequeue(); batchIbs[i].close(); batchIbs[i] = null; } else { // We read a new source and update the queue. final long sourceDelta = batchIbs[i].readLongDelta(); if (sourceDelta != 0) { refArray[i] += sourceDelta; prevTarget[i] = 0; queue.changed(); } } lastNodeOutdegree++; } // Neither quicksort nor heaps are stable, so we reestablish order here. // LongBigArrays.radixSort(lastNodeSuccessors, lastNodeLabels, 0, lastNodeOutdegree); ForkJoinBigQuickSort2.parallelQuickSort(lastNodeSuccessors, lastNodeLabels, 0, lastNodeOutdegree); } @Override public boolean hasNext() { return lastNodeCurrentSuccessor < lastNodeOutdegree || hasNextNode(); } @Override public EdgeLabelLine next() { if (lastNode == -1 || lastNodeCurrentSuccessor >= lastNodeOutdegree) { try { do { readNextNode(); } while (hasNextNode() && lastNodeOutdegree == 0); } catch (IOException e) { throw new RuntimeException(e); } } long src = lastNode; long dst = BigArrays.get(lastNodeSuccessors, lastNodeCurrentSuccessor); long compressedLabel = BigArrays.get(lastNodeLabels, lastNodeCurrentSuccessor); long labelName = DirEntry.labelNameFromEncoded(compressedLabel); int permission = DirEntry.permissionFromEncoded(compressedLabel); // System.err.println("3. Output (encoded): " + src + " " + dst + " " + compressedLabel); // System.err.println("4. 
Output (decoded): " + src + " " + dst + " " + labelName + " " + // permission); lastNodeCurrentSuccessor++; return new EdgeLabelLine(src, dst, labelName, permission); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java index 80e0c7e..74ef2f3 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java @@ -1,194 +1,201 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import com.github.luben.zstd.ZstdInputStream; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.maps.NodeIdMap; import org.softwareheritage.graph.maps.NodeTypesMap; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.Scanner; import java.util.concurrent.TimeUnit; /** * Create maps needed at runtime by the graph service, in particular: *

 *
 * <ul>
 * <li>WebGraph long node id → SWHID</li>
 * <li>WebGraph long node id → SWH node type (enum)</li>
 * </ul>
 *
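 * <p>
 * Invocation sketch (paths are hypothetical); the sorted node list is read zstd-compressed on
 * standard input:
 * </p>
 *
 * <pre>{@code
 * java org.softwareheritage.graph.compress.NodeMapBuilder /srv/graph/graph /tmp/sort \
 *     < graph.nodes.csv.zst
 * }</pre>
 *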
* * @author The Software Heritage developers */ public class NodeMapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class); /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException { if (args.length != 2) { logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV"); System.exit(1); } String graphPath = args[0]; String tmpDir = args[1]; logger.info("starting maps generation..."); precomputeNodeIdMap(graphPath, tmpDir); logger.info("maps generation completed"); } /** * Computes and dumps on disk mapping files. * * @param graphPath path of the compressed graph */ static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plSWHID2Node.itemsName = "nodes"; plNode2SWHID.itemsName = "nodes"; // first half of SWHID->node mapping: SWHID -> WebGraph MPH (long) Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph"); long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); plSWHID2Node.expectedUpdates = nbIds; plNode2SWHID.expectedUpdates = nbIds; // second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); logger.info("loading BFS order file..."); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); logger.info("BFS order file loaded"); if (loaded != nbIds) { logger.error("graph contains " + nbIds + " nodes, but read " + loaded); System.exit(2); } /* * Read on stdin a list of SWHIDs, hash them with MPH, then permute them according to the .order * file */ FastBufferedReader buffer = new FastBufferedReader( new InputStreamReader(new ZstdInputStream(new BufferedInputStream(System.in)))); LineIterator swhidIterator = new LineIterator(buffer); /* * The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically * sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the * sorting job to /usr/bin/sort via pipes */ ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); // for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap try (BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream( new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) { /* * background handler for sort output, it will be fed SWHID/node pairs, and will itself fill * nodeToSwhidMap as soon as data from sort is ready. */ SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID); outputHandler.start(); /* * Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to * efficiently check edge restrictions. 
*/ final int nbBitsPerNodeType = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); plSWHID2Node.start("Hashing SWHIDs to fill sort input"); for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) { String swhidStr = swhidIterator.next().toString(); SWHID swhid = new SWHID(swhidStr); long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII)); long nodeId = BigArrays.get(bfsMap, mphId); sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII)); nodeTypesMap.set(nodeId, swhid.getType().ordinal()); plSWHID2Node.lightUpdate(); } plSWHID2Node.done(); sort_stdin.close(); // write type map logger.info("storing type map"); BinIO.storeObject(nodeTypesMap, graphPath + NodeTypesMap.NODE_TO_TYPE); logger.info("type map stored"); // wait for nodeToSwhidMap filling try { logger.info("waiting for node2swhid map..."); int sortExitCode = sort.waitFor(); if (sortExitCode != 0) { logger.error("sort returned non-zero exit code: " + sortExitCode); System.exit(2); } outputHandler.join(); } catch (InterruptedException e) { logger.error("processing of sort output failed with: " + e); System.exit(2); } } } private static class SortOutputHandler extends Thread { private final Scanner input; private final OutputStream output; private final ProgressLogger pl; SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) { this.input = new Scanner(input, StandardCharsets.US_ASCII); this.output = output; this.pl = pl; } public void run() { boolean sortDone = false; logger.info("node2swhid: waiting for sort output..."); while (input.hasNextLine()) { if (!sortDone) { sortDone = true; this.pl.start("filling node2swhid map"); } String line = input.nextLine(); // format: SWHID NODE_ID SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID try { output.write(swhid.toBytes()); } catch (IOException e) { logger.error("writing to node->SWHID map failed with: " + e); } this.pl.lightUpdate(); } this.pl.done(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java index 9ba0e38..d16b5ae 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java @@ -1,711 +1,718 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import com.github.luben.zstd.ZstdOutputStream; import com.google.common.primitives.Bytes; import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; import org.apache.commons.codec.digest.DigestUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.orc.RecordReader; import org.apache.orc.TypeDescription; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.util.*; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.ForkJoinTask; /** * A graph dataset in ORC format. * * This format of dataset is a full export of the graph, including all the edge and node properties. * * For convenience purposes, this class also provides a main method to print all the edges of the * graph, so that the output can be piped to * {@link it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph}. * * Reading edges from ORC files using this class is about ~2.5 times slower than reading them * directly from a plaintext format. */ public class ORCGraphDataset implements GraphDataset { final static Logger logger = LoggerFactory.getLogger(ORCGraphDataset.class); final static public int ORC_BATCH_SIZE = 16 * 1024; private File datasetDir; protected ORCGraphDataset() { } public ORCGraphDataset(String datasetPath) { this(new File(datasetPath)); } public ORCGraphDataset(File datasetDir) { if (!datasetDir.exists()) { throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist"); } this.datasetDir = datasetDir; } /** * Return the given table as a {@link SwhOrcTable}. The return value can be down-casted to the type * of the specific table it represents. */ public SwhOrcTable getTable(String tableName) { File tableDir = new File(datasetDir, tableName); if (!tableDir.exists()) { return null; } switch (tableName) { case "skipped_content": return new SkippedContentOrcTable(tableDir); case "content": return new ContentOrcTable(tableDir); case "directory": return new DirectoryOrcTable(tableDir); case "directory_entry": return new DirectoryEntryOrcTable(tableDir); case "revision": return new RevisionOrcTable(tableDir); case "revision_history": return new RevisionHistoryOrcTable(tableDir); case "release": return new ReleaseOrcTable(tableDir); case "snapshot_branch": return new SnapshotBranchOrcTable(tableDir); case "snapshot": return new SnapshotOrcTable(tableDir); case "origin_visit_status": return new OriginVisitStatusOrcTable(tableDir); case "origin_visit": return new OriginVisitOrcTable(tableDir); case "origin": return new OriginOrcTable(tableDir); default : return null; } } /** Return all the tables in this dataset as a map of {@link SwhOrcTable}. */ public Map allTables() { HashMap tables = new HashMap<>(); File[] tableDirs = datasetDir.listFiles(); if (tableDirs == null) { return tables; } for (File tableDir : tableDirs) { SwhOrcTable table = getTable(tableDir.getName()); if (table != null) { tables.put(tableDir.getName(), table); } } return tables; } public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { Map tables = allTables(); for (SwhOrcTable table : tables.values()) { table.readEdges(nodeCb, edgeCb); } } /** * A class representing an ORC table, stored on disk as a set of ORC files all in the same * directory. */ public static class ORCTable { private final File tableDir; public ORCTable(File tableDir) { if (!tableDir.exists()) { throw new IllegalArgumentException("Table " + tableDir.getName() + " does not exist"); } this.tableDir = tableDir; } public static ORCTable load(File tableDir) { return new ORCTable(tableDir); } /** * Utility function for byte columns. Return as a byte array the value of the given row in the * column vector. 
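 *
 * <p>
 * A usage sketch (the column name is hypothetical), following the pattern used by the table
 * readers below:
 * </p>
 *
 * <pre>{@code
 * BytesColumnVector ids = (BytesColumnVector) batch.cols[columnMap.get("id")];
 * for (int row = 0; row < batch.size; row++) {
 *     byte[] id = ORCTable.getBytesRow(ids, row); // null if the cell is null
 * }
 * }</pre>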
*/ public static byte[] getBytesRow(BytesColumnVector columnVector, int row) { if (columnVector.isRepeating) { row = 0; } if (columnVector.isNull[row]) { return null; } return Arrays.copyOfRange(columnVector.vector[row], columnVector.start[row], columnVector.start[row] + columnVector.length[row]); } /** * Utility function for long columns. Return as a long the value of the given row in the column * vector. */ public static Long getLongRow(LongColumnVector columnVector, int row) { if (columnVector.isRepeating) { row = 0; } if (columnVector.isNull[row]) { return null; } return columnVector.vector[row]; } interface ReadOrcBatchHandler { void accept(VectorizedRowBatch batch, Map columnMap) throws IOException; } /** * Read the table, calling the given handler for each new batch of rows. Optionally, if columns is * not null, will only scan the columns present in this set instead of the entire table. * * If this method is called from within a ForkJoinPool, the ORC table will be read in parallel using * that thread pool. Otherwise, the ORC files will be read sequentially. */ public void readOrcTable(ReadOrcBatchHandler batchHandler, Set columns) throws IOException { File[] listing = tableDir.listFiles(); if (listing == null) { throw new IOException("No files found in " + tableDir.getName()); } ForkJoinPool forkJoinPool = ForkJoinTask.getPool(); if (forkJoinPool == null) { // Sequential case for (File file : listing) { readOrcFile(file.getPath(), batchHandler, columns); } } else { // Parallel case ArrayList listingArray = new ArrayList<>(Arrays.asList(listing)); listingArray.parallelStream().forEach(file -> { try { readOrcFile(file.getPath(), batchHandler, columns); } catch (IOException e) { throw new RuntimeException(e); } }); } } private void readOrcFile(String path, ReadOrcBatchHandler batchHandler, Set columns) throws IOException { try (Reader reader = OrcFile.createReader(new Path(path), OrcFile.readerOptions(new Configuration()))) { TypeDescription schema = reader.getSchema(); Reader.Options options = reader.options(); if (columns != null) { options.include(createColumnsToRead(schema, columns)); } Map columnMap = getColumnMap(schema); try (RecordReader records = reader.rows(options)) { VectorizedRowBatch batch = reader.getSchema().createRowBatch(ORC_BATCH_SIZE); while (records.nextBatch(batch)) { batchHandler.accept(batch, columnMap); } } } } private static Map getColumnMap(TypeDescription schema) { Map columnMap = new HashMap<>(); List fieldNames = schema.getFieldNames(); for (int i = 0; i < fieldNames.size(); i++) { columnMap.put(fieldNames.get(i), i); } return columnMap; } private static boolean[] createColumnsToRead(TypeDescription schema, Set columns) { boolean[] columnsToRead = new boolean[schema.getMaximumId() + 1]; List fieldNames = schema.getFieldNames(); List columnTypes = schema.getChildren(); for (int i = 0; i < fieldNames.size(); i++) { if (columns.contains(fieldNames.get(i))) { logger.debug("Adding column " + fieldNames.get(i) + " with ID " + i + " to the read list"); TypeDescription type = columnTypes.get(i); for (int id = type.getId(); id <= type.getMaximumId(); id++) { columnsToRead[id] = true; } } } return columnsToRead; } } /** Base class for SWH-specific ORC tables. 
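 *
 * <p>
 * A usage sketch: stream all revision SWHIDs from a dataset (the dataset path is hypothetical):
 * </p>
 *
 * <pre>{@code
 * ORCGraphDataset dataset = new ORCGraphDataset("/srv/dataset/orc");
 * SwhOrcTable revisions = dataset.getTable("revision");
 * revisions.readIdColumn(swhid -> System.out.println(new String(swhid)));
 * }</pre>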
*/ public static class SwhOrcTable { protected ORCTable orcTable; protected static final byte[] cntPrefix = "swh:1:cnt:".getBytes(); protected static final byte[] dirPrefix = "swh:1:dir:".getBytes(); protected static final byte[] revPrefix = "swh:1:rev:".getBytes(); protected static final byte[] relPrefix = "swh:1:rel:".getBytes(); protected static final byte[] snpPrefix = "swh:1:snp:".getBytes(); protected static final byte[] oriPrefix = "swh:1:ori:".getBytes(); protected String getIdColumn() { return "id"; } protected byte[] getSwhidPrefix() { throw new UnsupportedOperationException(); } protected byte[] idToSwhid(byte[] id) { return Bytes.concat(getSwhidPrefix(), id); } protected SwhOrcTable() { } public SwhOrcTable(File tableDir) { orcTable = new ORCTable(tableDir); } public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { // No nodes or edges to read in the table by default. } protected static byte[] urlToOriginId(byte[] url) { return DigestUtils.sha1Hex(url).getBytes(); } public void readIdColumn(NodeCallback cb) throws IOException { orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; for (int row = 0; row < batch.size; row++) { byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row)); cb.onNode(id); } }, Set.of(getIdColumn())); } public void readLongColumn(String longColumn, LongCallback cb) throws IOException { orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; LongColumnVector dateVector = (LongColumnVector) batch.cols[columnMap.get(longColumn)]; for (int row = 0; row < batch.size; row++) { byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row)); Long date = ORCTable.getLongRow(dateVector, row); if (date != null) { cb.onLong(id, date); } } }, Set.of(getIdColumn(), longColumn)); } public void readTimestampColumn(String dateColumn, String dateOffsetColumn, TimestampCallback cb) throws IOException { orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; TimestampColumnVector dateVector = (TimestampColumnVector) batch.cols[columnMap.get(dateColumn)]; LongColumnVector dateOffsetVector = (LongColumnVector) batch.cols[columnMap.get(dateOffsetColumn)]; for (int row = 0; row < batch.size; row++) { byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row)); long date = dateVector.getTimestampAsLong(row); // rounded to seconds Long dateOffset = ORCTable.getLongRow(dateOffsetVector, row); if (dateOffset != null) { cb.onTimestamp(id, date, dateOffset.shortValue()); } } }, Set.of(getIdColumn(), dateColumn, dateOffsetColumn)); } public void readBytes64Column(String longColumn, BytesCallback cb) throws IOException { orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; BytesColumnVector valueVector = (BytesColumnVector) batch.cols[columnMap.get(longColumn)]; for (int row = 0; row < batch.size; row++) { byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row)); byte[] value = Base64.getEncoder().encode(ORCTable.getBytesRow(valueVector, row)); cb.onBytes(id, value); } }, Set.of(getIdColumn(), longColumn)); } } public static class SkippedContentOrcTable extends SwhOrcTable { public SkippedContentOrcTable(File tableDir) { super(tableDir); } @Override protected String getIdColumn() { return "sha1_git"; } 
@Override protected byte[] getSwhidPrefix() { return cntPrefix; } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { readIdColumn(nodeCb); } } public static class ContentOrcTable extends SwhOrcTable { public ContentOrcTable(File tableDir) { super(tableDir); } @Override protected String getIdColumn() { return "sha1_git"; } @Override protected byte[] getSwhidPrefix() { return cntPrefix; } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { readIdColumn(nodeCb); } } public static class DirectoryOrcTable extends SwhOrcTable { public DirectoryOrcTable(File tableDir) { super(tableDir); } @Override protected byte[] getSwhidPrefix() { return dirPrefix; } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { readIdColumn(nodeCb); } } public static class DirectoryEntryOrcTable extends SwhOrcTable { public DirectoryEntryOrcTable(File tableDir) { super(tableDir); } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { byte[] cntType = "file".getBytes(); byte[] dirType = "dir".getBytes(); byte[] revType = "rev".getBytes(); orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector srcVector = (BytesColumnVector) batch.cols[columnMap.get("directory_id")]; BytesColumnVector dstVector = (BytesColumnVector) batch.cols[columnMap.get("target")]; BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("type")]; BytesColumnVector labelVector = (BytesColumnVector) batch.cols[columnMap.get("name")]; LongColumnVector permissionVector = (LongColumnVector) batch.cols[columnMap.get("perms")]; for (int row = 0; row < batch.size; row++) { byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row); byte[] targetPrefix; if (Arrays.equals(targetType, cntType)) { targetPrefix = cntPrefix; } else if (Arrays.equals(targetType, dirType)) { targetPrefix = dirPrefix; } else if (Arrays.equals(targetType, revType)) { targetPrefix = revPrefix; } else { continue; } byte[] src = Bytes.concat(dirPrefix, ORCTable.getBytesRow(srcVector, row)); byte[] dst = Bytes.concat(targetPrefix, ORCTable.getBytesRow(dstVector, row)); byte[] label = Base64.getEncoder().encode(ORCTable.getBytesRow(labelVector, row)); Long permission = ORCTable.getLongRow(permissionVector, row); edgeCb.onEdge(src, dst, label, permission != null ? 
permission.intValue() : 0); } }, Set.of("directory_id", "target", "type", "name", "perms")); } } public static class RevisionOrcTable extends SwhOrcTable { public RevisionOrcTable(File tableDir) { super(tableDir); } @Override protected byte[] getSwhidPrefix() { return revPrefix; } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector revisionIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")]; BytesColumnVector directoryIdVector = (BytesColumnVector) batch.cols[columnMap.get("directory")]; for (int row = 0; row < batch.size; row++) { byte[] revisionId = Bytes.concat(revPrefix, ORCTable.getBytesRow(revisionIdVector, row)); byte[] directoryId = Bytes.concat(dirPrefix, ORCTable.getBytesRow(directoryIdVector, row)); nodeCb.onNode(revisionId); edgeCb.onEdge(revisionId, directoryId, null, -1); } }, Set.of("id", "directory")); } } public static class RevisionHistoryOrcTable extends SwhOrcTable { public RevisionHistoryOrcTable(File tableDir) { super(tableDir); } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector revisionIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")]; BytesColumnVector parentIdVector = (BytesColumnVector) batch.cols[columnMap.get("parent_id")]; for (int row = 0; row < batch.size; row++) { byte[] parentId = Bytes.concat(revPrefix, ORCTable.getBytesRow(parentIdVector, row)); byte[] revisionId = Bytes.concat(revPrefix, ORCTable.getBytesRow(revisionIdVector, row)); edgeCb.onEdge(revisionId, parentId, null, -1); } }, Set.of("id", "parent_id")); } } public static class ReleaseOrcTable extends SwhOrcTable { public ReleaseOrcTable(File tableDir) { super(tableDir); } @Override protected byte[] getSwhidPrefix() { return relPrefix; } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { byte[] cntType = "content".getBytes(); byte[] dirType = "directory".getBytes(); byte[] revType = "revision".getBytes(); byte[] relType = "release".getBytes(); orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector releaseIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")]; BytesColumnVector targetIdVector = (BytesColumnVector) batch.cols[columnMap.get("target")]; BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("target_type")]; for (int row = 0; row < batch.size; row++) { byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row); byte[] targetPrefix; if (Arrays.equals(targetType, cntType)) { targetPrefix = cntPrefix; } else if (Arrays.equals(targetType, dirType)) { targetPrefix = dirPrefix; } else if (Arrays.equals(targetType, revType)) { targetPrefix = revPrefix; } else if (Arrays.equals(targetType, relType)) { targetPrefix = relPrefix; } else { continue; } byte[] releaseId = Bytes.concat(relPrefix, ORCTable.getBytesRow(releaseIdVector, row)); byte[] targetId = Bytes.concat(targetPrefix, ORCTable.getBytesRow(targetIdVector, row)); nodeCb.onNode(releaseId); edgeCb.onEdge(releaseId, targetId, null, -1); } }, Set.of("id", "target", "target_type")); } } public static class SnapshotOrcTable extends SwhOrcTable { public SnapshotOrcTable(File tableDir) { super(tableDir); } @Override protected byte[] getSwhidPrefix() { return snpPrefix; } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, 
GraphDataset.EdgeCallback edgeCb) throws IOException { readIdColumn(nodeCb); } } public static class SnapshotBranchOrcTable extends SwhOrcTable { public SnapshotBranchOrcTable(File tableDir) { super(tableDir); } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { byte[] cntType = "content".getBytes(); byte[] dirType = "directory".getBytes(); byte[] revType = "revision".getBytes(); byte[] relType = "release".getBytes(); orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector snapshotIdVector = (BytesColumnVector) batch.cols[columnMap.get("snapshot_id")]; BytesColumnVector targetIdVector = (BytesColumnVector) batch.cols[columnMap.get("target")]; BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("target_type")]; BytesColumnVector branchNameVector = (BytesColumnVector) batch.cols[columnMap.get("name")]; for (int row = 0; row < batch.size; row++) { byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row); byte[] targetPrefix; if (Arrays.equals(targetType, cntType)) { targetPrefix = cntPrefix; } else if (Arrays.equals(targetType, dirType)) { targetPrefix = dirPrefix; } else if (Arrays.equals(targetType, revType)) { targetPrefix = revPrefix; } else if (Arrays.equals(targetType, relType)) { targetPrefix = relPrefix; } else { continue; } byte[] snapshotId = Bytes.concat(snpPrefix, ORCTable.getBytesRow(snapshotIdVector, row)); byte[] targetId = Bytes.concat(targetPrefix, ORCTable.getBytesRow(targetIdVector, row)); byte[] branchName = Base64.getEncoder().encode(ORCTable.getBytesRow(branchNameVector, row)); nodeCb.onNode(snapshotId); edgeCb.onEdge(snapshotId, targetId, branchName, -1); } }, Set.of("snapshot_id", "name", "target", "target_type")); } } public static class OriginVisitStatusOrcTable extends SwhOrcTable { public OriginVisitStatusOrcTable(File tableDir) { super(tableDir); } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector originUrlVector = (BytesColumnVector) batch.cols[columnMap.get("origin")]; BytesColumnVector snapshotIdVector = (BytesColumnVector) batch.cols[columnMap.get("snapshot")]; for (int row = 0; row < batch.size; row++) { byte[] originId = urlToOriginId(ORCTable.getBytesRow(originUrlVector, row)); byte[] snapshot_id = ORCTable.getBytesRow(snapshotIdVector, row); if (snapshot_id == null || snapshot_id.length == 0) { continue; } edgeCb.onEdge(Bytes.concat(oriPrefix, originId), Bytes.concat(snpPrefix, snapshot_id), null, -1); } }, Set.of("origin", "snapshot")); } } public static class OriginVisitOrcTable extends SwhOrcTable { public OriginVisitOrcTable(File tableDir) { super(tableDir); } } public static class OriginOrcTable extends SwhOrcTable { public OriginOrcTable(File tableDir) { super(tableDir); } @Override protected byte[] getSwhidPrefix() { return oriPrefix; } @Override protected byte[] idToSwhid(byte[] id) { return Bytes.concat(getSwhidPrefix(), urlToOriginId(id)); } @Override protected String getIdColumn() { return "url"; } @Override public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { readIdColumn(nodeCb); } public void readURLs(BytesCallback cb) throws IOException { orcTable.readOrcTable((batch, columnMap) -> { BytesColumnVector urlVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; for (int row = 0; row < batch.size; row++) { byte[] id = 
idToSwhid(ORCTable.getBytesRow(urlVector, row)); byte[] url = Base64.getEncoder().encode(ORCTable.getBytesRow(urlVector, row)); cb.onBytes(id, url); } }, Set.of(getIdColumn())); } } /** * Export an ORC graph to the CSV edge dataset format as two different files, * nodes.csv.zst and edges.csv.zst. */ public static void exportToCsvDataset(String orcDataset, String csvDatasetBasename) throws IOException { ORCGraphDataset dataset = new ORCGraphDataset(orcDataset); File nodesFile = new File(csvDatasetBasename + ".nodes.csv.zst"); File edgesFile = new File(csvDatasetBasename + ".edges.csv.zst"); FastBufferedOutputStream nodesOut = new FastBufferedOutputStream( new ZstdOutputStream(new FileOutputStream(nodesFile))); FastBufferedOutputStream edgesOut = new FastBufferedOutputStream( new ZstdOutputStream(new FileOutputStream(edgesFile))); dataset.readEdges((node) -> { nodesOut.write(node); nodesOut.write('\n'); }, (src, dst, label, perms) -> { edgesOut.write(src); edgesOut.write(' '); edgesOut.write(dst); if (label != null) { edgesOut.write(' '); edgesOut.write(label); edgesOut.write(' '); } if (perms != -1) { edgesOut.write(' '); edgesOut.write(Long.toString(perms).getBytes()); } edgesOut.write('\n'); }); } /** * Print all the edges of the graph to stdout. Can be piped to * {@link it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph} to import the graph dataset and convert * it to a {@link it.unimi.dsi.big.webgraph.BVGraph}. */ public static void printSimpleEdges(String orcDataset) throws IOException { ORCGraphDataset dataset = new ORCGraphDataset(orcDataset); FastBufferedOutputStream out = new FastBufferedOutputStream(System.out); dataset.readEdges((node) -> { }, (src, dst, label, perms) -> { out.write(src); out.write(' '); out.write(dst); out.write('\n'); }); out.flush(); } public static void main(String[] args) throws IOException { printSimpleEdges(args[0]); } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java b/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java index 05531f5..9320d98 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java @@ -1,252 +1,259 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import java.io.File; import java.io.IOException; import java.util.concurrent.ExecutionException; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.big.webgraph.ImmutableSequentialGraph; import it.unimi.dsi.big.webgraph.NodeIterator; import it.unimi.dsi.big.webgraph.Transform; import it.unimi.dsi.fastutil.Arrays; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.UnflaggedOption; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import 
it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.logging.ProgressLogger; public class ScatteredArcsORCGraph extends ImmutableSequentialGraph { private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredArcsORCGraph.class); /** The default number of threads. */ public static final int DEFAULT_NUM_THREADS = Runtime.getRuntime().availableProcessors(); /** The default batch size. */ public static final int DEFAULT_BATCH_SIZE = Math .min((int) (Runtime.getRuntime().maxMemory() * 0.4 / (DEFAULT_NUM_THREADS * 8 * 2)), Arrays.MAX_ARRAY_SIZE); /** The batch graph used to return node iterators. */ private final Transform.BatchGraph batchGraph; /** * Creates a scattered-arcs ORC graph. * * @param dataset the Swh ORC Graph dataset * @param function an explicitly provided function from string representing nodes to node numbers, * or null for the standard behaviour. * @param n the number of nodes of the graph (used only if function is not * null). * @param numThreads the number of threads to use. * @param batchSize the number of integers in a batch; two arrays of integers of this size will be * allocated by each thread. * @param tempDir a temporary directory for the batches, or null for * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. * @param pl a progress logger, or null. */ public ScatteredArcsORCGraph(final ORCGraphDataset dataset, final Object2LongFunction function, final long n, final int numThreads, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { final ObjectArrayList batches = new ObjectArrayList<>(); ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); long[][] srcArrays = new long[numThreads][batchSize]; long[][] dstArrays = new long[numThreads][batchSize]; int[] indexes = new int[numThreads]; long[] progressCounts = new long[numThreads]; AtomicInteger pairs = new AtomicInteger(0); AtomicInteger nextThreadId = new AtomicInteger(0); ThreadLocal threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement); if (pl != null) { pl.itemsName = "arcs"; pl.start("Creating sorted batches..."); } try { forkJoinPool.submit(() -> { try { dataset.readEdges((node) -> { }, (src, dst, label, perms) -> { long s = function.getLong(src); long t = function.getLong(dst); int threadId = threadLocalId.get(); int idx = indexes[threadId]++; srcArrays[threadId][idx] = s; dstArrays[threadId][idx] = t; if (idx == batchSize - 1) { pairs.addAndGet(Transform.processBatch(batchSize, srcArrays[threadId], dstArrays[threadId], tempDir, batches)); indexes[threadId] = 0; } if (pl != null && ++progressCounts[threadId] > 1000) { synchronized (pl) { pl.update(progressCounts[threadId]); } progressCounts[threadId] = 0; } }); } catch (IOException e) { throw new RuntimeException(e); } }).get(); } catch (InterruptedException | ExecutionException e) { throw new RuntimeException(e); } IntStream.range(0, numThreads).parallel().forEach(t -> { int idx = indexes[t]; if (idx > 0) { try { pairs.addAndGet(Transform.processBatch(idx, srcArrays[t], dstArrays[t], tempDir, batches)); } catch (IOException e) { throw new RuntimeException(e); } } }); // Trigger the GC to free up the large arrays for (int i = 0; i < numThreads; i++) { srcArrays[i] = null; dstArrays[i] = null; } if (pl != null) { pl.done(); pl.logger().info("Created " + batches.size() + " batches."); } batchGraph = new Transform.BatchGraph(n, pairs.get(), batches); } @Override public long numNodes() { if (batchGraph == null) 
throw new UnsupportedOperationException( "The number of nodes is unknown (you need to generate all the batches first)."); return batchGraph.numNodes(); } @Override public long numArcs() { if (batchGraph == null) throw new UnsupportedOperationException( "The number of arcs is unknown (you need to generate all the batches first)."); return batchGraph.numArcs(); } @Override public NodeIterator nodeIterator(final long from) { return batchGraph.nodeIterator(from); } @Override public boolean hasCopiableIterators() { return batchGraph.hasCopiableIterators(); } @Override public ScatteredArcsORCGraph copy() { return this; } @SuppressWarnings("unchecked") public static void main(final String[] args) throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException { final SimpleJSAP jsap = new SimpleJSAP(ScatteredArcsORCGraph.class.getName(), "Converts a scattered list of arcs from an ORC graph dataset into a BVGraph.", new Parameter[]{ new FlaggedOption("logInterval", JSAP.LONG_PARSER, Long.toString(ProgressLogger.DEFAULT_LOG_INTERVAL), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds."), new FlaggedOption("numThreads", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_NUM_THREADS), JSAP.NOT_REQUIRED, 't', "threads", "The number of threads to use."), new FlaggedOption("batchSize", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_BATCH_SIZE), JSAP.NOT_REQUIRED, 's', "batch-size", "The maximum size of a batch, in arcs."), new FlaggedOption("tempDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T', "temp-dir", "A directory for all temporary batch files."), new FlaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "function", "A serialised function from strings to longs that will be used to translate identifiers to node numbers."), new FlaggedOption("comp", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag (may be specified several times).") .setAllowMultipleDeclarations(true), new FlaggedOption("windowSize", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_WINDOW_SIZE), JSAP.NOT_REQUIRED, 'w', "window-size", "Reference window size (0 to disable)."), new FlaggedOption("maxRefCount", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_MAX_REF_COUNT), JSAP.NOT_REQUIRED, 'm', "max-ref-count", "Maximum number of backward references (-1 for ∞)."), new FlaggedOption("minIntervalLength", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_MIN_INTERVAL_LENGTH), JSAP.NOT_REQUIRED, 'i', "min-interval-length", "Minimum length of an interval (0 to disable)."), new FlaggedOption("zetaK", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_ZETA_K), JSAP.NOT_REQUIRED, 'k', "zeta-k", "The k parameter for zeta-k codes."), new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The path to the ORC graph dataset."), new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the output graph"),}); final JSAPResult jsapResult = jsap.parse(args); if (jsap.messagePrinted()) System.exit(1); String basename = jsapResult.getString("basename"); String orcDatasetPath = jsapResult.getString("dataset"); ORCGraphDataset orcDataset = new ORCGraphDataset(orcDatasetPath); int flags = 0; for (final String compressionFlag : jsapResult.getStringArray("comp")) { try { flags |= BVGraph.class.getField(compressionFlag).getInt(BVGraph.class); } catch (final Exception notFound) { 
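// Compression flags are resolved by reflection: each flag must name a public
// static field of BVGraph, so a name that does not match any such field falls
// through to this catch.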
throw new JSAPException("Compression method " + compressionFlag + " unknown."); } } final int windowSize = jsapResult.getInt("windowSize"); final int zetaK = jsapResult.getInt("zetaK"); int maxRefCount = jsapResult.getInt("maxRefCount"); if (maxRefCount == -1) maxRefCount = Integer.MAX_VALUE; final int minIntervalLength = jsapResult.getInt("minIntervalLength"); if (!jsapResult.userSpecified("function")) { throw new IllegalArgumentException("Function must be specified."); } final Object2LongFunction function = (Object2LongFunction) BinIO .loadObject(jsapResult.getString("function")); long n = function instanceof Size64 ? ((Size64) function).size64() : function.size(); File tempDir = null; if (jsapResult.userSpecified("tempDir")) { tempDir = new File(jsapResult.getString("tempDir")); } final ProgressLogger pl = new ProgressLogger(LOGGER, jsapResult.getLong("logInterval"), TimeUnit.MILLISECONDS); final int batchSize = jsapResult.getInt("batchSize"); final int numThreads = jsapResult.getInt("numThreads"); final ScatteredArcsORCGraph graph = new ScatteredArcsORCGraph(orcDataset, function, n, numThreads, batchSize, tempDir, pl); BVGraph.store(graph, basename, windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl); } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java index e55d8a4..f06ba59 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java @@ -1,273 +1,280 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import com.martiansoftware.jsap.*; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.ints.IntBigArrays; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.fastutil.shorts.ShortBigArrays; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.maps.NodeIdMap; import org.softwareheritage.graph.compress.ORCGraphDataset.*; import java.io.FileOutputStream; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.*; import java.util.concurrent.atomic.AtomicLong; /** * This class is used to extract the node properties from the graph dataset, and write them to a set * of property files. * * Note: because the nodes are not sorted by type, we have an incentive to minimize the number of * "holes" in offset arrays. This is why many unrelated properties are cobbled together in the same * files (e.g. commit messages, tag messages and origin URLs are all in a "message" property file). * Once we migrate to a TypedImmutableGraph as the underlying storage of the graph, we can split all * the different properties in their own files. 
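 *
 * <p>
 * Illustrative usage (the paths and basename are assumptions for this example, not
 * part of the original patch):
 *
 * <pre>
 *     NodeIdMap nodeIdMap = new NodeIdMap("compressed/graph");
 *     WriteNodeProperties writer = new WriteNodeProperties("dataset/orc", "compressed/graph", nodeIdMap);
 *     writer.writeTimestamps(); // writes compressed/graph.property.author_timestamp.bin etc.
 * </pre>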
*/ public class WriteNodeProperties { final static Logger logger = LoggerFactory.getLogger(WriteNodeProperties.class); private final ORCGraphDataset dataset; private final String graphBasename; private final NodeIdMap nodeIdMap; private final long numNodes; public WriteNodeProperties(String dataset, String graphBasename, NodeIdMap nodeIdMap) { this.dataset = new ORCGraphDataset(dataset); this.graphBasename = graphBasename; this.nodeIdMap = nodeIdMap; this.numNodes = nodeIdMap.size64(); } public static String[] PROPERTY_WRITERS = new String[]{"timestamps", "content_length", "content_is_skipped", "person_ids", "messages", "tag_names",}; private static JSAPResult parseArgs(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"), new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output graph"), new FlaggedOption("properties", JSAP.STRING_PARSER, "*", JSAP.NOT_REQUIRED, 'p', "properties", "Properties to write, comma separated (default: all). Possible choices: " + String.join(",", PROPERTY_WRITERS)),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { System.err.println("Usage error: " + e.getMessage()); System.exit(1); } return config; } public static void main(String[] argv) throws IOException, ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException { JSAPResult args = parseArgs(argv); String dataset = args.getString("dataset"); String graphBasename = args.getString("graphBasename"); NodeIdMap nodeIdMap = new NodeIdMap(graphBasename); Set properties; if (args.getString("properties").equals("*")) { properties = Set.of(PROPERTY_WRITERS); } else { properties = new HashSet<>(Arrays.asList(args.getString("properties").split(","))); } WriteNodeProperties writer = new WriteNodeProperties(dataset, graphBasename, nodeIdMap); if (properties.contains("timestamps")) { writer.writeTimestamps(); } if (properties.contains("content_length")) { writer.writeContentLength(); } if (properties.contains("content_is_skipped")) { writer.writeContentIsSkipped(); } if (properties.contains("person_ids")) { writer.writePersonIds(); } if (properties.contains("messages")) { writer.writeMessages(); } if (properties.contains("tag_names")) { writer.writeTagNames(); } } public void writeContentLength() throws IOException { logger.info("Writing content lengths"); long[][] valueArray = LongBigArrays.newBigArray(numNodes); BigArrays.fill(valueArray, -1); for (String tableName : new String[]{"content", "skipped_content"}) { SwhOrcTable table = dataset.getTable(tableName); if (table == null) { continue; } table.readLongColumn("length", (swhid, value) -> { long id = nodeIdMap.getNodeId(swhid); BigArrays.set(valueArray, id, value); }); } BinIO.storeLongs(valueArray, graphBasename + ".property.content.length.bin"); } public void writeContentIsSkipped() throws IOException { LongArrayBitVector isSkippedBitVector = LongArrayBitVector.ofLength(numNodes); SwhOrcTable table = dataset.getTable("skipped_content"); if (table != null) { table.readIdColumn((swhid) -> { long id = nodeIdMap.getNodeId(swhid); isSkippedBitVector.set(id); }); } BinIO.storeObject(isSkippedBitVector, graphBasename + ".property.content.is_skipped.bin"); } public void writeTimestamps() throws IOException { logger.info("Writing author/committer timestamps for 
release + revision"); SwhOrcTable releaseTable = dataset.getTable("release"); SwhOrcTable revisionTable = dataset.getTable("revision"); long[][] timestampArray = LongBigArrays.newBigArray(numNodes); short[][] timestampOffsetArray = ShortBigArrays.newBigArray(numNodes); // Author timestamps BigArrays.fill(timestampArray, Long.MIN_VALUE); BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE); releaseTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> { long id = nodeIdMap.getNodeId(swhid); BigArrays.set(timestampArray, id, date); BigArrays.set(timestampOffsetArray, id, dateOffset); }); revisionTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> { long id = nodeIdMap.getNodeId(swhid); BigArrays.set(timestampArray, id, date); BigArrays.set(timestampOffsetArray, id, dateOffset); }); BinIO.storeLongs(timestampArray, graphBasename + ".property.author_timestamp.bin"); BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.author_timestamp_offset.bin"); // Committer timestamps BigArrays.fill(timestampArray, Long.MIN_VALUE); BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE); revisionTable.readTimestampColumn("committer_date", "committer_offset", (swhid, date, dateOffset) -> { long id = nodeIdMap.getNodeId(swhid); BigArrays.set(timestampArray, id, date); BigArrays.set(timestampOffsetArray, id, dateOffset); }); BinIO.storeLongs(timestampArray, graphBasename + ".property.committer_timestamp.bin"); BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.committer_timestamp_offset.bin"); } public void writePersonIds() throws IOException { logger.info("Writing author/committer IDs for release + revision"); Object2LongFunction personIdMap = NodeIdMap.loadMph(graphBasename + ".persons.mph"); SwhOrcTable releaseTable = dataset.getTable("release"); SwhOrcTable revisionTable = dataset.getTable("revision"); int[][] personArray = IntBigArrays.newBigArray(numNodes); // Author IDs BigArrays.fill(personArray, -1); releaseTable.readBytes64Column("author", (swhid, personBase64) -> { long id = nodeIdMap.getNodeId(swhid); BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64)); }); revisionTable.readBytes64Column("author", (swhid, personBase64) -> { long id = nodeIdMap.getNodeId(swhid); BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64)); }); BinIO.storeInts(personArray, graphBasename + ".property.author_id.bin"); // Committer IDs BigArrays.fill(personArray, -1); revisionTable.readBytes64Column("committer", (swhid, personBase64) -> { long id = nodeIdMap.getNodeId(swhid); BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64)); }); BinIO.storeInts(personArray, graphBasename + ".property.committer_id.bin"); } public void writeMessages() throws IOException { logger.info("Writing messages for release + revision, and URLs for origins"); long[][] messageOffsetArray = LongBigArrays.newBigArray(numNodes); BigArrays.fill(messageOffsetArray, -1); FastBufferedOutputStream messageStream = new FastBufferedOutputStream( new FileOutputStream(graphBasename + ".property.message.bin")); AtomicLong offset = new AtomicLong(0L); SwhOrcTable releaseTable = dataset.getTable("release"); releaseTable.readBytes64Column("message", (swhid, messageBase64) -> { long id = nodeIdMap.getNodeId(swhid); messageStream.write(messageBase64); messageStream.write('\n'); BigArrays.set(messageOffsetArray, id, offset.longValue()); offset.addAndGet(messageBase64.length + 1); }); SwhOrcTable revisionTable = 
dataset.getTable("revision"); revisionTable.readBytes64Column("message", (swhid, messageBase64) -> { long id = nodeIdMap.getNodeId(swhid); messageStream.write(messageBase64); messageStream.write('\n'); BigArrays.set(messageOffsetArray, id, offset.longValue()); offset.addAndGet(messageBase64.length + 1); }); OriginOrcTable originTable = (OriginOrcTable) dataset.getTable("origin"); originTable.readURLs((swhid, messageBase64) -> { long id = nodeIdMap.getNodeId(swhid); messageStream.write(messageBase64); messageStream.write('\n'); BigArrays.set(messageOffsetArray, id, offset.longValue()); offset.addAndGet(messageBase64.length + 1); }); // TODO: check which one is optimal in terms of memory/disk usage, EF vs mapped file BinIO.storeLongs(messageOffsetArray, graphBasename + ".property.message.offset.bin"); // EliasFanoLongBigList messageOffsetEF = new // EliasFanoLongBigList(LongBigArrayBigList.wrap(messageOffsetArray)); // BinIO.storeObject(messageOffsetEF, graphBasename + ".property.message.offset.bin"); messageStream.close(); } public void writeTagNames() throws IOException { logger.info("Writing tag names for release"); long[][] tagNameOffsetArray = LongBigArrays.newBigArray(numNodes); BigArrays.fill(tagNameOffsetArray, -1); FastBufferedOutputStream tagNameStream = new FastBufferedOutputStream( new FileOutputStream(graphBasename + ".property.tag_name.bin")); AtomicLong offset = new AtomicLong(0L); SwhOrcTable releaseTable = dataset.getTable("release"); releaseTable.readBytes64Column("name", (swhid, tagNameBase64) -> { long id = nodeIdMap.getNodeId(swhid); tagNameStream.write(tagNameBase64); tagNameStream.write('\n'); BigArrays.set(tagNameOffsetArray, id, offset.longValue()); offset.addAndGet(tagNameBase64.length + 1); }); BinIO.storeLongs(tagNameOffsetArray, graphBasename + ".property.tag_name.offset.bin"); // EliasFanoLongBigList tagNameOffsetEF = new // EliasFanoLongBigList(LongBigArrayBigList.wrap(tagNameOffsetArray)); // BinIO.storeObject(tagNameOffsetEF, graphBasename + ".property.tag_name.offset.bin"); tagNameStream.close(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java index bd5459f..9352853 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java @@ -1,249 +1,256 @@ +/* + * Copyright (c) 2019 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.forks; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.Node; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; public class ForkCC { public Boolean includeRootDir; private SwhBidirectionalGraph graph; private Long emptySnapshot; private LongArrayBitVector visited; private LongArrayBitVector whitelist; private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP 
jsap = new SimpleJSAP(ForkCC.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't', "whitelist", "Whitelist of origins"), new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, 'R', "includerootdir", "Include root directory (default: false)"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private static void printDistribution(ArrayList> components) { TreeMap distribution = new TreeMap<>(); for (ArrayList component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry entry : distribution.entrySet()) { System.out.format("%d %d\n", entry.getKey(), entry.getValue()); } } private static void printLargestComponent(ArrayList> components) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList component = components.get(indexLargest); for (Long node : component) { System.out.println(node); } } private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); this.graph = SwhBidirectionalGraph.loadMapped(graphBasename).symmetrize(); System.err.println("Graph loaded."); this.emptySnapshot = null; this.whitelist = null; this.visited = null; this.includeRootDir = null; } private boolean nodeIsEmptySnapshot(Long node) { if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } private Boolean shouldVisit(Long node) { Node.Type nt = this.graph.getNodeType(node); if (nt == Node.Type.CNT) { return false; } if (nt == Node.Type.DIR && !includeRootDir) return false; if (this.nodeIsEmptySnapshot(node)) return false; if (visited.getBoolean(node)) return false; return true; } private ArrayList> compute(ProgressLogger pl) throws IOException { final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 visited = LongArrayBitVector.ofLength(n); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting connected components visit..."); ArrayList> components = new ArrayList<>(); for (long i = 0; i < n; i++) { if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR) continue; ArrayList component = new ArrayList<>(); queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); Node.Type cur_nt = this.graph.getNodeType(currentNode); if (cur_nt == Node.Type.ORI && (this.whitelist == null || 
this.whitelist.getBoolean(currentNode))) { // TODO: add a check that the origin has >=1 non-empty snapshot component.add(currentNode); } final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (!shouldVisit(succ)) continue; if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } if (component.size() > 0) { components.add(component); } } pl.done(); queue.close(); return components; } private static void printDistribution(ArrayList> components, Formatter out) { TreeMap distribution = new TreeMap<>(); for (ArrayList component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry entry : distribution.entrySet()) { out.format("%d %d\n", entry.getKey(), entry.getValue()); } } private static void printLargestComponent(ArrayList> components, Formatter out) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList component = components.get(indexLargest); for (Long node : component) { out.format("%d\n", node); } } private static void printAllComponents(ArrayList> components, Formatter out) { for (int i = 1; i < components.size(); ++i) { ArrayList component = components.get(i); for (Long node : component) { out.format("%d ", node); } out.format("\n"); } } private void parseWhitelist(String path) { System.err.println("Loading whitelist " + path + " ..."); this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes()); Scanner scanner; try { scanner = new Scanner(new File(path)); while (scanner.hasNextLong()) { whitelist.set(scanner.nextLong()); } System.err.println("Whitelist loaded."); } catch (FileNotFoundException e) { e.printStackTrace(); } } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String whitelistPath = config.getString("whitelistPath"); boolean includeRootDir = config.getBoolean("includeRootDir"); String outdirPath = config.getString("outdir"); ForkCC forkCc = new ForkCC(); try { forkCc.load_graph(graphPath); forkCc.includeRootDir = includeRootDir; } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } if (whitelistPath != null) { forkCc.parseWhitelist(whitelistPath); } ProgressLogger logger = new ProgressLogger(); // noinspection ResultOfMethodCallIgnored new File(outdirPath).mkdirs(); try { ArrayList> components = forkCc.compute(logger); printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt")); printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt")); } catch (IOException e) { e.printStackTrace(); } logger.done(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java index aa57ae6..361cce8 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java @@ -1,223 +1,230 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * 
See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.forks; import ch.qos.logback.classic.Level; import ch.qos.logback.classic.Logger; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.Node; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.*; public class ForkCliques { private SwhBidirectionalGraph graph; private LongArrayBitVector whitelist; private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); this.graph = SwhBidirectionalGraph.loadMapped(graphBasename); System.err.println("Graph loaded."); this.whitelist = null; } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ForkCliques.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't', "whitelist", "Whitelist of origins"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private ArrayList dfsAt(Long baseNode) { ArrayList res = new ArrayList<>(); final Deque stack = new ArrayDeque<>(); HashSet seen = new HashSet<>(); stack.push(baseNode); while (!stack.isEmpty()) { final Long currentNode = stack.pop(); final LazyLongIterator iterator = this.graph.predecessors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (!seen.contains(succ)) { Node.Type nt = this.graph.getNodeType(succ); if (nt == Node.Type.DIR || nt == Node.Type.CNT) continue; if (nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(succ))) { res.add(succ); } else { stack.push(succ); seen.add(succ); } } } } Collections.sort(res); return res; } private boolean isBaseRevision(Long node) { if (this.graph.getNodeType(node) != Node.Type.REV) return false; final LazyLongIterator iterator = this.graph.successors(node); long succ; while ((succ = iterator.nextLong()) != -1) { if (this.graph.getNodeType(succ) == Node.Type.REV) return false; } return true; } static private String fingerprint(ArrayList cluster) { MessageDigest digest; try { digest = MessageDigest.getInstance("SHA-256"); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); return null; } for (Long n : cluster) digest.update(Longs.toByteArray(n)); return new String(digest.digest()); } private ArrayList> compute(ProgressLogger pl) { final long n = this.graph.numNodes(); HashSet fingerprints = new HashSet<>(); ArrayList> clusters = new ArrayList<>(); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting topological sort..."); for (long i = 0; i < n; i++) { if (isBaseRevision(i)) { ArrayList currentCluster = dfsAt(i); String clusterFp = fingerprint(currentCluster); if (!fingerprints.contains(clusterFp)) { fingerprints.add(clusterFp); 
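// The SHA-256 fingerprint above is computed over the sorted origin IDs, so two
// base revisions reachable from the same set of origins collapse into a single
// cluster entry.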
                    clusters.add(currentCluster);
                }
            }
            pl.update();
        }
        pl.done();

        return clusters;
    }

    private static void printDistribution(ArrayList<ArrayList<Long>> components, Formatter out) {
        TreeMap<Long, Long> distribution = new TreeMap<>();
        for (ArrayList<Long> component : components) {
            distribution.merge((long) component.size(), 1L, Long::sum);
        }

        for (Map.Entry<Long, Long> entry : distribution.entrySet()) {
            out.format("%d %d\n", entry.getKey(), entry.getValue());
        }
    }

    private static void printLargestComponent(ArrayList<ArrayList<Long>> components, Formatter out) {
        int indexLargest = 0;
        for (int i = 1; i < components.size(); ++i) {
            if (components.get(i).size() > components.get(indexLargest).size())
                indexLargest = i;
        }

        ArrayList<Long> component = components.get(indexLargest);
        for (Long node : component) {
            out.format("%d\n", node);
        }
    }

    private static void printAllComponents(ArrayList<ArrayList<Long>> components, Formatter out) {
        // Start at 0 so the first component is included as well.
        for (int i = 0; i < components.size(); ++i) {
            ArrayList<Long> component = components.get(i);
            for (Long node : component) {
                out.format("%d ", node);
            }
            out.format("\n");
        }
    }

    private void parseWhitelist(String path) {
        System.err.println("Loading whitelist " + path + " ...");
        this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes());
        Scanner scanner;
        try {
            scanner = new Scanner(new File(path));
            while (scanner.hasNextLong()) {
                whitelist.set(scanner.nextLong());
            }
            System.err.println("Whitelist loaded.");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        JSAPResult config = parse_args(args);
        String graphPath = config.getString("graphPath");
        String whitelistPath = config.getString("whitelistPath");
        String outdirPath = config.getString("outdir");

        ForkCliques forkCliques = new ForkCliques();
        try {
            forkCliques.load_graph(graphPath);
        } catch (IOException e) {
            System.out.println("Could not load graph: " + e);
            System.exit(2);
        }
        if (whitelistPath != null) {
            forkCliques.parseWhitelist(whitelistPath);
        }

        Logger rootLogger = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
        rootLogger.setLevel(Level.DEBUG);
        ProgressLogger logger = new ProgressLogger(rootLogger);

        ArrayList<ArrayList<Long>> components = forkCliques.compute(logger);

        // noinspection ResultOfMethodCallIgnored
        new File(outdirPath).mkdirs();
        try {
            printDistribution(components, new Formatter(outdirPath + "/distribution.txt"));
            printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt"));
            printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt"));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        logger.done();
    }
}

diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
index 8389962..5067c28 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
@@ -1,88 +1,95 @@
+/*
+ * Copyright (c) 2019 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
 package org.softwareheritage.graph.experiments.forks;

import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.Node;
import java.io.IOException;
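// ListEmptyOrigins scans every origin node and prints those whose snapshots are
// all empty, i.e. origins none of whose successors have any outgoing edge.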
import java.util.ArrayList;

public class ListEmptyOrigins {
    private SwhBidirectionalGraph graph;
    private Long emptySnapshot;

    private static JSAPResult parse_args(String[] args) {
        JSAPResult config = null;
        try {
            SimpleJSAP jsap = new SimpleJSAP(ListEmptyOrigins.class.getName(), "",
                    new Parameter[]{new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
                            'g', "graph", "Basename of the compressed graph"),});
            config = jsap.parse(args);
            if (jsap.messagePrinted()) {
                System.exit(1);
            }
        } catch (JSAPException e) {
            e.printStackTrace();
        }
        return config;
    }

    public static void main(String[] args) {
        JSAPResult config = parse_args(args);
        String graphPath = config.getString("graphPath");

        ListEmptyOrigins leo = new ListEmptyOrigins();
        try {
            leo.load_graph(graphPath);
        } catch (IOException e) {
            System.out.println("Could not load graph: " + e);
            System.exit(2);
        }

        ArrayList<Long> badlist = leo.compute(leo.graph);
        for (Long bad : badlist) {
            System.out.println(bad);
        }
    }

    private void load_graph(String graphBasename) throws IOException {
        System.err.println("Loading graph " + graphBasename + " ...");
        this.graph = SwhBidirectionalGraph.loadMapped(graphBasename);
        System.err.println("Graph loaded.");
        this.emptySnapshot = null;
    }

    private boolean nodeIsEmptySnapshot(Long node) {
        System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node);
        if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP
                && this.graph.outdegree(node) == 0) {
            System.err.println("Found empty snapshot: " + node);
            this.emptySnapshot = node;
        }
        return node.equals(this.emptySnapshot);
    }

    private ArrayList<Long> compute(ImmutableGraph graph) {
        final long n = graph.numNodes();
        ArrayList<Long> bad = new ArrayList<>();
        for (long i = 0; i < n; i++) {
            Node.Type nt = this.graph.getNodeType(i);
            if (nt != Node.Type.ORI)
                continue;

            final LazyLongIterator iterator = graph.successors(i);
            long succ;
            boolean found = false;
            while ((succ = iterator.nextLong()) != -1) {
                if (this.graph.outdegree(succ) > 0) {
                    found = true;
                }
            }
            if (!found)
                bad.add(i);
        }
        return bad;
    }
}

diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
index 53bcc49..dd8d203 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
@@ -1,188 +1,195 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
 package org.softwareheritage.graph.experiments.topology;

import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.XoRoShiRo128PlusRandom;
import org.softwareheritage.graph.*;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import java.util.concurrent.*;

public class AveragePaths {
    private final SwhBidirectionalGraph graph;
    private final Subgraph subgraph;
    private final ConcurrentHashMap<Long, Long> result;
    private final String outdir;

    public AveragePaths(String graphBasename, String allowedNodes, String
outdir) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); this.graph = SwhBidirectionalGraph.loadMapped(graphBasename); this.subgraph = new Subgraph(this.graph, new AllowedNodes(allowedNodes)); this.outdir = outdir; System.err.println("Graph loaded."); result = new ConcurrentHashMap<>(); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(AveragePaths.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("nodeTypes", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 's', "nodetypes", "Node type constraints"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"), new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "32", JSAP.NOT_REQUIRED, 't', "numthreads", "Number of threads"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private void run(int numThreads) throws InterruptedException { final long END_OF_QUEUE = -1L; ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads); ExecutorService service = Executors.newFixedThreadPool(numThreads + 1); service.submit(() -> { try { SwhBidirectionalGraph thread_graph = graph.copy(); Subgraph thread_subgraph = subgraph.copy(); long[][] randomPerm = Util.identity(thread_graph.numNodes()); LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom()); long n = thread_graph.numNodes(); ProgressLogger pl = new ProgressLogger(); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Filling processor queue..."); for (long j = 0; j < n; ++j) { long node = BigArrays.get(randomPerm, j); if (thread_subgraph.nodeExists(node) && thread_subgraph.outdegree(node) == 0) { queue.put(node); } if (j % 10000 == 0) { printResult(); } pl.update(); } } catch (Exception e) { e.printStackTrace(); } finally { for (int i = 0; i < numThreads; ++i) { try { queue.put(END_OF_QUEUE); } catch (InterruptedException e) { e.printStackTrace(); } } } }); for (int i = 0; i < numThreads; ++i) { service.submit(() -> { try { Subgraph thread_subgraph = subgraph.copy(); while (true) { Long node = null; try { node = queue.take(); } catch (InterruptedException e) { e.printStackTrace(); } if (node == null || node == END_OF_QUEUE) { return; } bfsAt(thread_subgraph, node); } } catch (Exception e) { e.printStackTrace(); } }); } service.shutdown(); service.awaitTermination(365, TimeUnit.DAYS); } private void bfsAt(Subgraph graph, long srcNodeId) { ArrayDeque queue = new ArrayDeque<>(); HashSet visited = new HashSet<>(); long FRONTIER_MARKER = -1; queue.addLast(srcNodeId); visited.add(srcNodeId); long distance = 0; queue.addLast(FRONTIER_MARKER); while (!queue.isEmpty()) { long currentNodeId = queue.removeFirst(); // System.err.println("curr: " + currentNodeId); if (currentNodeId == FRONTIER_MARKER) { if (queue.isEmpty()) // avoid infinite loops break; ++distance; queue.addLast(FRONTIER_MARKER); continue; } if (graph.indegree(currentNodeId) == 0) { result.merge(distance, 1L, Long::sum); } LazyLongIterator it = graph.predecessors(currentNodeId); for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { if (!visited.contains(neighborNodeId)) { queue.addLast(neighborNodeId); visited.add(neighborNodeId); } } } } public void printResult() throws IOException { new 
File(outdir).mkdirs(); PrintWriter f = new PrintWriter(new FileWriter(outdir + "/distribution.txt")); TreeMap sortedDistribution = new TreeMap<>(result); for (Map.Entry entry : sortedDistribution.entrySet()) { f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String outdir = config.getString("outdir"); String allowedNodes = config.getString("nodeTypes"); int numThreads = config.getInt("numThreads"); AveragePaths tp = new AveragePaths(graphPath, allowedNodes, outdir); tp.run(numThreads); tp.printResult(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java index 0564463..9195560 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java @@ -1,325 +1,332 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.util.XoRoShiRo128PlusRandom; import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.Node; import java.io.*; import java.util.*; import java.util.concurrent.*; public class ClusteringCoefficient { private final SwhBidirectionalGraph graph; private final String outdirPath; private final ConcurrentHashMap result_full; private final ConcurrentHashMap result_dircnt; private final ConcurrentHashMap result_rev; private final ConcurrentHashMap result_revrel; private final ConcurrentHashMap result_orisnp; public ClusteringCoefficient(String graphBasename, String outdirPath) throws IOException { this.outdirPath = outdirPath; System.err.println("Loading graph " + graphBasename + " ..."); SwhBidirectionalGraph directedGraph = SwhBidirectionalGraph.loadMapped(graphBasename); this.graph = directedGraph.symmetrize(); System.err.println("Graph loaded."); result_full = new ConcurrentHashMap<>(); result_dircnt = new ConcurrentHashMap<>(); result_rev = new ConcurrentHashMap<>(); result_revrel = new ConcurrentHashMap<>(); result_orisnp = new ConcurrentHashMap<>(); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(AveragePaths.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"), new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "32", JSAP.NOT_REQUIRED, 't', "numthreads", "Number of threads"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { 
e.printStackTrace(); } return config; } private void run(int numThreads) throws InterruptedException { final long END_OF_QUEUE = -1L; ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads); ExecutorService service = Executors.newFixedThreadPool(numThreads + 1); service.submit(() -> { try { SwhBidirectionalGraph thread_graph = graph.copy(); long[][] randomPerm = Util.identity(thread_graph.numNodes()); LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom()); long n = thread_graph.numNodes(); ProgressLogger pl = new ProgressLogger(); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Filling processor queue..."); for (long j = 0; j < n; ++j) { long node = BigArrays.get(randomPerm, j); queue.put(node); if (j % 10000 == 0) { printResult(); } pl.update(); } } catch (Exception e) { e.printStackTrace(); } finally { for (int i = 0; i < numThreads; ++i) { try { queue.put(END_OF_QUEUE); } catch (InterruptedException e) { e.printStackTrace(); } } } }); for (int i = 0; i < numThreads; ++i) { service.submit(() -> { try { SwhBidirectionalGraph thread_graph = graph.copy(); while (true) { Long node = null; try { node = queue.take(); } catch (InterruptedException e) { e.printStackTrace(); } if (node == null || node == END_OF_QUEUE) { return; } computeAt(thread_graph, node); } } catch (Exception e) { e.printStackTrace(); } }); } service.shutdown(); service.awaitTermination(365, TimeUnit.DAYS); } private void computeAt(SwhBidirectionalGraph graph, long node) { long d = graph.outdegree(node); if (d < 2) { return; } Node.Type nodeType = graph.getNodeType(node); HashSet neighborhood = new HashSet<>(); long succ; final LazyLongIterator iterator = graph.successors(node); while ((succ = iterator.nextLong()) != -1) { neighborhood.add(succ); } long triangles_full = 0; long triangles_dircnt = 0; long triangles_rev = 0; long triangles_revrel = 0; long triangles_orisnp = 0; for (Long neighbor : neighborhood) { Node.Type neighborNodeType = graph.getNodeType(neighbor); final LazyLongIterator it = graph.successors(neighbor); while ((succ = it.nextLong()) != -1) { if (neighborhood.contains(succ)) { Node.Type succNodeType = graph.getNodeType(succ); triangles_full++; if ((nodeType == Node.Type.DIR || nodeType == Node.Type.CNT) && (neighborNodeType == Node.Type.DIR || neighborNodeType == Node.Type.CNT) && (succNodeType == Node.Type.DIR || succNodeType == Node.Type.CNT)) { triangles_dircnt++; } else if ((nodeType == Node.Type.REV || nodeType == Node.Type.REL) && (neighborNodeType == Node.Type.REV || neighborNodeType == Node.Type.REL) && (succNodeType == Node.Type.REV || succNodeType == Node.Type.REL)) { triangles_revrel++; if (nodeType == Node.Type.REV && neighborNodeType == Node.Type.REV && succNodeType == Node.Type.REV) triangles_rev++; } else if ((nodeType == Node.Type.ORI || nodeType == Node.Type.SNP) && (neighborNodeType == Node.Type.ORI || neighborNodeType == Node.Type.SNP) && (succNodeType == Node.Type.ORI || succNodeType == Node.Type.SNP)) { triangles_orisnp++; } } } } result_full.merge(triangles_full, 1L, Long::sum); result_dircnt.merge(triangles_dircnt, 1L, Long::sum); result_rev.merge(triangles_rev, 1L, Long::sum); result_revrel.merge(triangles_revrel, 1L, Long::sum); result_orisnp.merge(triangles_orisnp, 1L, Long::sum); } public void printSortedDistribution(String distribPath, Map distrib) throws IOException { PrintWriter f = new PrintWriter(new FileWriter(distribPath)); TreeMap sortedDistribution = new TreeMap<>(distrib); for (Map.Entry entry : sortedDistribution.entrySet()) { 
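// One "<value> <count>" line per distinct key, in increasing key order.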
f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } public void printResult() throws IOException { new File(outdirPath).mkdirs(); printSortedDistribution(outdirPath + "/distribution-full.txt", result_full); printSortedDistribution(outdirPath + "/distribution-dircnt.txt", result_dircnt); printSortedDistribution(outdirPath + "/distribution-rev.txt", result_rev); printSortedDistribution(outdirPath + "/distribution-relrev.txt", result_revrel); printSortedDistribution(outdirPath + "/distribution-orisnp.txt", result_orisnp); } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String outdir = config.getString("outdir"); int numThreads = config.getInt("numThreads"); ClusteringCoefficient cc = new ClusteringCoefficient(graphPath, outdir); cc.run(numThreads); cc.printResult(); } // Old unused functions private long oppositeEdges(ImmutableGraph graph, long node) { HashSet neighborhood = new HashSet<>(); long succ; final LazyLongIterator iterator = graph.successors(node); while ((succ = iterator.nextLong()) != -1) { neighborhood.add(succ); } long closed_triplets = 0; for (Long neighbor : neighborhood) { final LazyLongIterator it = graph.successors(neighbor); while ((succ = it.nextLong()) != -1) { if (neighborhood.contains(succ)) { closed_triplets++; } } } return closed_triplets; } public void compute(ProgressLogger pl, Formatter out_local, Formatter out_global) { final long n = this.graph.numNodes(); pl.expectedUpdates = n; pl.itemsName = "nodes"; long nodes_d2 = 0; double cum_lcc = 0; double cum_lcc_c0 = 0; double cum_lcc_c1 = 0; HashMap distribution = new HashMap<>(); for (long node = 0; node < n; node++) { long d = graph.outdegree(node); if (d >= 2) { double t = (d * (d - 1)); double m = oppositeEdges(graph, node); double lcc = m / t; distribution.merge(lcc, 1L, Long::sum); cum_lcc += lcc; nodes_d2++; } else { cum_lcc_c1++; } pl.update(); } pl.done(); for (Map.Entry entry : distribution.entrySet()) { out_local.format("%f %d\n", entry.getKey(), entry.getValue()); } double gC = cum_lcc / nodes_d2; double gC0 = cum_lcc_c0 / n; double gC1 = cum_lcc_c1 / n; out_global.format("C: %f\n", gC); out_global.format("C0: %f\n", gC0); out_global.format("C1: %f\n", gC1); } public void compute_approx(Formatter out_global) { final long n = this.graph.numNodes(); long trials = 0; long triangles = 0; while (true) { long node = ThreadLocalRandom.current().nextLong(0, n); long d = graph.outdegree(node); if (d >= 2) { Long u = null; Long v = null; long u_idx = ThreadLocalRandom.current().nextLong(0, d); long v_idx = ThreadLocalRandom.current().nextLong(0, d - 1); if (v_idx >= u_idx) { v_idx++; } long succ; final LazyLongIterator node_iterator = graph.successors(node); for (long succ_idx = 0; (succ = node_iterator.nextLong()) != -1; succ_idx++) { if (succ_idx == u_idx) { u = succ; } if (succ_idx == v_idx) { v = succ; } } final LazyLongIterator u_iterator = graph.successors(u); while ((succ = u_iterator.nextLong()) != -1) { if (succ == v) triangles++; } } trials++; if (trials % 100 == 0 || true) { double gC = (double) triangles / (double) trials; out_global.format("C: %f (triangles: %d, trials: %d)\n", gC, triangles, trials); System.out.format("C: %f (triangles: %d, trials: %d)\n", gC, triangles, trials); } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java 
b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java index b351869..b6f6072 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java @@ -1,200 +1,207 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; import org.softwareheritage.graph.AllowedNodes; import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.Subgraph; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.*; public class ConnectedComponents { private Subgraph graph; private void load_graph(String graphBasename, String nodeTypes) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); var underlyingGraph = SwhBidirectionalGraph.loadMapped(graphBasename); var underlyingGraphSym = underlyingGraph.symmetrize(); graph = new Subgraph(underlyingGraphSym, new AllowedNodes(nodeTypes)); System.err.println("Graph loaded."); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ConnectedComponents.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"), new Switch("byOrigins", JSAP.NO_SHORTFLAG, "by-origins", "Compute size of components by number of origins"), new FlaggedOption("nodeTypes", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n', "nodetypes", "Allowed node types (comma-separated)"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private HashMap /* ArrayList> */ compute(ProgressLogger pl, boolean byOrigin) throws IOException { final long n = graph.numNodes(); final long maxN = graph.maxNodeNumber(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * maxN); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 LongArrayBitVector visited = LongArrayBitVector.ofLength(maxN); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting connected components visit..."); // ArrayList> components = new ArrayList<>(); HashMap componentDistribution = new HashMap<>(); var it = graph.nodeIterator(); while (it.hasNext()) { long i = it.nextLong(); 
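// Every not-yet-visited node seeds a fresh BFS, and everything that BFS reaches forms one
// connected component. Frontier entries are node ids serialized to Long.BYTES bytes
// (Longs.toByteArray), which is what lets ByteDiskQueue spill the frontier to disk and
// explains the 8L * maxN buffer sizing above.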
if (visited.getBoolean(i)) continue; // ArrayList<Long> component = new ArrayList<>(); long componentNodes = 0; queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); // component.add(currentNode); if (!byOrigin || graph.getNodeType(currentNode) == Node.Type.ORI) componentNodes += 1; final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (visited.getBoolean(succ)) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } /* * if (component.size() > 0) { components.add(component); } */ if (componentNodes > 0) componentDistribution.merge(componentNodes, 1L, Long::sum); } pl.done(); // return components; return componentDistribution; } private static void printDistribution(ArrayList<ArrayList<Long>> components, Formatter out) { TreeMap<Long, Long> distribution = new TreeMap<>(); for (ArrayList<Long> component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry<Long, Long> entry : distribution.entrySet()) { out.format("%d %d\n", entry.getKey(), entry.getValue()); } out.close(); } private static void printLargestComponent(ArrayList<ArrayList<Long>> components, Formatter out) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList<Long> component = components.get(indexLargest); for (Long node : component) { out.format("%d\n", node); } out.close(); } private static void printAllComponents(ArrayList<ArrayList<Long>> components, Formatter out) { for (int i = 0; i < components.size(); ++i) { ArrayList<Long> component = components.get(i); for (Long node : component) { out.format("%d ", node); } out.format("\n"); } out.close(); } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String outdirPath = config.getString("outdir"); String nodeTypes = config.getString("nodeTypes"); boolean byOrigin = config.getBoolean("byOrigins"); ConnectedComponents connectedComponents = new ConnectedComponents(); try { connectedComponents.load_graph(graphPath, nodeTypes); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } ProgressLogger logger = new ProgressLogger(); // noinspection ResultOfMethodCallIgnored new File(outdirPath).mkdirs(); try { // ArrayList<ArrayList<Long>> components = connectedComponents.compute(logger); // components.sort(Comparator.comparing(ArrayList::size).reversed()); // printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); // printLargestComponent(components, new Formatter(outdirPath + "/largest_component.txt")); // printAllComponents(components, new Formatter(outdirPath + "/all_components.txt")); HashMap<Long, Long> componentDistribution = connectedComponents.compute(logger, byOrigin); PrintWriter f = new PrintWriter(new FileWriter(outdirPath + "/distribution.txt")); TreeMap<Long, Long> sortedDistribution = new TreeMap<>(componentDistribution); for (Map.Entry<Long, Long> entry : sortedDistribution.entrySet()) { f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } catch (IOException e) { e.printStackTrace(); } logger.done(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java index a74a31b..54e53cb 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java +++
b/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java @@ -1,239 +1,246 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.lang.reflect.InvocationTargetException; import java.util.*; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.UnflaggedOption; import it.unimi.dsi.logging.ProgressLogger; import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.Node; public class InOutDegree { private InOutDegree() { } private static final int NODE_ARRAY_SIZE = Node.Type.values().length + 1; private static final int TYPE_ALL = Node.Type.values().length; private static final int TYPE_CNT = Node.Type.toInt(Node.Type.CNT); private static final int TYPE_DIR = Node.Type.toInt(Node.Type.DIR); private static final int TYPE_REV = Node.Type.toInt(Node.Type.REV); private static final int TYPE_REL = Node.Type.toInt(Node.Type.REL); private static final int TYPE_SNP = Node.Type.toInt(Node.Type.SNP); private static final int TYPE_ORI = Node.Type.toInt(Node.Type.ORI); public static long[] outdegreeTypes(final SwhBidirectionalGraph graph, long node) { long[] out = new long[NODE_ARRAY_SIZE]; var successors = graph.successors(node); long neighbor; while ((neighbor = successors.nextLong()) != -1) { out[Node.Type.toInt(graph.getNodeType(neighbor))]++; out[TYPE_ALL]++; } return out; } public static long[] indegreeTypes(final SwhBidirectionalGraph graph, long node) { return outdegreeTypes(graph.transpose(), node); } public static void writeDistribution(HashMap distribution, String filename) throws IOException { PrintWriter f = new PrintWriter(new FileWriter(filename)); TreeMap sortedDistribution = new TreeMap<>(distribution); for (Map.Entry entry : sortedDistribution.entrySet()) { f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } public static void run(final SwhBidirectionalGraph graph, String resultsDir) throws IOException { // Per-type var cnt_in_dir = new HashMap(); var dir_in_dir = new HashMap(); var dir_in_rev = new HashMap(); var dir_in_all = new HashMap(); var dir_out_all = new HashMap(); var dir_out_dir = new HashMap(); var dir_out_cnt = new HashMap(); var dir_out_rev = new HashMap(); var rev_in_dir = new HashMap(); var rev_in_rel = new HashMap(); var rev_in_rev = new HashMap(); var rev_in_snp = new HashMap(); var rev_in_all = new HashMap(); var rev_out_rev = new HashMap(); var rel_in_snp = new HashMap(); var snp_in_ori = new HashMap(); var snp_out_all = new HashMap(); var snp_out_rel = new HashMap(); var snp_out_rev = new HashMap(); var ori_out_snp = new HashMap(); // Aggregated per layer var full_in = new HashMap(); var full_out = new HashMap(); var dircnt_in = new HashMap(); var dircnt_out = new HashMap(); var orisnp_in = new HashMap(); var orisnp_out = new HashMap(); var relrev_in = new HashMap(); var relrev_out = new HashMap(); var rev_in = rev_in_rev; // alias for single-type layer var rev_out = rev_out_rev; final ProgressLogger pl = new 
ProgressLogger(); pl.itemsName = "nodes"; pl.expectedUpdates = graph.numNodes(); pl.start("Scanning..."); long[] in; long[] out; for (long i = graph.numNodes(); i-- != 0;) { long d_in = graph.indegree(i); long d_out = graph.outdegree(i); full_in.merge(d_in, 1L, Long::sum); full_out.merge(d_out, 1L, Long::sum); switch (graph.getNodeType(i)) { case CNT: cnt_in_dir.merge(d_in, 1L, Long::sum); dircnt_in.merge(d_in, 1L, Long::sum); dircnt_out.merge(0L, 1L, Long::sum); break; case DIR: in = indegreeTypes(graph, i); out = outdegreeTypes(graph, i); dir_in_all.merge(in[TYPE_ALL], 1L, Long::sum); dir_out_all.merge(out[TYPE_ALL], 1L, Long::sum); dir_in_dir.merge(in[TYPE_DIR], 1L, Long::sum); dir_in_rev.merge(in[TYPE_REV], 1L, Long::sum); dir_out_cnt.merge(out[TYPE_CNT], 1L, Long::sum); dir_out_dir.merge(out[TYPE_DIR], 1L, Long::sum); dir_out_rev.merge(out[TYPE_REV], 1L, Long::sum); dircnt_in.merge(in[TYPE_DIR] + in[TYPE_CNT], 1L, Long::sum); dircnt_out.merge(out[TYPE_DIR] + out[TYPE_CNT], 1L, Long::sum); break; case REV: in = indegreeTypes(graph, i); out = outdegreeTypes(graph, i); rev_in_all.merge(in[TYPE_ALL], 1L, Long::sum); rev_in_dir.merge(in[TYPE_DIR], 1L, Long::sum); rev_in_rev.merge(in[TYPE_REV], 1L, Long::sum); rev_in_rel.merge(in[TYPE_REL], 1L, Long::sum); rev_in_snp.merge(in[TYPE_SNP], 1L, Long::sum); rev_out_rev.merge(out[TYPE_REV], 1L, Long::sum); relrev_in.merge(in[TYPE_REL] + in[TYPE_REV], 1L, Long::sum); relrev_out.merge(out[TYPE_REL] + out[TYPE_REV], 1L, Long::sum); break; case REL: rel_in_snp.merge(d_in, 1L, Long::sum); relrev_in.merge(0L, 1L, Long::sum); relrev_out.merge(d_out, 1L, Long::sum); break; case SNP: out = outdegreeTypes(graph, i); snp_in_ori.merge(d_in, 1L, Long::sum); snp_out_all.merge(out[TYPE_ALL], 1L, Long::sum); snp_out_rel.merge(out[TYPE_REL], 1L, Long::sum); snp_out_rev.merge(out[TYPE_REV], 1L, Long::sum); orisnp_in.merge(d_in, 1L, Long::sum); orisnp_out.merge(out[TYPE_REL] + out[TYPE_REV], 1L, Long::sum); break; case ORI: ori_out_snp.merge(d_out, 1L, Long::sum); orisnp_in.merge(0L, 1L, Long::sum); orisnp_out.merge(d_out, 1L, Long::sum); break; default : pl.logger().warn("Invalid node type at pos {}", i); break; } pl.update(); } pl.done(); (new File(resultsDir)).mkdir(); writeDistribution(full_in, resultsDir + "/full_in.txt"); writeDistribution(full_out, resultsDir + "/full_out.txt"); writeDistribution(dircnt_in, resultsDir + "/dir+cnt_in.txt"); writeDistribution(dircnt_out, resultsDir + "/dir+cnt_out.txt"); writeDistribution(relrev_in, resultsDir + "/rel+rev_in.txt"); writeDistribution(relrev_out, resultsDir + "/rel+rev_out.txt"); writeDistribution(orisnp_in, resultsDir + "/ori+snp_in.txt"); writeDistribution(orisnp_out, resultsDir + "/ori+snp_out.txt"); writeDistribution(rev_in, resultsDir + "/rev_in.txt"); writeDistribution(rev_out, resultsDir + "/rev_out.txt"); String resultsTypeDir = resultsDir + "/per_type"; (new File(resultsTypeDir)).mkdir(); writeDistribution(cnt_in_dir, resultsTypeDir + "/cnt_in_dir.txt"); writeDistribution(dir_in_dir, resultsTypeDir + "/dir_in_dir.txt"); writeDistribution(dir_in_rev, resultsTypeDir + "/dir_in_rev.txt"); writeDistribution(dir_in_all, resultsTypeDir + "/dir_in_all.txt"); writeDistribution(dir_out_all, resultsTypeDir + "/dir_out_all.txt"); writeDistribution(dir_out_dir, resultsTypeDir + "/dir_out_dir.txt"); writeDistribution(dir_out_cnt, resultsTypeDir + "/dir_out_cnt.txt"); writeDistribution(dir_out_rev, resultsTypeDir + "/dir_out_rev.txt"); writeDistribution(rev_in_dir, resultsTypeDir + "/rev_in_dir.txt"); 
writeDistribution(rev_in_rel, resultsTypeDir + "/rev_in_rel.txt"); writeDistribution(rev_in_rev, resultsTypeDir + "/rev_in_rev.txt"); writeDistribution(rev_in_snp, resultsTypeDir + "/rev_in_snp.txt"); writeDistribution(rev_in_all, resultsTypeDir + "/rev_in_all.txt"); writeDistribution(rev_out_rev, resultsTypeDir + "/rev_out_rev.txt"); writeDistribution(rel_in_snp, resultsTypeDir + "/rel_in_snp.txt"); writeDistribution(snp_in_ori, resultsTypeDir + "/snp_in_ori.txt"); writeDistribution(snp_out_all, resultsTypeDir + "/snp_out_all.txt"); writeDistribution(snp_out_rel, resultsTypeDir + "/snp_out_rel.txt"); writeDistribution(snp_out_rev, resultsTypeDir + "/snp_out_rev.txt"); writeDistribution(ori_out_snp, resultsTypeDir + "/ori_out_snp.txt"); } static public void main(final String[] arg) throws IllegalArgumentException, SecurityException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, JSAPException, IOException, ClassNotFoundException { final SimpleJSAP jsap = new SimpleJSAP(InOutDegree.class.getName(), "Computes in and out degrees of the given SWHGraph", new Parameter[]{ new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the graph."), new UnflaggedOption("resultsDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The directory of the resulting files."),}); final JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) System.exit(1); final String basename = jsapResult.getString("basename"); final String resultsDir = jsapResult.userSpecified("resultsDir") ? jsapResult.getString("resultsDir") : basename; final ProgressLogger pl = new ProgressLogger(); SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(basename); run(graph, resultsDir); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java index 3632d32..3f55826 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java @@ -1,98 +1,105 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.util.XoRoShiRo128PlusRandom; import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.experiments.forks.ForkCC; import java.io.*; public class SubdatasetSizeFunction { private SubdatasetSizeFunction() { } public static void run(final SwhBidirectionalGraph graph) throws IOException { final ProgressLogger pl = new ProgressLogger(); pl.itemsName = "nodes"; pl.expectedUpdates = graph.numNodes(); long n = graph.numNodes(); LongArrayBitVector visited = 
LongArrayBitVector.ofLength(n); int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; long[][] randomPerm = Util.identity(graph.numNodes()); LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom()); long visitedNodes = 0; long visitedEdges = 0; long visitedOrigins = 0; long visitedContents = 0; pl.start("Running traversal starting from origins..."); for (long j = 0; j < n; ++j) { long i = BigArrays.get(randomPerm, j); if (visited.getBoolean(i) || graph.getNodeType(i) != Node.Type.ORI) { continue; } visitedOrigins++; queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); visitedNodes++; if (graph.getNodeType(currentNode) == Node.Type.CNT) visitedContents++; final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { visitedEdges++; if (visited.getBoolean(succ)) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } if (visitedOrigins % 10000 == 0) System.out.println(visitedNodes + " " + visitedEdges + " " + visitedContents); } pl.done(); } static public void main(final String[] arg) throws IllegalArgumentException, SecurityException, JSAPException, IOException { final SimpleJSAP jsap = new SimpleJSAP(SubdatasetSizeFunction.class.getName(), "Computes subdataset size functions using a random uniform order", new Parameter[]{new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the graph."),}); final JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) System.exit(1); final String basename = jsapResult.getString("basename"); SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(basename); run(graph); } } diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java index 5e4a430..2b30ecf 100644 --- a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java +++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java @@ -1,147 +1,154 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.labels; /** * Directory entries metadata are stored as edge labels on the graph. {@link DirEntry} can be * encoded in a single long type, to re-use Webgraph interface. 
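 * <p>
 * A round-trip sketch (illustrative values): with the six permission types below,
 * {@code NB_BITS_PER_TYPE} is 3, so {@code new DirEntry(5, 0100644).toEncoded()} yields
 * {@code (5 << 3) + 1 == 41}, and {@code new DirEntry(41)} recovers filename id 5 and
 * permission 0100644.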
* * @author The Software Heritage developers */ public class DirEntry { public long filenameId; public int permission; public DirEntry(long filenameId, int permission) { this.filenameId = filenameId; this.permission = permission; } public DirEntry(long dirEntryEncoded) { this.filenameId = labelNameFromEncoded(dirEntryEncoded); this.permission = permissionFromEncoded(dirEntryEncoded); } public static long toEncoded(long filenameId, int permission) { return (filenameId << Permission.NB_BITS_PER_TYPE) + Permission.Type.toEncoded(permission); } public static long labelNameFromEncoded(long labelEncoded) { return labelEncoded >> Permission.NB_BITS_PER_TYPE; } public static int permissionFromEncoded(long labelEncoded) { int dirBytes = (int) (labelEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1)); return Permission.Type.fromEncoded(dirBytes); } public long toEncoded() { return toEncoded(filenameId, permission); } public static int labelWidth(long numLabels) { int filenameIdWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); if (filenameIdWidth > Long.SIZE - Permission.NB_BITS_PER_TYPE) { System.err.println("FIXME: Too many filenames, we can't handle more than 2^" + (Long.SIZE - Permission.NB_BITS_PER_TYPE) + " for now."); System.exit(2); } return filenameIdWidth + Permission.NB_BITS_PER_TYPE; } /** * Permission types present in the Software Heritage graph. * * @author The Software Heritage developers */ private static class Permission { public static final int NB_BITS_PER_TYPE = (int) Math .ceil(Math.log(Permission.Type.values().length) / Math.log(2)); public enum Type { NONE, CONTENT, EXECUTABLE_CONTENT, SYMLINK, DIRECTORY, REVISION; public static Permission.Type fromIntCode(int intCode) { switch (intCode) { case 0: return NONE; case 1: return CONTENT; case 2: return EXECUTABLE_CONTENT; case 3: return SYMLINK; case 4: return DIRECTORY; case 5: return REVISION; } throw new IllegalArgumentException("Unknown node permission code: " + intCode); } public static int toIntCode(Permission.Type type) { switch (type) { case NONE: return 0; case CONTENT: return 1; case EXECUTABLE_CONTENT: return 2; case SYMLINK: return 3; case DIRECTORY: return 4; case REVISION: return 5; } throw new IllegalArgumentException("Unknown node permission type: " + type); } public static Permission.Type fromIntPerm(int intPerm) { switch (intPerm) { case 0: return NONE; case 0100644: return CONTENT; case 0100755: return EXECUTABLE_CONTENT; case 0120000: return SYMLINK; case 0040000: return DIRECTORY; case 0160000: return REVISION; default : return NONE; } // throw new IllegalArgumentException("Unknown node permission: " + intPerm); // TODO: warning here instead? 
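// For instance fromIntPerm(0100755) yields EXECUTABLE_CONTENT, while unrecognized git
// modes currently fall back to NONE through the default case above instead of throwing.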
} public static int toIntPerm(Permission.Type type) { switch (type) { case NONE: return 0; case CONTENT: return 0100644; case EXECUTABLE_CONTENT: return 0100755; case SYMLINK: return 0120000; case DIRECTORY: return 0040000; case REVISION: return 0160000; } throw new IllegalArgumentException("Unknown node permission type: " + type); } public static int fromEncoded(int encoded) { return toIntPerm(fromIntCode(encoded)); } public static int toEncoded(int permission) { return toIntCode(fromIntPerm(permission)); } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java index c84cfec..f1a2c18 100644 --- a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java +++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java @@ -1,110 +1,117 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.labels; import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; import it.unimi.dsi.big.webgraph.labelling.FixedWidthLongListLabel; import it.unimi.dsi.big.webgraph.labelling.Label; import it.unimi.dsi.io.InputBitStream; import it.unimi.dsi.io.OutputBitStream; import java.io.IOException; import java.util.Arrays; /** * Software Heritage graph edge labels following Webgraph labels convention. * * @author The Software Heritage developers */ public class SwhLabel extends AbstractLabel { private final String key; private final int width; // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry // to store branch names) public DirEntry[] value; // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long private final FixedWidthLongListLabel longList; private static final DirEntry[] EMPTY_ARRAY = {}; public SwhLabel(String key, int width, DirEntry[] value) { this.key = key; this.width = width; this.value = value; long[] valueEncoded = new long[value.length]; for (int i = 0; i < value.length; i++) valueEncoded[i] = value[i].toEncoded(); this.longList = new FixedWidthLongListLabel(key, width, valueEncoded); } public SwhLabel(String key, int width) { this(key, width, EMPTY_ARRAY); } public SwhLabel(String... 
arg) { this(arg[0], Integer.parseInt(arg[1])); } @Override public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { int ret = longList.fromBitStream(inputBitStream, sourceUnused); // Decode values from their internal long representation value = new DirEntry[longList.value.length]; for (int i = 0; i < value.length; i++) value[i] = new DirEntry(longList.value[i]); return ret; } @Override public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { // Values have already been encoded in the SwhLabel constructor return longList.toBitStream(outputBitStream, sourceUnused); } @Override public String wellKnownAttributeKey() { return key; } @Override public String[] attributeKeys() { return new String[]{key}; } @Override public Class[] attributeTypes() { return new Class[]{DirEntry[].class}; } @Override public Object get(String s) { if (this.key.equals(s)) return value; throw new IllegalArgumentException(); } @Override public Object get() { return value; } @Override public Label copy() { return new SwhLabel(key, width, value.clone()); } @Override public int fixedWidth() { return -1; } @Override public String toString() { return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; } @Override public String toSpec() { return this.getClass().getName() + "(" + key + "," + width + ")"; } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java index 7ca8c77..fb65937 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java @@ -1,189 +1,196 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.maps; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.bytes.ByteBigList; import it.unimi.dsi.fastutil.bytes.ByteMappedBigList; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.longs.LongMappedBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.compress.NodeMapBuilder; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.charset.StandardCharsets; /** * Mapping between internal long node id and external SWHID. *

* The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using * an mmap()-ed .order file containing the graph permutation. * * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the * {@link NodeMapBuilder} class, then it is loaded here using mmap(). * * @author The Software Heritage developers * @see NodeMapBuilder */ public class NodeIdMap implements Size64 { /** Fixed length of binary SWHID buffer */ public static final int SWHID_BIN_SIZE = 22; /** File extension for the long node id to SWHID map */ public static final String NODE_TO_SWHID = ".node2swhid.bin"; /** Graph path and basename */ String graphPath; /** mmap()-ed NODE_TO_SWHID file */ ByteBigList nodeToSwhMap; /** Minimal perfect hash (MPH) function SWHID -> initial order */ Object2LongFunction mph; /** mmap()-ed long list with the permutation initial order -> graph order */ LongBigList orderMap; /** * Constructor. * * @param graphPath full graph path */ public NodeIdMap(String graphPath) throws IOException { this.graphPath = graphPath; // node -> SWHID try (RandomAccessFile raf = new RandomAccessFile(graphPath + NODE_TO_SWHID, "r")) { this.nodeToSwhMap = ByteMappedBigList.map(raf.getChannel()); } // SWHID -> node this.mph = loadMph(graphPath + ".mph"); try (RandomAccessFile mapFile = new RandomAccessFile(new File(graphPath + ".order"), "r")) { this.orderMap = LongMappedBigList.map(mapFile.getChannel()); } } @SuppressWarnings("unchecked") public static Object2LongFunction loadMph(String path) throws IOException { Object obj; try { obj = BinIO.loadObject(path); } catch (ClassNotFoundException e) { throw new IOException(e.getMessage()); } Object2LongFunction res = (Object2LongFunction) obj; // Backward-compatibility for old maps parametrized with . // New maps should be parametrized with , which is faster. try { // Try to call it with bytes, will fail if it's a O2LF. res.getLong("42".getBytes(StandardCharsets.UTF_8)); } catch (ClassCastException e) { class StringCompatibleByteFunction implements Object2LongFunction, Size64 { private final Object2LongFunction legacyFunction; public StringCompatibleByteFunction(Object2LongFunction legacyFunction) { this.legacyFunction = legacyFunction; } @Override public long getLong(Object o) { byte[] bi = (byte[]) o; return legacyFunction.getLong(new String(bi, StandardCharsets.UTF_8)); } @SuppressWarnings("deprecation") @Override public int size() { return legacyFunction.size(); } @Override public long size64() { return (legacyFunction instanceof Size64) ? ((Size64) legacyFunction).size64() : legacyFunction.size(); } } Object2LongFunction mphLegacy = (Object2LongFunction) obj; return new StringCompatibleByteFunction(mphLegacy); } // End of backward-compatibility block return res; } /** * Converts byte-form SWHID to corresponding long node id. Low-level function, does not check if the * SWHID is valid. * * @param swhid node represented as bytes * @return corresponding node as a long id */ public long getNodeId(byte[] swhid) { // 1. Hash the SWHID with the MPH to get its original ID long origNodeId = mph.getLong(swhid); // 2. Use the order permutation to get the position in the permuted graph return this.orderMap.getLong(origNodeId); } /** * Converts SWHID to corresponding long node id. * * @param swhid node represented as a {@link SWHID} * @param checkExists if true, error if the SWHID is not present in the graph, if false the check * will be skipped and invalid data will be returned for non-existing SWHIDs. 
* @return corresponding node as a long id * @see SWHID */ public long getNodeId(SWHID swhid, boolean checkExists) { // Convert the SWHID to bytes and call getNodeId() long nodeId = getNodeId(swhid.toString().getBytes(StandardCharsets.US_ASCII)); // Check that the position effectively corresponds to a real node using the reverse map. // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid. if (!checkExists || getSWHID(nodeId).equals(swhid)) { return nodeId; } else { throw new IllegalArgumentException("Unknown SWHID: " + swhid); } } public long getNodeId(SWHID swhid) { return getNodeId(swhid, true); } /** * Converts a node long id to corresponding SWHID. * * @param nodeId node as a long id * @return corresponding node as a {@link SWHID} * @see SWHID */ public SWHID getSWHID(long nodeId) { /* * Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's * swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid */ if (nodeId < 0 || nodeId >= nodeToSwhMap.size64()) { throw new IllegalArgumentException( "Node id " + nodeId + " should be between 0 and " + nodeToSwhMap.size64()); } byte[] swhid = new byte[SWHID_BIN_SIZE]; nodeToSwhMap.getElements(nodeId * SWHID_BIN_SIZE, swhid, 0, SWHID_BIN_SIZE); return SWHID.fromBytes(swhid); } /** Return the number of nodes in the map. */ @Override public long size64() { return nodeToSwhMap.size64(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java index d3da61d..3332607 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java @@ -1,55 +1,62 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.maps; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; import org.softwareheritage.graph.Node; import java.io.IOException; /** * Mapping between long node id and SWH node type as described in the * data model. *
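 * (With the six SWH node types, the bitmap described below needs only ceil(log2(6)) = 3
 * bits per node.)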

* The type mapping is pre-computed and dumped on disk in the * {@link org.softwareheritage.graph.compress.NodeMapBuilder} class, then it is loaded in memory * here using a fastutil LongBigList. To be * space-efficient, the mapping is stored as a bitmap using the minimum number of bits per * {@link Node.Type}. * * @author The Software Heritage developers */ public class NodeTypesMap { /** File extension for the long node id to node type map */ public static final String NODE_TO_TYPE = ".node2type.map"; /** * Array storing for each node its type */ public LongBigList nodeTypesMap; /** * Constructor. * * @param graphPath path and basename of the compressed graph */ public NodeTypesMap(String graphPath) throws IOException { try { nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + NODE_TO_TYPE); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Unknown class object: " + e); } } /** * Returns node type from a node long id. * * @param nodeId node as a long id * @return corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public Node.Type getType(long nodeId) { long type = nodeTypesMap.getLong(nodeId); return Node.Type.fromInt((int) type); } } diff --git a/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java b/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java index 64acfba..470f6da 100644 --- a/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java +++ b/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java @@ -1,293 +1,300 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import com.google.protobuf.FieldMask; import com.martiansoftware.jsap.*; import io.grpc.Server; import io.grpc.Status; import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder; import io.grpc.netty.shaded.io.netty.channel.ChannelOption; import io.grpc.stub.StreamObserver; import io.grpc.protobuf.services.ProtoReflectionService; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.compress.LabelMapBuilder; import java.io.FileInputStream; import java.io.IOException; import java.util.Properties; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; /** * Server that manages startup/shutdown of the SWH graph {@code TraversalService} gRPC server. */ public class GraphServer { private final static Logger logger = LoggerFactory.getLogger(GraphServer.class); private final SwhBidirectionalGraph graph; private final int port; private final int threads; private Server server; /** * @param graphBasename the basename of the SWH graph to load * @param port the port on which the GRPC server will listen * @param threads the number of threads to use in the server threadpool */ public GraphServer(String graphBasename, int port, int threads) throws IOException { this.graph = loadGraph(graphBasename); this.port = port; this.threads = threads; } /** Load a graph and all its properties.
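 * A usage sketch (the path is hypothetical):
 *
 * <pre>{@code
 * SwhBidirectionalGraph g = GraphServer.loadGraph("/srv/graph/latest/graph");
 * }</pre>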
*/ public static SwhBidirectionalGraph loadGraph(String basename) throws IOException { SwhBidirectionalGraph g = SwhBidirectionalGraph.loadLabelledMapped(basename, new ProgressLogger(logger)); g.loadContentLength(); g.loadContentIsSkipped(); g.loadPersonIds(); g.loadAuthorTimestamps(); g.loadCommitterTimestamps(); g.loadMessages(); g.loadTagNames(); g.loadLabelNames(); return g; } /** Start the RPC server. */ private void start() throws IOException { server = NettyServerBuilder.forPort(port).withChildOption(ChannelOption.SO_REUSEADDR, true) .executor(Executors.newFixedThreadPool(threads)).addService(new TraversalService(graph)) .addService(ProtoReflectionService.newInstance()).build().start(); logger.info("Server started, listening on " + port); Runtime.getRuntime().addShutdownHook(new Thread(() -> { try { GraphServer.this.stop(); } catch (InterruptedException e) { e.printStackTrace(System.err); } })); } private void stop() throws InterruptedException { if (server != null) { server.shutdown().awaitTermination(30, TimeUnit.SECONDS); } } /** * Await termination on the main thread since the grpc library uses daemon threads. */ private void blockUntilShutdown() throws InterruptedException { if (server != null) { server.awaitTermination(); } } private static JSAPResult parseArgs(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(GraphServer.class.getName(), "", new Parameter[]{ new FlaggedOption("port", JSAP.INTEGER_PARSER, "50091", JSAP.NOT_REQUIRED, 'p', "port", "The port on which the server should listen."), new FlaggedOption("threads", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 't', "threads", "The number of concurrent threads. 0 = number of cores."), new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the compressed graph to serve")}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } /** Main launches the server from the command line. */ public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parseArgs(args); String graphBasename = config.getString("graphBasename"); int port = config.getInt("port"); int threads = config.getInt("threads"); if (threads == 0) { threads = Runtime.getRuntime().availableProcessors(); } final GraphServer server = new GraphServer(graphBasename, port, threads); server.start(); server.blockUntilShutdown(); } /** Implementation of the Traversal service, which contains all the graph querying endpoints. */ static class TraversalService extends TraversalServiceGrpc.TraversalServiceImplBase { SwhBidirectionalGraph graph; public TraversalService(SwhBidirectionalGraph graph) { this.graph = graph; } /** Return various statistics on the overall graph.
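 * Most figures are parsed back from the {@code .properties} and {@code .stats} files
 * sitting next to the compressed graph.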
*/ @Override public void stats(StatsRequest request, StreamObserver responseObserver) { StatsResponse.Builder response = StatsResponse.newBuilder(); response.setNumNodes(graph.numNodes()); response.setNumEdges(graph.numArcs()); Properties properties = new Properties(); try { properties.load(new FileInputStream(graph.getPath() + ".properties")); properties.load(new FileInputStream(graph.getPath() + ".stats")); } catch (IOException e) { throw new RuntimeException(e); } response.setCompressionRatio(Double.parseDouble(properties.getProperty("compratio"))); response.setBitsPerNode(Double.parseDouble(properties.getProperty("bitspernode"))); response.setBitsPerEdge(Double.parseDouble(properties.getProperty("bitsperlink"))); response.setAvgLocality(Double.parseDouble(properties.getProperty("avglocality"))); response.setIndegreeMin(Long.parseLong(properties.getProperty("minindegree"))); response.setIndegreeMax(Long.parseLong(properties.getProperty("maxindegree"))); response.setIndegreeAvg(Double.parseDouble(properties.getProperty("avgindegree"))); response.setOutdegreeMin(Long.parseLong(properties.getProperty("minoutdegree"))); response.setOutdegreeMax(Long.parseLong(properties.getProperty("maxoutdegree"))); response.setOutdegreeAvg(Double.parseDouble(properties.getProperty("avgoutdegree"))); responseObserver.onNext(response.build()); responseObserver.onCompleted(); } /** Return a single node and its properties. */ @Override public void getNode(GetNodeRequest request, StreamObserver responseObserver) { SwhBidirectionalGraph g = graph.copy(); long nodeId; try { nodeId = g.getNodeId(new SWHID(request.getSwhid())); } catch (IllegalArgumentException e) { responseObserver .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); return; } Node.Builder builder = Node.newBuilder(); NodePropertyBuilder.buildNodeProperties(g.getForwardGraph(), request.hasMask() ? request.getMask() : null, builder, nodeId); responseObserver.onNext(builder.build()); responseObserver.onCompleted(); } /** Perform a BFS traversal from a set of source nodes and stream the nodes encountered. */ @Override public void traverse(TraversalRequest request, StreamObserver responseObserver) { SwhBidirectionalGraph g = graph.copy(); Traversal.SimpleTraversal t; try { t = new Traversal.SimpleTraversal(g, request, responseObserver::onNext); } catch (IllegalArgumentException e) { responseObserver .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); return; } t.visit(); responseObserver.onCompleted(); } /** * Find the shortest path between a set of source nodes and a node that matches a given criteria * using a BFS. */ @Override public void findPathTo(FindPathToRequest request, StreamObserver responseObserver) { SwhBidirectionalGraph g = graph.copy(); Traversal.FindPathTo t; try { t = new Traversal.FindPathTo(g, request); } catch (IllegalArgumentException e) { responseObserver .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); return; } t.visit(); Path path = t.getPath(); if (path == null) { responseObserver.onError(Status.NOT_FOUND.asException()); } else { responseObserver.onNext(path); responseObserver.onCompleted(); } } /** * Find the shortest path between a set of source nodes and a set of destination nodes using a * bidirectional BFS. 
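 * By default the two searches walk the graph in opposite directions and stop at the
 * first node visited by both; see {@link Traversal.FindPathBetween} for the details.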
*/ @Override public void findPathBetween(FindPathBetweenRequest request, StreamObserver responseObserver) { SwhBidirectionalGraph g = graph.copy(); Traversal.FindPathBetween t; try { t = new Traversal.FindPathBetween(g, request); } catch (IllegalArgumentException e) { responseObserver .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); return; } t.visit(); Path path = t.getPath(); if (path == null) { responseObserver.onError(Status.NOT_FOUND.asException()); } else { responseObserver.onNext(path); responseObserver.onCompleted(); } } /** Return the number of nodes traversed by a BFS traversal. */ @Override public void countNodes(TraversalRequest request, StreamObserver responseObserver) { AtomicLong count = new AtomicLong(0); SwhBidirectionalGraph g = graph.copy(); TraversalRequest fixedReq = TraversalRequest.newBuilder(request) // Ignore return fields, just count nodes .setMask(FieldMask.getDefaultInstance()).build(); Traversal.SimpleTraversal t; try { t = new Traversal.SimpleTraversal(g, fixedReq, n -> count.incrementAndGet()); } catch (IllegalArgumentException e) { responseObserver .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); return; } t.visit(); CountResponse response = CountResponse.newBuilder().setCount(count.get()).build(); responseObserver.onNext(response); responseObserver.onCompleted(); } /** Return the number of edges traversed by a BFS traversal. */ @Override public void countEdges(TraversalRequest request, StreamObserver responseObserver) { AtomicLong count = new AtomicLong(0); SwhBidirectionalGraph g = graph.copy(); TraversalRequest fixedReq = TraversalRequest.newBuilder(request) // Force return empty successors to count the edges .setMask(FieldMask.newBuilder().addPaths("num_successors").build()).build(); Traversal.SimpleTraversal t; try { t = new Traversal.SimpleTraversal(g, fixedReq, n -> count.addAndGet(n.getNumSuccessors())); } catch (IllegalArgumentException e) { responseObserver .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); return; } t.visit(); CountResponse response = CountResponse.newBuilder().setCount(count.get()).build(); responseObserver.onNext(response); responseObserver.onCompleted(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java b/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java index 5b5bf8e..bbdf4fa 100644 --- a/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java +++ b/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java @@ -1,526 +1,533 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import it.unimi.dsi.big.webgraph.labelling.Label; import org.softwareheritage.graph.*; import java.util.*; /** Traversal contains all the algorithms used for graph traversals */ public class Traversal { /** * Wrapper around g.successors(), only follows edges that are allowed by the given * {@link AllowedEdges} object. 
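 * For example, with an edge restriction of {@code "rev:rev,rev:dir"}, only
 * revision-to-revision and revision-to-directory arcs are followed.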
*/ private static ArcLabelledNodeIterator.LabelledArcIterator filterLabelledSuccessors(SwhUnidirectionalGraph g, long nodeId, AllowedEdges allowedEdges) { if (allowedEdges.restrictedTo == null) { // All edges are allowed, bypass edge check return g.labelledSuccessors(nodeId); } else { ArcLabelledNodeIterator.LabelledArcIterator allSuccessors = g.labelledSuccessors(nodeId); return new ArcLabelledNodeIterator.LabelledArcIterator() { @Override public Label label() { return allSuccessors.label(); } @Override public long nextLong() { long neighbor; while ((neighbor = allSuccessors.nextLong()) != -1) { if (allowedEdges.isAllowed(g.getNodeType(nodeId), g.getNodeType(neighbor))) { return neighbor; } } return -1; } @Override public long skip(final long n) { long i = 0; while (i < n && nextLong() != -1) i++; return i; } }; } } /** Helper class to check that a given node is "valid" for some given {@link NodeFilter} */ private static class NodeFilterChecker { private final SwhUnidirectionalGraph g; private final NodeFilter filter; private final AllowedNodes allowedNodes; private NodeFilterChecker(SwhUnidirectionalGraph graph, NodeFilter filter) { this.g = graph; this.filter = filter; this.allowedNodes = new AllowedNodes(filter.hasTypes() ? filter.getTypes() : "*"); } public boolean allowed(long nodeId) { if (filter == null) { return true; } if (!this.allowedNodes.isAllowed(g.getNodeType(nodeId))) { return false; } return true; } } /** Returns the unidirectional graph from a bidirectional graph and a {@link GraphDirection}. */ public static SwhUnidirectionalGraph getDirectedGraph(SwhBidirectionalGraph g, GraphDirection direction) { switch (direction) { case FORWARD: return g.getForwardGraph(); case BACKWARD: return g.getBackwardGraph(); /* * TODO: add support for BOTH case BOTH: return new SwhUnidirectionalGraph(g.symmetrize(), * g.getProperties()); */ default : throw new IllegalArgumentException("Unknown direction: " + direction); } } /** Returns the opposite of a given {@link GraphDirection} (equivalent to a graph transposition). */ public static GraphDirection reverseDirection(GraphDirection direction) { switch (direction) { case FORWARD: return GraphDirection.BACKWARD; case BACKWARD: return GraphDirection.FORWARD; /* * TODO: add support for BOTH case BOTH: return GraphDirection.BOTH; */ default : throw new IllegalArgumentException("Unknown direction: " + direction); } } /** Dummy exception to short-circuit and interrupt a graph traversal. */ static class StopTraversalException extends RuntimeException { } /** Generic BFS traversal algorithm. */ static class BFSVisitor { /** The graph to traverse. */ protected final SwhUnidirectionalGraph g; /** Depth of the node currently being visited */ protected long depth = 0; /** * Number of traversal successors (i.e., successors that will be considered by the traversal) of the * node currently being visited */ protected long traversalSuccessors = 0; /** Number of edges accessed since the beginning of the traversal */ protected long edgesAccessed = 0; /** * Map from a node ID to its parent node ID. The key set can be used as the set of all visited * nodes. */ protected HashMap parents = new HashMap<>(); /** Queue of nodes to visit (also called "frontier", "open set", "wavefront" etc.) */ protected ArrayDeque queue = new ArrayDeque<>(); /** If > 0, the maximum depth of the traversal. */ private long maxDepth = -1; /** If > 0, the maximum number of edges to traverse. 
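 * The limit is checked against the running sum of the outdegrees of the visited nodes.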
*/ private long maxEdges = -1; BFSVisitor(SwhUnidirectionalGraph g) { this.g = g; } /** Add a new source node to the initial queue. */ public void addSource(long nodeId) { queue.add(nodeId); parents.put(nodeId, -1L); } /** Set the maximum depth of the traversal. */ public void setMaxDepth(long depth) { maxDepth = depth; } /** Set the maximum number of edges to traverse. */ public void setMaxEdges(long edges) { maxEdges = edges; } /** Setup the visit counters and depth sentinel. */ public void visitSetup() { edgesAccessed = 0; depth = 0; queue.add(-1L); // depth sentinel } /** Perform the visit */ public void visit() { visitSetup(); while (!queue.isEmpty()) { visitStep(); } } /** Single "step" of a visit. Advance the frontier of exactly one node. */ public void visitStep() { try { assert !queue.isEmpty(); long curr = queue.poll(); if (curr == -1L) { ++depth; if (!queue.isEmpty()) { queue.add(-1L); visitStep(); } return; } if (maxDepth >= 0 && depth > maxDepth) { throw new StopTraversalException(); } edgesAccessed += g.outdegree(curr); if (maxEdges >= 0 && edgesAccessed > maxEdges) { throw new StopTraversalException(); } visitNode(curr); } catch (StopTraversalException e) { // Traversal is over, clear the to-do queue. queue.clear(); } } /** * Get the successors of a node. Override this function if you want to filter which successors are * considered during the traversal. */ protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { return g.labelledSuccessors(nodeId); } /** Visit a node. Override to do additional processing on the node. */ protected void visitNode(long node) { ArcLabelledNodeIterator.LabelledArcIterator it = getSuccessors(node); traversalSuccessors = 0; for (long succ; (succ = it.nextLong()) != -1;) { traversalSuccessors++; visitEdge(node, succ, it.label()); } } /** Visit an edge. Override to do additional processing on the edge. */ protected void visitEdge(long src, long dst, Label label) { if (!parents.containsKey(dst)) { queue.add(dst); parents.put(dst, src); } } } /** * SimpleTraversal is used by the Traverse endpoint. It extends BFSVisitor with additional * processing, notably related to graph properties and filters. */ static class SimpleTraversal extends BFSVisitor { private final NodeFilterChecker nodeReturnChecker; private final AllowedEdges allowedEdges; private final TraversalRequest request; private final NodePropertyBuilder.NodeDataMask nodeDataMask; private final NodeObserver nodeObserver; private Node.Builder nodeBuilder; SimpleTraversal(SwhBidirectionalGraph bidirectionalGraph, TraversalRequest request, NodeObserver nodeObserver) { super(getDirectedGraph(bidirectionalGraph, request.getDirection())); this.request = request; this.nodeObserver = nodeObserver; this.nodeReturnChecker = new NodeFilterChecker(g, request.getReturnNodes()); this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null); this.allowedEdges = new AllowedEdges(request.hasEdges() ? 
request.getEdges() : "*"); request.getSrcList().forEach(srcSwhid -> { long srcNodeId = g.getNodeId(new SWHID(srcSwhid)); addSource(srcNodeId); }); if (request.hasMaxDepth()) { setMaxDepth(request.getMaxDepth()); } if (request.hasMaxEdges()) { setMaxEdges(request.getMaxEdges()); } } @Override protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { return filterLabelledSuccessors(g, nodeId, allowedEdges); } @Override public void visitNode(long node) { nodeBuilder = null; if (nodeReturnChecker.allowed(node) && (!request.hasMinDepth() || depth >= request.getMinDepth())) { nodeBuilder = Node.newBuilder(); NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, node); } super.visitNode(node); if (request.getReturnNodes().hasMinTraversalSuccessors() && traversalSuccessors < request.getReturnNodes().getMinTraversalSuccessors() || request.getReturnNodes().hasMaxTraversalSuccessors() && traversalSuccessors > request.getReturnNodes().getMaxTraversalSuccessors()) { nodeBuilder = null; } if (nodeBuilder != null) { nodeObserver.onNext(nodeBuilder.build()); } } @Override protected void visitEdge(long src, long dst, Label label) { super.visitEdge(src, dst, label); NodePropertyBuilder.buildSuccessorProperties(g, nodeDataMask, nodeBuilder, src, dst, label); } } /** * FindPathTo searches for a path from a source node to a node matching given criteria. It extends * BFSVisitor with additional processing, and makes the traversal stop as soon as a node matching * the given criteria is found. */ static class FindPathTo extends BFSVisitor { private final AllowedEdges allowedEdges; private final FindPathToRequest request; private final NodePropertyBuilder.NodeDataMask nodeDataMask; private final NodeFilterChecker targetChecker; private Long targetNode = null; FindPathTo(SwhBidirectionalGraph bidirectionalGraph, FindPathToRequest request) { super(getDirectedGraph(bidirectionalGraph, request.getDirection())); this.request = request; this.targetChecker = new NodeFilterChecker(g, request.getTarget()); this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null); this.allowedEdges = new AllowedEdges(request.hasEdges() ? request.getEdges() : "*"); if (request.hasMaxDepth()) { setMaxDepth(request.getMaxDepth()); } if (request.hasMaxEdges()) { setMaxEdges(request.getMaxEdges()); } request.getSrcList().forEach(srcSwhid -> { long srcNodeId = g.getNodeId(new SWHID(srcSwhid)); addSource(srcNodeId); }); } @Override protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { return filterLabelledSuccessors(g, nodeId, allowedEdges); } @Override public void visitNode(long node) { if (targetChecker.allowed(node)) { targetNode = node; throw new StopTraversalException(); } super.visitNode(node); } /** * Once the visit has been performed and a matching node has been found, return the shortest path * from the source set to that node. To do so, we need to backtrack the parents of the node until we * find one of the source nodes (whose parent is -1). */ public Path getPath() { if (targetNode == null) { return null; // No path found.
} /* Backtrack from targetNode to a source node */ long curNode = targetNode; ArrayList path = new ArrayList<>(); while (curNode != -1) { path.add(curNode); curNode = parents.get(curNode); } Collections.reverse(path); /* Enrich path with node properties */ Path.Builder pathBuilder = Path.newBuilder(); for (long nodeId : path) { Node.Builder nodeBuilder = Node.newBuilder(); NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, nodeId); pathBuilder.addNode(nodeBuilder.build()); } return pathBuilder.build(); } } /** * FindPathBetween searches for a shortest path between a set of source nodes and a set of * destination nodes. * * It does so by performing a *bidirectional breadth-first search*, i.e., two parallel breadth-first * searches, one from the source set ("src-BFS") and one from the destination set ("dst-BFS"), until * both searches find a common node that joins their visited sets. This node is called the "midpoint * node". The path returned is the path src -> ... -> midpoint -> ... -> dst, which is always a * shortest path between src and dst. * * The graph direction of both BFS can be configured separately. By default, the dst-BFS will use * the graph in the opposite direction than the src-BFS (if direction = FORWARD, by default * direction_reverse = BACKWARD, and vice-versa). The default behavior is thus to search for a * shortest path between two nodes in a given direction. However, one can also specify FORWARD or * BACKWARD for *both* the src-BFS and the dst-BFS. This will search for a common descendant or a * common ancestor between the two sets, respectively. These will be the midpoints of the returned * path. */ static class FindPathBetween extends BFSVisitor { private final FindPathBetweenRequest request; private final NodePropertyBuilder.NodeDataMask nodeDataMask; private final AllowedEdges allowedEdgesSrc; private final AllowedEdges allowedEdgesDst; private final BFSVisitor srcVisitor; private final BFSVisitor dstVisitor; private Long middleNode = null; FindPathBetween(SwhBidirectionalGraph bidirectionalGraph, FindPathBetweenRequest request) { super(getDirectedGraph(bidirectionalGraph, request.getDirection())); this.request = request; this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null); GraphDirection direction = request.getDirection(); // if direction_reverse is not specified, use the opposite direction of direction GraphDirection directionReverse = request.hasDirectionReverse() ? request.getDirectionReverse() : reverseDirection(request.getDirection()); SwhUnidirectionalGraph srcGraph = getDirectedGraph(bidirectionalGraph, direction); SwhUnidirectionalGraph dstGraph = getDirectedGraph(bidirectionalGraph, directionReverse); this.allowedEdgesSrc = new AllowedEdges(request.hasEdges() ? request.getEdges() : "*"); /* * If edges_reverse is not specified: - If `edges` is not specified either, defaults to "*" - If * direction == direction_reverse, defaults to `edges` - If direction != direction_reverse, defaults * to the reverse of `edges` (e.g. "rev:dir" becomes "dir:rev"). */ this.allowedEdgesDst = request.hasEdgesReverse() ? new AllowedEdges(request.getEdgesReverse()) : (request.hasEdges() ? (direction == directionReverse ? new AllowedEdges(request.getEdges()) : new AllowedEdges(request.getEdges()).reverse()) : new AllowedEdges("*")); /* * Source sub-visitor. Aborts as soon as it finds a node already visited by the destination * sub-visitor. 
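Note: the backtracking in getPath relies on the invariant set in addSource: every source maps to -1 in the parents map, so the walk terminates there. A minimal sketch of that reconstruction step, assuming the same convention:

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;

public class BacktrackSketch {
    /** Walk parent pointers from target back to a source (parent == -1), then reverse. */
    static List<Long> pathTo(HashMap<Long, Long> parents, long target) {
        ArrayList<Long> path = new ArrayList<>();
        Long cur = target;
        while (cur != null && cur != -1) {
            path.add(cur);
            cur = parents.get(cur);
        }
        Collections.reverse(path);
        return path; // source first, target last
    }
}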
*/ this.srcVisitor = new BFSVisitor(srcGraph) { @Override protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { return filterLabelledSuccessors(g, nodeId, allowedEdgesSrc); } @Override public void visitNode(long node) { if (dstVisitor.parents.containsKey(node)) { middleNode = node; throw new StopTraversalException(); } super.visitNode(node); } }; /* * Destination sub-visitor. Aborts as soon as it finds a node already visited by the source * sub-visitor. */ this.dstVisitor = new BFSVisitor(dstGraph) { @Override protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { return filterLabelledSuccessors(g, nodeId, allowedEdgesDst); } @Override public void visitNode(long node) { if (srcVisitor.parents.containsKey(node)) { middleNode = node; throw new StopTraversalException(); } super.visitNode(node); } }; if (request.hasMaxDepth()) { this.srcVisitor.setMaxDepth(request.getMaxDepth()); this.dstVisitor.setMaxDepth(request.getMaxDepth()); } if (request.hasMaxEdges()) { this.srcVisitor.setMaxEdges(request.getMaxEdges()); this.dstVisitor.setMaxEdges(request.getMaxEdges()); } request.getSrcList().forEach(srcSwhid -> { long srcNodeId = g.getNodeId(new SWHID(srcSwhid)); srcVisitor.addSource(srcNodeId); }); request.getDstList().forEach(srcSwhid -> { long srcNodeId = g.getNodeId(new SWHID(srcSwhid)); dstVisitor.addSource(srcNodeId); }); } @Override public void visit() { /* * Bidirectional BFS: maintain two sub-visitors, and alternately run a visit step in each of them. */ srcVisitor.visitSetup(); dstVisitor.visitSetup(); while (!srcVisitor.queue.isEmpty() || !dstVisitor.queue.isEmpty()) { if (!srcVisitor.queue.isEmpty()) { srcVisitor.visitStep(); } if (!dstVisitor.queue.isEmpty()) { dstVisitor.visitStep(); } } } public Path getPath() { if (middleNode == null) { return null; // No path found. 
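Note: alternating one visitStep between the two sub-visitors keeps the frontiers balanced, and the first node seen by both sides becomes the midpoint from which the full path is stitched. A toy sketch of the alternation and the stitching over plain adjacency maps (illustrative names, not the service API):

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class BidiBfsSketch {
    /** Advance one search by one node; return it if the other search already saw it. */
    static Long step(Map<Long, List<Long>> adj, ArrayDeque<Long> queue, HashMap<Long, Long> parents,
            HashMap<Long, Long> otherParents) {
        if (queue.isEmpty())
            return null;
        long curr = queue.poll();
        if (otherParents.containsKey(curr))
            return curr; // meeting point: visited by both searches
        for (long succ : adj.getOrDefault(curr, List.of())) {
            if (!parents.containsKey(succ)) {
                parents.put(succ, curr);
                queue.add(succ);
            }
        }
        return null;
    }

    /** Stitch src -> ... -> mid -> ... -> dst out of the two parents maps. */
    static List<Long> stitch(HashMap<Long, Long> srcParents, HashMap<Long, Long> dstParents, long mid) {
        ArrayList<Long> path = new ArrayList<>();
        for (Long cur = mid; cur != null && cur != -1; cur = srcParents.get(cur))
            path.add(cur);
        Collections.reverse(path); // now src ... mid
        for (Long cur = dstParents.get(mid); cur != null && cur != -1; cur = dstParents.get(cur))
            path.add(cur); // append the dst side, mid excluded
        return path;
    }

    static List<Long> findPath(Map<Long, List<Long>> fwd, Map<Long, List<Long>> bwd, long src, long dst) {
        HashMap<Long, Long> srcParents = new HashMap<>(Map.of(src, -1L));
        HashMap<Long, Long> dstParents = new HashMap<>(Map.of(dst, -1L));
        ArrayDeque<Long> srcQueue = new ArrayDeque<>(List.of(src));
        ArrayDeque<Long> dstQueue = new ArrayDeque<>(List.of(dst));
        while (!srcQueue.isEmpty() || !dstQueue.isEmpty()) {
            Long mid = step(fwd, srcQueue, srcParents, dstParents); // one step of the src-BFS
            if (mid == null)
                mid = step(bwd, dstQueue, dstParents, srcParents); // one step of the dst-BFS
            if (mid != null)
                return stitch(srcParents, dstParents, mid);
        }
        return null; // no path
    }
}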
} Path.Builder pathBuilder = Path.newBuilder(); ArrayList path = new ArrayList<>(); /* First section of the path: src -> midpoint */ long curNode = middleNode; while (curNode != -1) { path.add(curNode); curNode = srcVisitor.parents.get(curNode); } pathBuilder.setMidpointIndex(path.size() - 1); Collections.reverse(path); /* Second section of the path: midpoint -> dst */ curNode = dstVisitor.parents.get(middleNode); while (curNode != -1) { path.add(curNode); curNode = dstVisitor.parents.get(curNode); } /* Enrich path with node properties */ for (long nodeId : path) { Node.Builder nodeBuilder = Node.newBuilder(); NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, nodeId); pathBuilder.addNode(nodeBuilder.build()); } return pathBuilder.build(); } } public interface NodeObserver { void onNext(Node nodeId); } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java index 7b02d76..4f1eda7 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java @@ -1,91 +1,98 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.SwhUnidirectionalGraph; import org.softwareheritage.graph.labels.DirEntry; import java.io.IOException; import java.util.concurrent.TimeUnit; public class DumpProperties { final static Logger logger = LoggerFactory.getLogger(DumpProperties.class); public static void main(String[] args) throws IOException { String graphPath = args[0]; ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS); SwhUnidirectionalGraph graph; if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) { graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl); } else { graph = SwhUnidirectionalGraph.loadLabelled(graphPath, pl); } graph.loadContentLength(); graph.loadContentIsSkipped(); graph.loadPersonIds(); graph.loadAuthorTimestamps(); graph.loadCommitterTimestamps(); graph.loadMessages(); graph.loadTagNames(); graph.loadLabelNames(); ArcLabelledNodeIterator it = graph.labelledNodeIterator(); while (it.hasNext()) { long node = it.nextLong(); System.out.format("%s: %s\n", node, graph.getSWHID(node)); var s = it.successors(); System.out.println(" successors:"); for (long succ; (succ = s.nextLong()) >= 0;) { DirEntry[] labels = (DirEntry[]) s.label().get(); if (labels.length > 0) { for (DirEntry label : labels) { System.out.format(" %s %s [perms: %s]\n", graph.getSWHID(succ), new String(graph.getLabelName(label.filenameId)), label.permission); } } else { System.out.format(" %s\n", graph.getSWHID(succ)); } } switch (graph.getNodeType(node)) { case CNT: System.out.format(" length: %s\n", graph.getContentLength(node)); System.out.format(" is_skipped: %s\n", graph.isContentSkipped(node)); break; case REV: System.out.format(" author: %s\n", graph.getAuthorId(node)); System.out.format(" committer: %s\n", graph.getCommitterId(node)); System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node), 
graph.getAuthorTimestampOffset(node)); System.out.format(" committer_date: %s (offset: %s)\n", graph.getCommitterTimestamp(node), graph.getCommitterTimestampOffset(node)); byte[] msg = graph.getMessage(node); if (msg != null) { System.out.format(" message: %s\n", (new String(msg)).replace("\n", "\\n")); } break; case REL: System.out.format(" author: %s\n", graph.getAuthorId(node)); System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node), graph.getAuthorTimestampOffset(node)); byte[] tagMsg = graph.getMessage(node); if (tagMsg != null) { System.out.format(" message: %s\n", (new String(tagMsg)).replace("\n", "\\n")); } byte[] tagName = graph.getTagName(node); if (tagName != null) { System.out.format(" tag_name: %s\n", (new String(tagName))); } break; case ORI: System.out.format(" url: %s\n", graph.getUrl(node)); } System.out.println(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java index 0f09ccd..a4e017b 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java @@ -1,76 +1,83 @@ +/* + * Copyright (c) 2021 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import com.google.common.primitives.Longs; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.experiments.topology.ConnectedComponents; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class ExportSubdataset { public static void main(String[] args) throws IOException, ClassNotFoundException { System.err.print("Loading everything..."); String graphPath = args[0]; SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(graphPath); Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph"); System.err.println(" done."); final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 LongArrayBitVector visited = LongArrayBitVector.ofLength(n); FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator lineIterator = new LineIterator(buffer); while (lineIterator.hasNext()) { String line = lineIterator.next().toString(); long i; try { // i = mphMap.getLong(line.getBytes(StandardCharsets.UTF_8)); i = graph.getNodeId(new SWHID(line)); }
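Note: ByteDiskQueue stores raw bytes, which is why node IDs are round-tripped through Guava's Longs on enqueue (and through byteBuf on dequeue). A tiny sketch of just that encoding step, with an in-memory deque standing in for the disk-backed queue:

import com.google.common.primitives.Longs;
import java.util.ArrayDeque;

public class ByteQueueRoundTrip {
    public static void main(String[] args) {
        // Stand-in for ByteDiskQueue: the real queue spills raw bytes to disk.
        ArrayDeque<byte[]> queue = new ArrayDeque<>();
        long nodeId = 123_456_789L;
        queue.add(Longs.toByteArray(nodeId)); // enqueue as 8 big-endian bytes
        long decoded = Longs.fromByteArray(queue.poll()); // decode on dequeue
        System.out.println(decoded == nodeId); // true
    }
}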
catch (IllegalArgumentException e) { continue; } queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); SWHID currentNodeSWHID = graph.getSWHID(currentNode); final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { System.out.format("%s %s\n", currentNodeSWHID, graph.getSWHID(succ)); if (visited.getBoolean(succ)) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java b/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java index 3623bb0..bf59b6f 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java @@ -1,113 +1,120 @@ +/* + * Copyright (c) 2021 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.*; import java.io.IOException; import java.time.Duration; import java.util.HashSet; import java.util.Scanner; import java.util.Stack; /* sample invocation on granet.internal.softwareheritage.org for benchmarking * purposes, with the main swh-graph service already running: * * $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-0.3.0.jar -Xmx300G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.FindEarliestRevision --timing /dev/shm/swh-graph/default/graph * */ public class FindEarliestRevision { public static void main(String[] args) throws IOException, ClassNotFoundException { String graphPath = args[0]; boolean timing = false; long ts, elapsedNanos; Duration elapsed; if (args.length >= 2 && (args[0].equals("-t") || args[0].equals("--timing"))) { timing = true; graphPath = args[1]; System.err.println("started with timing option, will keep track of elapsed time"); } System.err.println("loading transposed graph..."); ts = System.nanoTime(); SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(graphPath).transpose(); elapsed = Duration.ofNanos(System.nanoTime() - ts); System.err.println(String.format("transposed graph loaded (duration: %s).", elapsed)); System.err.println("loading revision timestamps..."); ts = System.nanoTime(); graph.loadCommitterTimestamps(); elapsed = Duration.ofNanos(System.nanoTime() - ts); System.err.println(String.format("revision timestamps loaded (duration: %s).", elapsed)); Scanner stdin = new Scanner(System.in); AllowedEdges edges = new AllowedEdges("cnt:dir,dir:dir,dir:rev"); String rawSWHID = null; SWHID srcSWHID = null; long lineCount = 0; long srcNodeId = -1; if (timing) { System.err.println("starting SWHID processing..."); elapsed = Duration.ZERO; } while (stdin.hasNextLine()) { if (timing) ts = System.nanoTime(); rawSWHID = stdin.nextLine().strip(); lineCount++; try { srcSWHID = new SWHID(rawSWHID); srcNodeId = graph.getNodeId(srcSWHID); } catch (IllegalArgumentException e) { System.err .println(String.format("skipping invalid or unknown SWHID %s on line %d", rawSWHID, lineCount)); continue; } if 
(timing) System.err.println("starting traversal for: " + srcSWHID.toString()); Stack stack = new Stack<>(); HashSet visited = new HashSet<>(); stack.push(srcNodeId); visited.add(srcNodeId); long minRevId = -1; long minTimestamp = Long.MAX_VALUE; while (!stack.isEmpty()) { long currentNodeId = stack.pop(); if (graph.getNodeType(currentNodeId) == Node.Type.REV) { long committerTs = graph.getCommitterTimestamp(currentNodeId); if (committerTs < minTimestamp) { minRevId = currentNodeId; minTimestamp = committerTs; } } LazyLongIterator it = graph.successors(currentNodeId); for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { if (!edges.isAllowed(graph.getNodeType(currentNodeId), graph.getNodeType(neighborNodeId))) { continue; } if (!visited.contains(neighborNodeId)) { stack.push(neighborNodeId); visited.add(neighborNodeId); } } } if (minRevId == -1) { System.err.println("no revision found containing: " + srcSWHID.toString()); } else { System.out.println(srcSWHID.toString() + "\t" + graph.getSWHID(minRevId).toString()); } if (timing) { elapsedNanos = System.nanoTime() - ts; // processing time for current SWHID elapsed = elapsed.plus(Duration.ofNanos(elapsedNanos)); // cumulative processing time for all SWHIDs System.err.printf("visit time (s):\t%.6f\n", (double) elapsedNanos / 1_000_000_000); } } if (timing) System.err.printf("processed %d SWHIDs in %s (%s avg)\n", lineCount, elapsed, elapsed.dividedBy(lineCount)); } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java index d316047..dadaa51 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java @@ -1,197 +1,204 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import it.unimi.dsi.fastutil.BigArrays; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.ForkJoinTask; import java.util.concurrent.RecursiveAction; public class ForkJoinBigQuickSort2 extends RecursiveAction { private static final long serialVersionUID = 1L; private final long from; private final long to; private final long[][] x, y; private static final int QUICKSORT_NO_REC = 16; private static final int PARALLEL_QUICKSORT_NO_FORK = 8192; private static final int QUICKSORT_MEDIAN_OF_9 = 128; public ForkJoinBigQuickSort2(final long[][] x, final long[][] y, final long from, final long to) { this.from = from; this.to = to; this.x = x; this.y = y; } @Override protected void compute() { final long[][] x = this.x; final long[][] y = this.y; final long len = to - from; if (len < PARALLEL_QUICKSORT_NO_FORK) { quickSort(x, y, from, to); return; } // Choose a partition element, v long m = from + len / 2; long l = from; long n = to - 1; long s = len / 8; l = med3(x, y, l, l + s, l + 2 * s); m = med3(x, y, m - s, m, m + s); n = med3(x, y, n - 2 * s, n - s, n); m = med3(x, y, l, m, n); final long xm = BigArrays.get(x, m), ym = BigArrays.get(y, m); // Establish Invariant: v* (v)* v* long a = from, b = a, c = to - 1, d = c; while (true) { int comparison; while (b <= c && (comparison = compare(x, y, b, xm, ym)) <= 0) { if (comparison == 0) swap(x, y, a++, b); b++; } while (c 
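Note: the FindEarliestRevision traversal above is a plain iterative DFS that keeps a running minimum over committer timestamps, with the AllowedEdges string "cnt:dir,dir:dir,dir:rev" restricting the transposed graph walk to upward filesystem edges ending in revisions. A condensed sketch of the same loop over toy maps (the edge filter elided; illustrative names only):

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

public class EarliestTimestampSketch {
    /** Iterative DFS returning the reachable node with the smallest timestamp, or -1. */
    static long earliest(Map<Long, List<Long>> adj, Map<Long, Long> timestamps, long src) {
        ArrayDeque<Long> stack = new ArrayDeque<>();
        HashSet<Long> visited = new HashSet<>();
        stack.push(src);
        visited.add(src);
        long minNode = -1, minTs = Long.MAX_VALUE;
        while (!stack.isEmpty()) {
            long curr = stack.pop();
            Long ts = timestamps.get(curr); // null for nodes without a timestamp
            if (ts != null && ts < minTs) {
                minNode = curr;
                minTs = ts;
            }
            for (long succ : adj.getOrDefault(curr, List.of())) {
                if (visited.add(succ)) {
                    stack.push(succ);
                }
            }
        }
        return minNode;
    }
}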
>= b && (comparison = compare(x, y, c, xm, ym)) >= 0) { if (comparison == 0) swap(x, y, c, d--); c--; } if (b > c) break; swap(x, y, b++, c--); } // Swap partition elements back to middle long t; s = Math.min(a - from, b - a); swap(x, y, from, b - s, s); s = Math.min(d - c, to - d - 1); swap(x, y, b, to - s, s); s = b - a; t = d - c; // Recursively sort non-partition-elements if (s > 1 && t > 1) invokeAll(new ForkJoinBigQuickSort2(x, y, from, from + s), new ForkJoinBigQuickSort2(x, y, to - t, to)); else if (s > 1) invokeAll(new ForkJoinBigQuickSort2(x, y, from, from + s)); else invokeAll(new ForkJoinBigQuickSort2(x, y, to - t, to)); } public static void quickSort(final long[][] x, final long[][] y, final long from, final long to) { final long len = to - from; if (len < QUICKSORT_NO_REC) { selectionSort(x, y, from, to); return; } // Choose a partition element, v long m = from + len / 2; long l = from; long n = to - 1; if (len > QUICKSORT_MEDIAN_OF_9) { // Big arrays, pseudomedian of 9 long s = len / 8; l = med3(x, y, l, l + s, l + 2 * s); m = med3(x, y, m - s, m, m + s); n = med3(x, y, n - 2 * s, n - s, n); } m = med3(x, y, l, m, n); // Mid-size, med of 3 // Establish Invariant: v* (v)* v* long a = from, b = a, c = to - 1, d = c; final long xm = BigArrays.get(x, m), ym = BigArrays.get(y, m); while (true) { long comparison; while (b <= c && (comparison = compare(x, y, b, xm, ym)) <= 0) { if (comparison == 0) swap(x, y, a++, b); b++; } while (c >= b && (comparison = compare(x, y, c, xm, ym)) >= 0) { if (comparison == 0) swap(x, y, c, d--); c--; } if (b > c) break; swap(x, y, b++, c--); } // Swap partition elements back to middle long s; s = Math.min(a - from, b - a); swap(x, y, from, b - s, s); s = Math.min(d - c, to - d - 1); swap(x, y, b, to - s, s); // Recursively sort non-partition-elements if ((s = b - a) > 1) quickSort(x, y, from, from + s); if ((s = d - c) > 1) quickSort(x, y, to - s, to); } public static void quickSort(final long[][] x, final long[][] y) { quickSort(x, y, 0, x.length); } private static int compare(final long[][] x, final long[][] y, final long u, final long v) { int tx; return (tx = Long.compare(BigArrays.get(x, u), BigArrays.get(x, v))) != 0 ? tx : Long.compare(BigArrays.get(y, u), BigArrays.get(y, v)); } private static int compare(final long[][] x, final long[][] y, final long i, final long xm, final long ym) { int tx; return (tx = Long.compare(BigArrays.get(x, i), xm)) != 0 ? tx : Long.compare(BigArrays.get(y, i), ym); } private static void swap(final long[][] x, final long[][] y, final long a, final long b) { BigArrays.swap(x, a, b); BigArrays.swap(y, a, b); } private static void swap(final long[][] x, final long[][] y, long a, long b, final long n) { for (long i = 0; i < n; i++, a++, b++) swap(x, y, a, b); } private static long med3(final long[][] x, final long[][] y, final long a, final long b, final long c) { final int ab = compare(x, y, a, b); final int ac = compare(x, y, a, c); final int bc = compare(x, y, b, c); return (ab < 0 ? (bc < 0 ? b : ac < 0 ? c : a) : (bc > 0 ? b : ac > 0 ? 
c : a)); } public static void selectionSort(final long[][] a, final long[][] b, final long from, final long to) { for (long i = from; i < to - 1; i++) { long m = i; for (long j = i + 1; j < to; j++) if (compare(a, b, j, m) < 0) m = j; if (m != i) { BigArrays.swap(a, i, m); BigArrays.swap(b, i, m); } } } public static void selectionSort(final long[][] x, final long[][] y) { selectionSort(x, y, 0, x.length); } public static ForkJoinPool getPool() { ForkJoinPool current = ForkJoinTask.getPool(); return current == null ? ForkJoinPool.commonPool() : current; } public static void parallelQuickSort(final long[][] x, final long[][] y) { BigArrays.ensureSameLength(x, y); parallelQuickSort(x, y, 0, x.length); } public static void parallelQuickSort(final long[][] x, final long[][] y, final long from, final long to) { ForkJoinPool pool = getPool(); if (to - from < PARALLEL_QUICKSORT_NO_FORK || pool.getParallelism() == 1) quickSort(x, y, from, to); else { pool.invoke(new ForkJoinBigQuickSort2(x, y, from, to)); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java index f423369..57ae71d 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java @@ -1,217 +1,224 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.ForkJoinTask; import java.util.concurrent.RecursiveAction; import static it.unimi.dsi.fastutil.longs.LongArrays.ensureSameLength; public class ForkJoinQuickSort3 extends RecursiveAction { private static final long serialVersionUID = 1L; private final int from; private final int to; private final long[] x, y, z; private static final int QUICKSORT_NO_REC = 16; private static final int PARALLEL_QUICKSORT_NO_FORK = 8192; private static final int QUICKSORT_MEDIAN_OF_9 = 128; public ForkJoinQuickSort3(final long[] x, final long[] y, final long z[], final int from, final int to) { this.from = from; this.to = to; this.x = x; this.y = y; this.z = z; } @Override protected void compute() { final long[] x = this.x; final long[] y = this.y; final long[] z = this.z; final int len = to - from; if (len < PARALLEL_QUICKSORT_NO_FORK) { quickSort(x, y, z, from, to); return; } // Choose a partition element, v int m = from + len / 2; int l = from; int n = to - 1; int s = len / 8; l = med3(x, y, z, l, l + s, l + 2 * s); m = med3(x, y, z, m - s, m, m + s); n = med3(x, y, z, n - 2 * s, n - s, n); m = med3(x, y, z, l, m, n); final long xm = x[m], ym = y[m], zm = z[m]; // Establish Invariant: v* (v)* v* int a = from, b = a, c = to - 1, d = c; while (true) { int comparison, t; while (b <= c && (comparison = compare(x, y, z, b, xm, ym, zm)) <= 0) { if (comparison == 0) swap(x, y, z, a++, b); b++; } while (c >= b && (comparison = compare(x, y, z, c, xm, ym, zm)) >= 0) { if (comparison == 0) swap(x, y, z, c, d--); c--; } if (b > c) break; swap(x, y, z, b++, c--); } // Swap partition elements back to middle int t; s = Math.min(a - from, b - a); swap(x, y, z, from, b - s, s); s = Math.min(d - c, to - d - 1); swap(x, y, z, b, to - s, s); s = b - a; t = d - c; // Recursively sort 
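Note: ForkJoinBigQuickSort2 sorts two parallel fastutil big arrays in lockstep, comparing lexicographically by (x, y); ranges below PARALLEL_QUICKSORT_NO_FORK stay sequential. A usage sketch, assuming the class is on the classpath together with fastutil:

import it.unimi.dsi.fastutil.BigArrays;

public class SortPairsDemo {
    public static void main(String[] args) {
        // BigArrays.wrap turns a plain long[] into the long[][] "big array" form.
        long[][] x = BigArrays.wrap(new long[]{3, 1, 2, 1});
        long[][] y = BigArrays.wrap(new long[]{0, 9, 5, 4});
        // Sorts both big arrays simultaneously, keyed lexicographically by (x, y).
        ForkJoinBigQuickSort2.parallelQuickSort(x, y);
        for (long i = 0; i < BigArrays.length(x); i++) {
            System.out.println(BigArrays.get(x, i) + " " + BigArrays.get(y, i));
            // prints: 1 4 / 1 9 / 2 5 / 3 0
        }
    }
}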
non-partition-elements if (s > 1 && t > 1) invokeAll(new ForkJoinQuickSort3(x, y, z, from, from + s), new ForkJoinQuickSort3(x, y, z, to - t, to)); else if (s > 1) invokeAll(new ForkJoinQuickSort3(x, y, z, from, from + s)); else invokeAll(new ForkJoinQuickSort3(x, y, z, to - t, to)); } public static void quickSort(final long[] x, final long[] y, final long[] z, final int from, final int to) { final int len = to - from; if (len < QUICKSORT_NO_REC) { selectionSort(x, y, z, from, to); return; } // Choose a partition element, v int m = from + len / 2; int l = from; int n = to - 1; if (len > QUICKSORT_MEDIAN_OF_9) { // Big arrays, pseudomedian of 9 int s = len / 8; l = med3(x, y, z, l, l + s, l + 2 * s); m = med3(x, y, z, m - s, m, m + s); n = med3(x, y, z, n - 2 * s, n - s, n); } m = med3(x, y, z, l, m, n); // Mid-size, med of 3 // Establish Invariant: v* (v)* v* int a = from, b = a, c = to - 1, d = c; final long xm = x[m], ym = y[m], zm = z[m]; while (true) { int comparison; while (b <= c && (comparison = compare(x, y, z, b, xm, ym, zm)) <= 0) { if (comparison == 0) swap(x, y, z, a++, b); b++; } while (c >= b && (comparison = compare(x, y, z, c, xm, ym, zm)) >= 0) { if (comparison == 0) swap(x, y, z, c, d--); c--; } if (b > c) break; swap(x, y, z, b++, c--); } // Swap partition elements back to middle int s; s = Math.min(a - from, b - a); swap(x, y, z, from, b - s, s); s = Math.min(d - c, to - d - 1); swap(x, y, z, b, to - s, s); // Recursively sort non-partition-elements if ((s = b - a) > 1) quickSort(x, y, z, from, from + s); if ((s = d - c) > 1) quickSort(x, y, z, to - s, to); } public static void quickSort(final long[] x, final long[] y, final long[] z) { quickSort(x, y, z, 0, x.length); } private static int compare(final long[] x, final long[] y, final long[] z, final int u, final int v) { int tx, ty; return (tx = Long.compare(x[u], x[v])) != 0 ? tx : ((ty = Long.compare(y[u], y[v])) != 0 ? ty : Long.compare(z[u], z[v])); } private static int compare(final long[] x, final long[] y, final long[] z, final int i, final long xm, final long ym, final long zm) { int tx, ty; return (tx = Long.compare(x[i], xm)) != 0 ? tx : ((ty = Long.compare(y[i], ym)) != 0 ? ty : Long.compare(z[i], zm)); } private static void swap(final long[] x, final long[] y, final long[] z, final int a, final int b) { final long t = x[a]; final long u = y[a]; final long v = z[a]; x[a] = x[b]; y[a] = y[b]; z[a] = z[b]; x[b] = t; y[b] = u; z[b] = v; } private static void swap(final long[] x, final long[] y, final long[] z, int a, int b, final int n) { for (int i = 0; i < n; i++, a++, b++) swap(x, y, z, a, b); } private static int med3(final long[] x, final long[] y, final long[] z, final int a, final int b, final int c) { final int ab = compare(x, y, z, a, b); final int ac = compare(x, y, z, a, c); final int bc = compare(x, y, z, b, c); return (ab < 0 ? (bc < 0 ? b : ac < 0 ? c : a) : (bc > 0 ? b : ac > 0 ? c : a)); } public static void selectionSort(final long[] a, final long[] b, long[] c, final int from, final int to) { for (int i = from; i < to - 1; i++) { int m = i; for (int j = i + 1; j < to; j++) if (compare(a, b, c, j, m) < 0) m = j; if (m != i) { long t = a[i]; a[i] = a[m]; a[m] = t; t = b[i]; b[i] = b[m]; b[m] = t; t = c[i]; c[i] = c[m]; c[m] = t; } } } public static void selectionSort(final long[] x, final long[] y, final long[] z) { selectionSort(x, y, z, 0, x.length); } public static ForkJoinPool getPool() { ForkJoinPool current = ForkJoinTask.getPool(); return current == null ? 
ForkJoinPool.commonPool() : current; } public static void parallelQuickSort(final long[] x, final long[] y, final long[] z) { ensureSameLength(x, y); ensureSameLength(x, z); parallelQuickSort(x, y, z, 0, x.length); } public static void parallelQuickSort(final long[] x, final long[] y, final long[] z, final int from, final int to) { ForkJoinPool pool = getPool(); if (to - from < PARALLEL_QUICKSORT_NO_FORK || pool.getParallelism() == 1) quickSort(x, y, z, from, to); else { pool.invoke(new ForkJoinQuickSort3(x, y, z, from, to)); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java index 0d672e2..71d6dab 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java @@ -1,46 +1,53 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import com.martiansoftware.jsap.*; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class MPHTranslate { private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(MPHTranslate.class.getName(), "", new Parameter[]{new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.REQUIRED, "Filename of the serialized MPH"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException, ClassNotFoundException { JSAPResult config = parse_args(args); String mphPath = config.getString("function"); Object2LongFunction mphMap = NodeIdMap.loadMph(mphPath); // TODO: wasteful to convert to/from bytes FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator lineIterator = new LineIterator(buffer); while (lineIterator.hasNext()) { String line = lineIterator.next().toString(); System.out.println(mphMap.getLong(line.getBytes(StandardCharsets.US_ASCII))); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java index c760032..7daec23 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java @@ -1,40 +1,47 @@ +/* + * Copyright (c) 2020-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import it.unimi.dsi.big.webgraph.NodeIterator; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.SwhUnidirectionalGraph; import java.io.IOException; import java.util.concurrent.TimeUnit; public class ReadGraph { final static Logger logger = 
LoggerFactory.getLogger(ReadGraph.class); public static void main(String[] args) throws IOException { String graphPath = args[0]; SwhUnidirectionalGraph graph; ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS); if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) { graph = SwhUnidirectionalGraph.loadMapped(graphPath, pl); } else { graph = SwhUnidirectionalGraph.load(graphPath, pl); } pl.expectedUpdates = graph.numArcs(); pl.start("Reading graph..."); NodeIterator it = graph.nodeIterator(); while (it.hasNext()) { long srcNode = it.nextLong(); var s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { System.out.format("%s %s\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode)); pl.lightUpdate(); } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java index 3c64bbd..c8e0a9f 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java @@ -1,48 +1,55 @@ +/* + * Copyright (c) 2020-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.SwhUnidirectionalGraph; import org.softwareheritage.graph.labels.DirEntry; import java.io.IOException; import java.util.concurrent.TimeUnit; public class ReadLabelledGraph { final static Logger logger = LoggerFactory.getLogger(ReadLabelledGraph.class); public static void main(String[] args) throws IOException, ClassNotFoundException { String graphPath = args[0]; ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS); SwhUnidirectionalGraph graph; if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) { graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl); } else { graph = SwhUnidirectionalGraph.loadLabelled(graphPath, pl); } graph.properties.loadLabelNames(); ArcLabelledNodeIterator it = graph.labelledNodeIterator(); while (it.hasNext()) { long srcNode = it.nextLong(); ArcLabelledNodeIterator.LabelledArcIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { DirEntry[] labels = (DirEntry[]) s.label().get(); if (labels.length > 0) { for (DirEntry label : labels) { System.out.format("%s %s %s %d\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode), new String(graph.properties.getLabelName(label.filenameId)), label.permission); } } else { System.out.format("%s %s\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode)); } } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/Sort.java b/java/src/main/java/org/softwareheritage/graph/utils/Sort.java index 2181a53..9a69b94 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/Sort.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/Sort.java @@ -1,32 +1,39 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file
for more information + */ + package org.softwareheritage.graph.utils; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; public class Sort { public static Process spawnSort(String sortBufferSize, String sortTmpDir) throws IOException { return spawnSort(sortBufferSize, sortTmpDir, null); } public static Process spawnSort(String sortBufferSize, String sortTmpDir, List options) throws IOException { ProcessBuilder sortProcessBuilder = new ProcessBuilder(); sortProcessBuilder.redirectError(ProcessBuilder.Redirect.INHERIT); ArrayList command = new ArrayList<>(List.of("sort", "-u", "--buffer-size", sortBufferSize)); if (sortTmpDir != null) { command.add("--temporary-directory"); command.add(sortTmpDir); } if (options != null) { command.addAll(options); } sortProcessBuilder.command(command); Map env = sortProcessBuilder.environment(); env.put("LC_ALL", "C"); env.put("LC_COLLATE", "C"); env.put("LANG", "C"); return sortProcessBuilder.start(); } } diff --git a/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java b/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java index f91f6ed..022f2b6 100644 --- a/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java +++ b/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java @@ -1,113 +1,120 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.ArrayList; public class AllowedEdgesTest extends GraphTest { static class EdgeType { Node.Type src; Node.Type dst; public EdgeType(Node.Type src, Node.Type dst) { this.src = src; this.dst = dst; } @Override public boolean equals(Object otherObj) { if (otherObj == this) return true; if (!(otherObj instanceof EdgeType)) return false; EdgeType other = (EdgeType) otherObj; return src == other.src && dst == other.dst; } } void assertEdgeRestriction(AllowedEdges edges, ArrayList expectedAllowed) { Node.Type[] nodeTypes = Node.Type.values(); for (Node.Type src : nodeTypes) { for (Node.Type dst : nodeTypes) { EdgeType edge = new EdgeType(src, dst); boolean isAllowed = edges.isAllowed(src, dst); boolean isExpected = false; for (EdgeType expected : expectedAllowed) { if (expected.equals(edge)) { isExpected = true; break; } } Assertions.assertEquals(isAllowed, isExpected, "Edge type: " + src + " -> " + dst); } } } @Test public void dirToDirDirToCntEdges() { AllowedEdges edges = new AllowedEdges("dir:dir,dir:cnt"); ArrayList expected = new ArrayList<>(); expected.add(new EdgeType(Node.Type.DIR, Node.Type.DIR)); expected.add(new EdgeType(Node.Type.DIR, Node.Type.CNT)); assertEdgeRestriction(edges, expected); } @Test public void relToRevRevToRevRevToDirEdges() { AllowedEdges edges = new AllowedEdges("rel:rev,rev:rev,rev:dir"); ArrayList expected = new ArrayList<>(); expected.add(new EdgeType(Node.Type.REL, Node.Type.REV)); expected.add(new EdgeType(Node.Type.REV, Node.Type.REV)); expected.add(new EdgeType(Node.Type.REV, Node.Type.DIR)); assertEdgeRestriction(edges, expected); } @Test public void revToAllDirToDirEdges() { AllowedEdges edges = new AllowedEdges("rev:*,dir:dir"); ArrayList expected = new ArrayList<>(); for (Node.Type dst : Node.Type.values()) { expected.add(new 
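Note: Sort.spawnSort above shells out to GNU sort -u with a forced C locale so that byte ordering matches comparisons done on the Java side; the caller wires the process's stdin and stdout. A usage sketch (the buffer size is arbitrary; assumes Sort is importable from the same package):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

public class SpawnSortDemo {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Spawns `sort -u --buffer-size 100M` with LC_ALL=C in its environment.
        Process sort = Sort.spawnSort("100M", null);
        try (Writer w = new OutputStreamWriter(sort.getOutputStream(), StandardCharsets.US_ASCII)) {
            w.write("b\na\nb\n"); // feed unsorted, duplicated lines to sort's stdin
        }
        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(sort.getInputStream(), StandardCharsets.US_ASCII))) {
            r.lines().forEach(System.out::println); // prints "a" then "b"
        }
        sort.waitFor();
    }
}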
EdgeType(Node.Type.REV, dst)); } expected.add(new EdgeType(Node.Type.DIR, Node.Type.DIR)); assertEdgeRestriction(edges, expected); } @Test public void allToCntEdges() { AllowedEdges edges = new AllowedEdges("*:cnt"); ArrayList expected = new ArrayList<>(); for (Node.Type src : Node.Type.values()) { expected.add(new EdgeType(src, Node.Type.CNT)); } assertEdgeRestriction(edges, expected); } @Test public void allEdges() { AllowedEdges edges = new AllowedEdges("*:*"); ArrayList expected = new ArrayList<>(); for (Node.Type src : Node.Type.values()) { for (Node.Type dst : Node.Type.values()) { expected.add(new EdgeType(src, dst)); } } assertEdgeRestriction(edges, expected); // Special null value used to quickly bypass edge check when no restriction AllowedEdges edges2 = new AllowedEdges("*"); Assertions.assertNull(edges2.restrictedTo); } @Test public void noEdges() { AllowedEdges edges = new AllowedEdges(""); AllowedEdges edges2 = new AllowedEdges(null); ArrayList expected = new ArrayList<>(); assertEdgeRestriction(edges, expected); assertEdgeRestriction(edges2, expected); } } diff --git a/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java b/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java index ca6479f..7d66391 100644 --- a/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java +++ b/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java @@ -1,53 +1,60 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.Set; public class AllowedNodesTest extends GraphTest { void assertNodeRestriction(AllowedNodes nodes, Set expectedAllowed) { Node.Type[] nodeTypes = Node.Type.values(); for (Node.Type t : nodeTypes) { boolean isAllowed = nodes.isAllowed(t); boolean isExpected = expectedAllowed.contains(t); Assertions.assertEquals(isAllowed, isExpected, "Node type: " + t); } } @Test public void dirCntNodes() { AllowedNodes edges = new AllowedNodes("dir,cnt"); Set expected = Set.of(Node.Type.DIR, Node.Type.CNT); assertNodeRestriction(edges, expected); } @Test public void revDirNodes() { AllowedNodes edges = new AllowedNodes("rev,dir"); Set expected = Set.of(Node.Type.DIR, Node.Type.REV); assertNodeRestriction(edges, expected); } @Test public void relSnpCntNodes() { AllowedNodes edges = new AllowedNodes("rel,snp,cnt"); Set expected = Set.of(Node.Type.REL, Node.Type.SNP, Node.Type.CNT); assertNodeRestriction(edges, expected); } @Test public void allNodes() { AllowedNodes edges = new AllowedNodes("*"); Set expected = Set.of(Node.Type.REL, Node.Type.SNP, Node.Type.CNT, Node.Type.DIR, Node.Type.REV, Node.Type.ORI); assertNodeRestriction(edges, expected); } @Test public void noNodes() { AllowedNodes edges = new AllowedNodes(""); Set expected = Set.of(); assertNodeRestriction(edges, expected); } } diff --git a/java/src/test/java/org/softwareheritage/graph/GraphTest.java b/java/src/test/java/org/softwareheritage/graph/GraphTest.java index 94df365..872784f 100644 --- a/java/src/test/java/org/softwareheritage/graph/GraphTest.java +++ b/java/src/test/java/org/softwareheritage/graph/GraphTest.java @@ -1,60 +1,67 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of 
this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.Iterator; import com.github.luben.zstd.ZstdInputStream; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.LazyLongIterators; import org.junit.jupiter.api.BeforeAll; import static org.junit.Assert.assertEquals; public class GraphTest { static SwhBidirectionalGraph graph; final protected String TEST_ORIGIN_ID = "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"; @BeforeAll public static void setUp() throws IOException { graph = SwhBidirectionalGraph.loadLabelled(getGraphPath().toString()); } public static Path getGraphPath() { return Paths.get("..", "swh", "graph", "tests", "dataset", "compressed", "example"); } public static SwhBidirectionalGraph getGraph() { return graph; } public static SWHID fakeSWHID(String type, int num) { return new SWHID(String.format("swh:1:%s:%040d", type, num)); } public static void assertEqualsAnyOrder(Collection expected, Collection actual) { ArrayList expectedList = new ArrayList<>(expected); ArrayList actualList = new ArrayList<>(actual); expectedList.sort(Comparator.comparing(Object::toString)); actualList.sort(Comparator.comparing(Object::toString)); assertEquals(expectedList, actualList); } public static ArrayList lazyLongIteratorToList(LazyLongIterator input) { ArrayList inputList = new ArrayList<>(); Iterator inputIt = LazyLongIterators.eager(input); inputIt.forEachRemaining(inputList::add); return inputList; } public static String[] readZstFile(Path zstFile) throws IOException { ZstdInputStream zis = new ZstdInputStream(new FileInputStream(zstFile.toFile())); return (new String(zis.readAllBytes())).split("\n"); } } diff --git a/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java b/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java index e471799..cce1a45 100644 --- a/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java +++ b/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java @@ -1,85 +1,92 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import java.util.*; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; public class SubgraphTest extends GraphTest { @Test public void noFilter() { SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("*")); for (long i = 0; i < g.numNodes(); ++i) { Assertions.assertEquals(g.outdegree(i), sg.outdegree(i)); } } @Test public void missingNode() { SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori")); SWHID rev1 = fakeSWHID("rev", 18); Assertions.assertThrows(IllegalArgumentException.class, () -> { sg.outdegree(sg.getNodeId(rev1)); }); Assertions.assertThrows(IllegalArgumentException.class, () -> { sg.successors(sg.getNodeId(rev1)); }); } @Test public void outdegreeOnlyDirOri() { SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori")); SWHID dir1 = 
fakeSWHID("dir", 17); Assertions.assertEquals(2, g.outdegree(g.getNodeId(dir1))); Assertions.assertEquals(1, sg.outdegree(sg.getNodeId(dir1))); SWHID dir2 = fakeSWHID("dir", 6); Assertions.assertEquals(2, g.outdegree(g.getNodeId(dir2))); Assertions.assertEquals(0, sg.outdegree(sg.getNodeId(dir2))); SWHID ori1 = new SWHID(TEST_ORIGIN_ID); Assertions.assertEquals(1, g.outdegree(g.getNodeId(ori1))); Assertions.assertEquals(0, sg.outdegree(sg.getNodeId(ori1))); } @Test public void successorsOnlyDirOri() { SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori")); SWHID dir1 = fakeSWHID("dir", 17); assertEqualsAnyOrder(Collections.singletonList(sg.getNodeId(fakeSWHID("dir", 16))), lazyLongIteratorToList(sg.successors(sg.getNodeId(dir1)))); SWHID dir2 = fakeSWHID("dir", 6); assertEqualsAnyOrder(Collections.emptyList(), lazyLongIteratorToList(sg.successors(sg.getNodeId(dir2)))); SWHID ori1 = new SWHID(TEST_ORIGIN_ID); assertEqualsAnyOrder(Collections.emptyList(), lazyLongIteratorToList(sg.successors(sg.getNodeId(ori1)))); } @Test public void nodeIteratorOnlyOriDir() { SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori")); ArrayList nodeList = new ArrayList<>(); Iterator nodeIt = sg.nodeIterator(); nodeIt.forEachRemaining(nodeList::add); assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(new SWHID(TEST_ORIGIN_ID)), sg.getNodeId(fakeSWHID("dir", 2)), sg.getNodeId(fakeSWHID("dir", 6)), sg.getNodeId(fakeSWHID("dir", 8)), sg.getNodeId(fakeSWHID("dir", 12)), sg.getNodeId(fakeSWHID("dir", 16)), sg.getNodeId(fakeSWHID("dir", 17))), nodeList); sg = new Subgraph(g, new AllowedNodes("snp,rel")); nodeList = new ArrayList<>(); nodeIt = sg.nodeIterator(); nodeIt.forEachRemaining(nodeList::add); assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(fakeSWHID("snp", 20)), sg.getNodeId(fakeSWHID("rel", 10)), sg.getNodeId(fakeSWHID("rel", 19))), nodeList); } } diff --git a/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java b/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java index d9713f8..4576aae 100644 --- a/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java +++ b/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java @@ -1,106 +1,113 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import org.apache.commons.codec.digest.DigestUtils; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import org.softwareheritage.graph.GraphTest; import org.softwareheritage.graph.Node; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; import java.util.TreeSet; public class ExtractNodesTest extends GraphTest { /** Generate a fake SWHID for a given node type and numeric ID */ private static byte[] f(String type, int id) { String hash = new String(DigestUtils.sha1Hex(type + id).getBytes()); return String.format("swh:1:%s:%s", type, hash).getBytes(); } static class FakeDataset implements GraphDataset { @Override public void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException { // For each node type, write nodes {1..4} as present in the graph for (Node.Type type : 
Node.Type.values()) { for (int i = 1; i <= 4; i++) { byte[] node = f(type.toString().toLowerCase(), i); nodeCb.onNode(node); } } edgeCb.onEdge(f("ori", 1), f("snp", 1), null, -1); edgeCb.onEdge(f("ori", 2), f("snp", 2), null, -1); edgeCb.onEdge(f("ori", 3), f("snp", 3), null, -1); edgeCb.onEdge(f("ori", 4), f("snp", 404), null, -1); edgeCb.onEdge(f("snp", 1), f("rev", 1), "dup1".getBytes(), -1); edgeCb.onEdge(f("snp", 1), f("rev", 1), "dup2".getBytes(), -1); edgeCb.onEdge(f("snp", 3), f("cnt", 1), "c1".getBytes(), -1); edgeCb.onEdge(f("snp", 4), f("rel", 1), "r1".getBytes(), -1); edgeCb.onEdge(f("rel", 1), f("rel", 2), null, -1); edgeCb.onEdge(f("rel", 2), f("rev", 1), null, -1); edgeCb.onEdge(f("rel", 3), f("rev", 2), null, -1); edgeCb.onEdge(f("rel", 4), f("dir", 1), null, -1); edgeCb.onEdge(f("rev", 1), f("rev", 1), null, -1); edgeCb.onEdge(f("rev", 1), f("rev", 1), null, -1); edgeCb.onEdge(f("rev", 1), f("rev", 2), null, -1); edgeCb.onEdge(f("rev", 2), f("rev", 404), null, -1); edgeCb.onEdge(f("rev", 3), f("rev", 2), null, -1); edgeCb.onEdge(f("rev", 4), f("dir", 1), null, -1); edgeCb.onEdge(f("dir", 1), f("cnt", 1), "c1".getBytes(), 42); edgeCb.onEdge(f("dir", 1), f("dir", 1), "d1".getBytes(), 1337); edgeCb.onEdge(f("dir", 1), f("rev", 1), "r1".getBytes(), 0); } } @Test public void testExtractNodes(@TempDir Path outputDir, @TempDir Path sortTmpDir) throws IOException, InterruptedException { FakeDataset dataset = new FakeDataset(); ExtractNodes.extractNodes(dataset, outputDir.toString() + "/graph", "2M", sortTmpDir.toFile()); // Check count files Long nodeCount = Long.parseLong(Files.readString(outputDir.resolve("graph.nodes.count.txt")).strip()); Long edgeCount = Long.parseLong(Files.readString(outputDir.resolve("graph.edges.count.txt")).strip()); Long labelCount = Long.parseLong(Files.readString(outputDir.resolve("graph.labels.count.txt")).strip()); Assertions.assertEquals(26L, nodeCount); Assertions.assertEquals(21L, edgeCount); Assertions.assertEquals(5L, labelCount); // Check stat files List nodeStats = Files.readAllLines(outputDir.resolve("graph.nodes.stats.txt")); List edgeStats = Files.readAllLines(outputDir.resolve("graph.edges.stats.txt")); Assertions.assertEquals(nodeStats, List.of("cnt 4", "dir 4", "ori 4", "rel 4", "rev 5", "snp 5")); Assertions.assertEquals(edgeStats, List.of("dir:cnt 1", "dir:dir 1", "dir:rev 1", "ori:snp 4", "rel:dir 1", "rel:rel 1", "rel:rev 2", "rev:dir 1", "rev:rev 5", "snp:cnt 1", "snp:rel 1", "snp:rev 2")); // Build ordered set of expected node IDs TreeSet expectedNodes = new TreeSet<>(); for (Node.Type type : Node.Type.values()) { for (int i = 1; i <= 4; i++) { byte[] node = f(type.toString().toLowerCase(), i); expectedNodes.add(new String(node)); } } expectedNodes.add(new String(f("snp", 404))); expectedNodes.add(new String(f("rev", 404))); String[] nodeLines = readZstFile(outputDir.resolve("graph.nodes.csv.zst")); Assertions.assertArrayEquals(expectedNodes.toArray(new String[0]), nodeLines); // Build ordered set of expected label IDs TreeSet expectedLabels = new TreeSet<>(); expectedLabels.add("dup1"); expectedLabels.add("dup2"); expectedLabels.add("c1"); expectedLabels.add("r1"); expectedLabels.add("d1"); String[] labelLines = readZstFile(outputDir.resolve("graph.labels.csv.zst")); Assertions.assertArrayEquals(expectedLabels.toArray(new String[0]), labelLines); } } diff --git a/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java b/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java index 
9089d0d..142d849 100644 --- a/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java +++ b/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java @@ -1,76 +1,83 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.compress; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import org.softwareheritage.graph.GraphTest; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; public class ExtractPersonsTest extends GraphTest { private static class FakeORCDataset extends ORCGraphDataset { private static class FakeSwhOrcTable extends ORCGraphDataset.SwhOrcTable { private final String tableName; public FakeSwhOrcTable(String tableName) { this.tableName = tableName; } @Override public void readBytes64Column(String longColumn, BytesCallback cb) throws IOException { if (tableName.equals("revision") && longColumn.equals("author")) { cb.onBytes(fakeSWHID("rev", 1).toBytes(), "rev_author_1".getBytes()); cb.onBytes(fakeSWHID("rev", 2).toBytes(), "rev_author_1".getBytes()); cb.onBytes(fakeSWHID("rev", 3).toBytes(), "rev_author_2".getBytes()); cb.onBytes(fakeSWHID("rev", 4).toBytes(), "rev_author_1".getBytes()); cb.onBytes(fakeSWHID("rev", 5).toBytes(), "rev_author_3".getBytes()); } else if (tableName.equals("revision") && longColumn.equals("committer")) { cb.onBytes(fakeSWHID("rev", 1).toBytes(), "rev_committer_1".getBytes()); cb.onBytes(fakeSWHID("rev", 2).toBytes(), "rev_committer_1".getBytes()); cb.onBytes(fakeSWHID("rev", 3).toBytes(), "rev_committer_2".getBytes()); cb.onBytes(fakeSWHID("rev", 4).toBytes(), "rev_author_2".getBytes()); cb.onBytes(fakeSWHID("rev", 5).toBytes(), "rev_author_1".getBytes()); cb.onBytes(fakeSWHID("rev", 6).toBytes(), "rev_committer_1".getBytes()); } else if (tableName.equals("release") && longColumn.equals("author")) { cb.onBytes(fakeSWHID("rel", 1).toBytes(), "rel_committer_1".getBytes()); cb.onBytes(fakeSWHID("rel", 2).toBytes(), "rel_committer_1".getBytes()); cb.onBytes(fakeSWHID("rel", 3).toBytes(), "rel_committer_2".getBytes()); cb.onBytes(fakeSWHID("rel", 4).toBytes(), "rev_author_2".getBytes()); cb.onBytes(fakeSWHID("rel", 5).toBytes(), "rev_author_1".getBytes()); cb.onBytes(fakeSWHID("rel", 6).toBytes(), "rev_committer_1".getBytes()); cb.onBytes(fakeSWHID("rel", 7).toBytes(), "rel_committer_1".getBytes()); } else { throw new RuntimeException("Unknown table/column: " + tableName + "/" + longColumn); } } } public SwhOrcTable getTable(String tableName) { return new FakeSwhOrcTable(tableName); } } @Test public void testExtractPersons(@TempDir Path outputDir, @TempDir Path sortTmpDir) throws IOException, InterruptedException { FakeORCDataset fakeORCDataset = new FakeORCDataset(); ExtractPersons.extractPersons(fakeORCDataset, outputDir.toString() + "/graph", "2M", sortTmpDir.toString()); ArrayList expectedPersons = new ArrayList<>(Arrays.asList("rev_author_1", "rev_author_2", "rev_author_3", "rev_committer_1", "rev_committer_2", "rel_committer_1", "rel_committer_2")); // Check count files Long personsCount = Long.parseLong(Files.readString(outputDir.resolve("graph.persons.count.txt")).strip()); 
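// The count file holds a single decimal number: the count of *distinct* person
// names across revision authors, revision committers and release authors (7 here,
// even though the fake dataset emits 18 name occurrences in total). A minimal
// sketch of the dedup-and-sort semantics the assertions below rely on, using a
// hypothetical callback wiring that is NOT the real ExtractPersons implementation:
//
//     TreeSet<String> persons = new TreeSet<>(); // sorted + deduplicated
//     table.readBytes64Column("author", (swhid, name) -> persons.add(new String(name)));
//
// This is why expectedPersons is sorted before being compared to the file contents.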
Assertions.assertEquals(expectedPersons.size(), personsCount); // Check persons expectedPersons.sort(String::compareTo); String[] personLines = readZstFile(outputDir.resolve("graph.persons.csv.zst")); Assertions.assertArrayEquals(expectedPersons.toArray(new String[0]), personLines); } } diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java index be76492..218a79c 100644 --- a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java +++ b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java @@ -1,203 +1,210 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import io.grpc.Status; import io.grpc.StatusRuntimeException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.softwareheritage.graph.SWHID; import java.util.ArrayList; import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; public class FindPathBetweenTest extends TraversalServiceTest { private FindPathBetweenRequest.Builder getRequestBuilder(SWHID src, SWHID dst) { return FindPathBetweenRequest.newBuilder().addSrc(src.toString()).addDst(dst.toString()); } @Test public void testSwhidErrors() { StatusRuntimeException thrown; thrown = assertThrows(StatusRuntimeException.class, () -> client .findPathBetween(FindPathBetweenRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest .newBuilder().addSrc("swh:1:lol:0000000000000000000000000000000000000001").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest .newBuilder().addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest.newBuilder().addSrc(TEST_ORIGIN_ID) .addDst("swh:1:cnt:000000000000000000000000000000000000000z").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); } @Test public void testEdgeErrors() { StatusRuntimeException thrown; thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest .newBuilder().addSrc(TEST_ORIGIN_ID).addDst(TEST_ORIGIN_ID).setEdges("batracien:reptile").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); } // Test path between ori 1 and cnt 4 (forward graph) @Test public void forwardRootToLeaf() { ArrayList actual = getSWHIDs( client.findPathBetween(getRequestBuilder(new SWHID(TEST_ORIGIN_ID), fakeSWHID("cnt", 4)).build())); List expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("snp", 20), fakeSWHID("rev", 9), fakeSWHID("dir", 8), fakeSWHID("dir", 6), fakeSWHID("cnt", 4)); Assertions.assertEquals(expected, actual); } // Test path between rev 18 and rev 3 (forward graph) @Test public 
void forwardRevToRev() {
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 18), fakeSWHID("rev", 3)).build()));
    List<SWHID> expected = List.of(fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("rev", 9),
            fakeSWHID("rev", 3));
    Assertions.assertEquals(expected, actual);
}

// Test path between rev 3 and rev 18 (backward graph)
@Test
public void backwardRevToRev() {
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 3), fakeSWHID("rev", 18))
                    .setDirection(GraphDirection.BACKWARD).build()));
    List<SWHID> expected = List.of(fakeSWHID("rev", 3), fakeSWHID("rev", 9), fakeSWHID("rev", 13),
            fakeSWHID("rev", 18));
    Assertions.assertEquals(expected, actual);
}

// Test path between cnt 4 and itself (forward graph)
@Test
public void forwardCntToItself() {
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("cnt", 4)).build()));
    List<SWHID> expected = List.of(fakeSWHID("cnt", 4));
    Assertions.assertEquals(expected, actual);
}

// Start from ori and rel 19 and find cnt 14 or cnt 7 (forward graph)
@Test
public void forwardMultipleSourcesDest() {
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 14))
                    .addSrc(TEST_ORIGIN_ID).addDst(fakeSWHID("cnt", 7).toString()).build()));
    List<SWHID> expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17),
            fakeSWHID("cnt", 14));
    Assertions.assertEquals(expected, actual);
}

// Start from cnt 4 and cnt 11 and find rev 13 or rev 9 (backward graph)
@Test
public void backwardMultipleSourcesDest() {
    ArrayList<SWHID> actual = getSWHIDs(client.findPathBetween(
            getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("rev", 13)).setDirection(GraphDirection.BACKWARD)
                    .addSrc(fakeSWHID("cnt", 11).toString()).addDst(fakeSWHID("rev", 9).toString()).build()));
    List<SWHID> expected = List.of(fakeSWHID("cnt", 11), fakeSWHID("dir", 12), fakeSWHID("rev", 13));
    Assertions.assertEquals(expected, actual);
}

// Start from all directories and find the origin (backward graph)
@Test
public void backwardMultipleSourcesAllDirToOri() {
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathBetween(getRequestBuilder(fakeSWHID("dir", 2), new SWHID(TEST_ORIGIN_ID))
                    .addSrc(fakeSWHID("dir", 6).toString()).addSrc(fakeSWHID("dir", 8).toString())
                    .addSrc(fakeSWHID("dir", 12).toString()).addSrc(fakeSWHID("dir", 16).toString())
                    .addSrc(fakeSWHID("dir", 17).toString()).setDirection(GraphDirection.BACKWARD).build()));
    List<SWHID> expected = List.of(fakeSWHID("dir", 8), fakeSWHID("rev", 9), fakeSWHID("snp", 20),
            new SWHID(TEST_ORIGIN_ID));
    Assertions.assertEquals(expected, actual);
}

// Start from cnt 4 and find any rev (backward graph)
@Test
public void backwardCntToAnyRev() {
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("rev", 3))
                    .addDst(fakeSWHID("rev", 9).toString()).addDst(fakeSWHID("rev", 13).toString())
                    .addDst(fakeSWHID("rev", 18).toString()).setDirection(GraphDirection.BACKWARD).build()));
    List<SWHID> expected = List.of(fakeSWHID("cnt", 4), fakeSWHID("dir", 6), fakeSWHID("dir", 8),
            fakeSWHID("rev", 9));
    Assertions.assertEquals(expected, actual);
}

// Impossible path between rev 9 and cnt 14
@Test
public void forwardImpossiblePath() {
    StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> {
        client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 9), fakeSWHID("cnt", 14)).build());
    });
    Assertions.assertEquals(Status.NOT_FOUND.getCode(), thrown.getStatus().getCode());

    // Reverse direction
    thrown =
Assertions.assertThrows(StatusRuntimeException.class, () -> { client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 14), fakeSWHID("rev", 9)) .setDirection(GraphDirection.BACKWARD).build()); }); Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); } // Common ancestor between cnt 4 and cnt 15 : rev 18 @Test public void commonAncestorBackwardBackward() { Path p = client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("cnt", 15)) .setDirection(GraphDirection.BACKWARD).setDirectionReverse(GraphDirection.BACKWARD).build()); ArrayList actual = getSWHIDs(p); SWHID expected = fakeSWHID("rev", 18); Assertions.assertEquals(expected, actual.get(p.getMidpointIndex())); } // Common descendant between rev 13 and rev 3 : cnt 1 (with rev:dir,dir:dir,dir:cnt) @Test public void commonDescendantForwardForward() { Path p = client.findPathBetween( getRequestBuilder(fakeSWHID("rev", 13), fakeSWHID("rev", 3)).setDirection(GraphDirection.FORWARD) .setDirectionReverse(GraphDirection.FORWARD).setEdges("rev:dir,dir:dir,dir:cnt").build()); ArrayList actual = getSWHIDs(p); SWHID expected = fakeSWHID("cnt", 1); Assertions.assertEquals(expected, actual.get(p.getMidpointIndex())); } // Path between rel 19 and cnt 15 with various max depths @Test public void maxDepth() { // Works with max_depth = 2 ArrayList actual = getSWHIDs(client .findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxDepth(2).build())); List expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17), fakeSWHID("dir", 16), fakeSWHID("cnt", 15)); Assertions.assertEquals(expected, actual); // Check that it throws NOT_FOUND with max depth = 1 StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { client.findPathBetween( getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxDepth(1).build()); }); Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); } // Path between rel 19 and cnt 15 with various max edges @Test public void maxEdges() { // Works with max_edges = 3 ArrayList actual = getSWHIDs(client .findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxEdges(3).build())); List expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17), fakeSWHID("dir", 16), fakeSWHID("cnt", 15)); Assertions.assertEquals(expected, actual); // Check that it throws NOT_FOUND with max_edges = 2 StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { client.findPathBetween( getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxEdges(2).build()); }); Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); } } diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java index ebec7fc..54d358f 100644 --- a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java +++ b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java @@ -1,162 +1,169 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import io.grpc.Status; import io.grpc.StatusRuntimeException; import org.junit.jupiter.api.Assertions; 
import org.junit.jupiter.api.Test; import org.softwareheritage.graph.SWHID; import java.util.ArrayList; import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; public class FindPathToTest extends TraversalServiceTest { private FindPathToRequest.Builder getRequestBuilder(SWHID src, String allowedNodes) { return FindPathToRequest.newBuilder().addSrc(src.toString()) .setTarget(NodeFilter.newBuilder().setTypes(allowedNodes).build()); } @Test public void testSrcErrors() { StatusRuntimeException thrown; thrown = assertThrows(StatusRuntimeException.class, () -> client .findPathTo(FindPathToRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo( FindPathToRequest.newBuilder().addSrc("swh:1:lol:0000000000000000000000000000000000000001").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo( FindPathToRequest.newBuilder().addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); } @Test public void testEdgeErrors() { StatusRuntimeException thrown; thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo( FindPathToRequest.newBuilder().addSrc(TEST_ORIGIN_ID).setEdges("batracien:reptile").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); } @Test public void testTargetErrors() { StatusRuntimeException thrown; thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo(FindPathToRequest.newBuilder().addSrc(TEST_ORIGIN_ID) .setTarget(NodeFilter.newBuilder().setTypes("argoumante,eglomatique").build()).build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); } // Test path between ori 1 and any dir (forward graph) @Test public void forwardOriToFirstDir() { ArrayList actual = getSWHIDs( client.findPathTo(getRequestBuilder(new SWHID(TEST_ORIGIN_ID), "dir").build())); List expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("snp", 20), fakeSWHID("rev", 9), fakeSWHID("dir", 8)); Assertions.assertEquals(expected, actual); } // Test path between rel 19 and any cnt (forward graph) @Test public void forwardRelToFirstCnt() { ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("rel", 19), "cnt").build())); List expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17), fakeSWHID("cnt", 14)); Assertions.assertEquals(expected, actual); } // Test path between dir 16 and any rel (backward graph) @Test public void backwardDirToFirstRel() { ArrayList actual = getSWHIDs(client.findPathTo( getRequestBuilder(fakeSWHID("dir", 16), "rel").setDirection(GraphDirection.BACKWARD).build())); List expected = List.of(fakeSWHID("dir", 16), fakeSWHID("dir", 17), fakeSWHID("rev", 18), fakeSWHID("rel", 19)); Assertions.assertEquals(expected, actual); } // Test path between cnt 4 and itself (forward graph) @Test public void forwardCntToItself() { ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 4), "cnt").build())); List expected = List.of(fakeSWHID("cnt", 4)); Assertions.assertEquals(expected, actual); } // Start from ori and rel 19 and find any cnt (forward graph) @Test public void 
forwardMultipleSources() {
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathTo(getRequestBuilder(fakeSWHID("rel", 19), "cnt").addSrc(TEST_ORIGIN_ID).build()));
    List<SWHID> expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17),
            fakeSWHID("cnt", 14));
    Assertions.assertEquals(expected, actual);
}

// Start from cnt 4 and cnt 11 and find any rev (backward graph)
@Test
public void backwardMultipleSources() {
    ArrayList<SWHID> actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 4), "rev")
            .addSrc(fakeSWHID("cnt", 11).toString()).setDirection(GraphDirection.BACKWARD).build()));
    List<SWHID> expected = List.of(fakeSWHID("cnt", 11), fakeSWHID("dir", 12), fakeSWHID("rev", 13));
    Assertions.assertEquals(expected, actual);
}

// Start from all directories and find any origin (backward graph)
@Test
public void backwardMultipleSourcesAllDirToOri() {
    ArrayList<SWHID> actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("dir", 2), "ori")
            .addSrc(fakeSWHID("dir", 6).toString()).addSrc(fakeSWHID("dir", 8).toString())
            .addSrc(fakeSWHID("dir", 12).toString()).addSrc(fakeSWHID("dir", 16).toString())
            .addSrc(fakeSWHID("dir", 17).toString()).setDirection(GraphDirection.BACKWARD).build()));
    List<SWHID> expected = List.of(fakeSWHID("dir", 8), fakeSWHID("rev", 9), fakeSWHID("snp", 20),
            new SWHID(TEST_ORIGIN_ID));
    Assertions.assertEquals(expected, actual);
}

// Impossible path between rev 9 and any release (forward graph)
@Test
public void forwardImpossiblePath() {
    // Check that the call fails with NOT_FOUND
    StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> {
        client.findPathTo(getRequestBuilder(fakeSWHID("rev", 9), "rel").build());
    });
    Assertions.assertEquals(Status.NOT_FOUND.getCode(), thrown.getStatus().getCode());
}

// Path from cnt 15 to any rel with various max depths
@Test
public void maxDepth() {
    // Works with max_depth = 4
    ArrayList<SWHID> actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 15), "rel")
            .setDirection(GraphDirection.BACKWARD).setMaxDepth(4).build()));
    List<SWHID> expected = List.of(fakeSWHID("cnt", 15), fakeSWHID("dir", 16), fakeSWHID("dir", 17),
            fakeSWHID("rev", 18), fakeSWHID("rel", 19));
    Assertions.assertEquals(expected, actual);

    // Check that it throws NOT_FOUND with max_depth = 3
    StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> {
        client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 15), "rel").setDirection(GraphDirection.BACKWARD)
                .setMaxDepth(3).build());
    });
    Assertions.assertEquals(Status.NOT_FOUND.getCode(), thrown.getStatus().getCode());
}

// Path from cnt 15 to any rel with various max edges
@Test
public void maxEdges() {
    // Works with max_edges = 4
    ArrayList<SWHID> actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 15), "rel")
            .setDirection(GraphDirection.BACKWARD).setMaxEdges(4).build()));
    List<SWHID> expected = List.of(fakeSWHID("cnt", 15), fakeSWHID("dir", 16), fakeSWHID("dir", 17),
            fakeSWHID("rev", 18), fakeSWHID("rel", 19));
    Assertions.assertEquals(expected, actual);

    // Check that it throws NOT_FOUND with max_edges = 3
    StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> {
        client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 15), "rel").setDirection(GraphDirection.BACKWARD)
                .setMaxEdges(3).build());
    });
    Assertions.assertEquals(Status.NOT_FOUND.getCode(), thrown.getStatus().getCode());
}
}

diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/GetNodeTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/GetNodeTest.java
index d1d6d9b..22e3a54 100644
--- a/java/src/test/java/org/softwareheritage/graph/rpc/GetNodeTest.java
+++
b/java/src/test/java/org/softwareheritage/graph/rpc/GetNodeTest.java @@ -1,284 +1,291 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import com.google.protobuf.Descriptors; import com.google.protobuf.FieldMask; import io.grpc.Status; import io.grpc.StatusRuntimeException; import org.junit.jupiter.api.Test; import org.softwareheritage.graph.SWHID; import java.util.*; import static org.junit.jupiter.api.Assertions.*; public class GetNodeTest extends TraversalServiceTest { @Test public void testNotFound() { StatusRuntimeException thrown = assertThrows(StatusRuntimeException.class, () -> client.getNode(GetNodeRequest.newBuilder().setSwhid(fakeSWHID("cnt", 404).toString()).build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); } @Test public void testInvalidSwhid() { StatusRuntimeException thrown; thrown = assertThrows(StatusRuntimeException.class, () -> client.getNode( GetNodeRequest.newBuilder().setSwhid("swh:1:lol:0000000000000000000000000000000000000001").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); thrown = assertThrows(StatusRuntimeException.class, () -> client.getNode( GetNodeRequest.newBuilder().setSwhid("swh:1:cnt:000000000000000000000000000000000000000z").build())); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); } @Test public void testContents() { List expectedCnts = List.of(1, 4, 5, 7, 11, 14, 15); Map expectedLengths = Map.of(1, 42, 4, 404, 5, 1337, 7, 666, 11, 313, 14, 14, 15, 404); Set expectedSkipped = Set.of(15); for (Integer cntId : expectedCnts) { Node n = client.getNode(GetNodeRequest.newBuilder().setSwhid(fakeSWHID("cnt", cntId).toString()).build()); assertTrue(n.hasCnt()); assertTrue(n.getCnt().hasLength()); assertEquals((long) expectedLengths.get(cntId), n.getCnt().getLength()); assertTrue(n.getCnt().hasIsSkipped()); assertEquals(expectedSkipped.contains(cntId), n.getCnt().getIsSkipped()); } } @Test public void testRevisions() { List expectedRevs = List.of(3, 9, 13, 18); Map expectedMessages = Map.of(3, "Initial commit", 9, "Add parser", 13, "Add tests", 18, "Refactor codebase"); Map expectedAuthors = Map.of(3, "foo", 9, "bar", 13, "foo", 18, "baz"); Map expectedCommitters = Map.of(3, "foo", 9, "bar", 13, "bar", 18, "foo"); Map expectedAuthorTimestamps = Map.of(3, 1111122220L, 9, 1111144440L, 13, 1111166660L, 18, 1111177770L); Map expectedCommitterTimestamps = Map.of(3, 1111122220L, 9, 1111155550L, 13, 1111166660L, 18, 1111177770L); Map expectedAuthorTimestampOffsets = Map.of(3, 120, 9, 120, 13, 120, 18, 0); Map expectedCommitterTimestampOffsets = Map.of(3, 120, 9, 120, 13, 120, 18, 0); HashMap personMapping = new HashMap<>(); for (Integer revId : expectedRevs) { Node n = client.getNode(GetNodeRequest.newBuilder().setSwhid(fakeSWHID("rev", revId).toString()).build()); assertTrue(n.hasRev()); assertTrue(n.getRev().hasMessage()); assertEquals(expectedMessages.get(revId), n.getRev().getMessage().toStringUtf8()); // Persons are anonymized, we just need to check that the mapping is self-consistent assertTrue(n.getRev().hasAuthor()); assertTrue(n.getRev().hasCommitter()); int[] actualPersons = new int[]{(int) n.getRev().getAuthor(), (int) n.getRev().getCommitter()}; String[] expectedPersons = new 
String[]{expectedAuthors.get(revId), expectedCommitters.get(revId)};
    for (int i = 0; i < actualPersons.length; i++) {
        int actualPerson = actualPersons[i];
        String expectedPerson = expectedPersons[i];
        assertTrue(actualPerson >= 0);
        if (personMapping.containsKey(actualPerson)) {
            assertEquals(personMapping.get(actualPerson), expectedPerson);
        } else {
            personMapping.put(actualPerson, expectedPerson);
        }
    }

    assertTrue(n.getRev().hasAuthorDate());
    assertTrue(n.getRev().hasAuthorDateOffset());
    assertTrue(n.getRev().hasCommitterDate());
    assertTrue(n.getRev().hasCommitterDateOffset());
    // FIXME: all the timestamps are one hour off?!
    // System.err.println(revId + " " + n.getRev().getAuthorDate() + " " +
    // n.getRev().getAuthorDateOffset());
    // System.err.println(revId + " " + n.getRev().getCommitterDate() + " " +
    // n.getRev().getCommitterDateOffset());
    // assertEquals(expectedAuthorTimestamps.get(revId), n.getRev().getAuthorDate());
    assertEquals(expectedAuthorTimestampOffsets.get(revId), n.getRev().getAuthorDateOffset());
    // assertEquals(expectedCommitterTimestamps.get(revId), n.getRev().getCommitterDate());
    assertEquals(expectedCommitterTimestampOffsets.get(revId), n.getRev().getCommitterDateOffset());
}
}

@Test
public void testReleases() {
    List<Integer> expectedRels = List.of(10, 19);
    Map<Integer, String> expectedMessages = Map.of(10, "Version 1.0", 19, "Version 2.0");
    Map<Integer, String> expectedNames = Map.of(10, "v1.0", 19, "v2.0");
    Map<Integer, String> expectedAuthors = Map.of(10, "foo", 19, "bar");
    Map<Integer, Long> expectedAuthorTimestamps = Map.of(10, 1234567890L);
    Map<Integer, Integer> expectedAuthorTimestampOffsets = Map.of(3, 120);
    HashMap<Integer, String> personMapping = new HashMap<>();
    for (Integer relId : expectedRels) {
        Node n = client.getNode(GetNodeRequest.newBuilder().setSwhid(fakeSWHID("rel", relId).toString()).build());
        assertTrue(n.hasRel());
        assertTrue(n.getRel().hasMessage());
        assertEquals(expectedMessages.get(relId), n.getRel().getMessage().toStringUtf8());
        // FIXME: names are always empty?!
        // System.err.println(relId + " " + n.getRel().getName());
        // assertEquals(expectedNames.get(relId), n.getRel().getName().toStringUtf8());

        // Persons are anonymized, we just need to check that the mapping is self-consistent
        assertTrue(n.getRel().hasAuthor());
        int actualPerson = (int) n.getRel().getAuthor();
        String expectedPerson = expectedAuthors.get(relId);
        assertTrue(actualPerson >= 0);
        if (personMapping.containsKey(actualPerson)) {
            assertEquals(personMapping.get(actualPerson), expectedPerson);
        } else {
            personMapping.put(actualPerson, expectedPerson);
        }

        assertTrue(n.getRel().hasAuthorDate());
        assertTrue(n.getRel().hasAuthorDateOffset());
        // FIXME: all the timestamps are one hour off?!
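        // A hypothetical debugging aid (not part of the test): the offsets here look
        // like minutes (120 == +02:00), so the wall-clock rendering of a timestamp can
        // be inspected with java.time while chasing the one-hour discrepancy:
        //
        //     java.time.OffsetDateTime.ofInstant(
        //             java.time.Instant.ofEpochSecond(n.getRel().getAuthorDate()),
        //             java.time.ZoneOffset.ofTotalSeconds(n.getRel().getAuthorDateOffset() * 60));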
        // if (expectedAuthorTimestamps.containsKey(relId)) {
        // assertEquals(expectedAuthorTimestamps.get(relId), n.getRel().getAuthorDate());
        // }
        if (expectedAuthorTimestampOffsets.containsKey(relId)) {
            assertEquals(expectedAuthorTimestampOffsets.get(relId), n.getRel().getAuthorDateOffset());
        }
    }
}

@Test
public void testOrigins() {
    List<SWHID> expectedOris = List.of(new SWHID(TEST_ORIGIN_ID));
    Map<SWHID, String> expectedUrls = Map.of(new SWHID(TEST_ORIGIN_ID), "https://example.com/swh/graph");
    for (SWHID oriSwhid : expectedOris) {
        Node n = client.getNode(GetNodeRequest.newBuilder().setSwhid(oriSwhid.toString()).build());
        assertTrue(n.hasOri());
        assertTrue(n.getOri().hasUrl());
        assertEquals(expectedUrls.get(oriSwhid), n.getOri().getUrl());
    }
}

@Test
public void testCntMask() {
    Node n;
    String swhid = fakeSWHID("cnt", 1).toString();

    // No mask, all fields present
    n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).build());
    assertTrue(n.hasCnt());
    assertTrue(n.getCnt().hasLength());
    assertEquals(42, n.getCnt().getLength());
    assertTrue(n.getCnt().hasIsSkipped());
    assertFalse(n.getCnt().getIsSkipped());

    // Empty mask, no fields present
    n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).setMask(FieldMask.getDefaultInstance()).build());
    assertFalse(n.getCnt().hasLength());
    assertFalse(n.getCnt().hasIsSkipped());

    // Mask with length, no isSkipped
    n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid)
            .setMask(FieldMask.newBuilder().addPaths("cnt.length").build()).build());
    assertTrue(n.getCnt().hasLength());
    assertFalse(n.getCnt().hasIsSkipped());

    // Mask with isSkipped, no length
    n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid)
            .setMask(FieldMask.newBuilder().addPaths("cnt.is_skipped").build()).build());
    assertFalse(n.getCnt().hasLength());
    assertTrue(n.getCnt().hasIsSkipped());
}

@Test
public void testRevMask() {
    Node n;
    String swhid = fakeSWHID("rev", 3).toString();

    // No mask, all fields present
    n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).build());
    assertTrue(n.hasRev());
    assertTrue(n.getRev().hasMessage());
    assertTrue(n.getRev().hasAuthor());
    assertTrue(n.getRev().hasAuthorDate());
    assertTrue(n.getRev().hasAuthorDateOffset());
    assertTrue(n.getRev().hasCommitter());
    assertTrue(n.getRev().hasCommitterDate());
    assertTrue(n.getRev().hasCommitterDateOffset());

    // Empty mask, no fields present
    n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).setMask(FieldMask.getDefaultInstance()).build());
    assertFalse(n.getRev().hasMessage());
    assertFalse(n.getRev().hasAuthor());
    assertFalse(n.getRev().hasAuthorDate());
    assertFalse(n.getRev().hasAuthorDateOffset());
    assertFalse(n.getRev().hasCommitter());
    assertFalse(n.getRev().hasCommitterDate());
    assertFalse(n.getRev().hasCommitterDateOffset());

    // Test all masks with single fields
    for (Descriptors.FieldDescriptor includedField : RevisionData.getDefaultInstance().getAllFields().keySet()) {
        n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid)
                .setMask(FieldMask.newBuilder().addPaths("rev."
+ includedField.getName()).build()).build()); for (Descriptors.FieldDescriptor f : n.getRev().getDescriptorForType().getFields()) { assertEquals(n.getRev().hasField(f), f.getName().equals(includedField.getName())); } } } @Test public void testRelMask() { Node n; String swhid = fakeSWHID("rel", 19).toString(); // No mask, all fields present n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).build()); assertTrue(n.hasRel()); assertTrue(n.getRel().hasMessage()); assertTrue(n.getRel().hasAuthor()); assertTrue(n.getRel().hasAuthorDate()); assertTrue(n.getRel().hasAuthorDateOffset()); // Empty mask, no fields present n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).setMask(FieldMask.getDefaultInstance()).build()); assertFalse(n.getRel().hasMessage()); assertFalse(n.getRel().hasAuthor()); assertFalse(n.getRel().hasAuthorDate()); assertFalse(n.getRel().hasAuthorDateOffset()); // Test all masks with single fields for (Descriptors.FieldDescriptor includedField : ReleaseData.getDefaultInstance().getAllFields().keySet()) { n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid) .setMask(FieldMask.newBuilder().addPaths("rel." + includedField.getName()).build()).build()); for (Descriptors.FieldDescriptor f : n.getRel().getDescriptorForType().getFields()) { assertEquals(n.getRel().hasField(f), f.getName().equals(includedField.getName())); } } } @Test public void testOriMask() { Node n; String swhid = TEST_ORIGIN_ID; // No mask, all fields present n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).build()); assertTrue(n.hasOri()); assertTrue(n.getOri().hasUrl()); // Empty mask, no fields present n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).setMask(FieldMask.getDefaultInstance()).build()); assertFalse(n.getOri().hasUrl()); // Test all masks with single fields for (Descriptors.FieldDescriptor includedField : OriginData.getDefaultInstance().getAllFields().keySet()) { n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid) .setMask(FieldMask.newBuilder().addPaths("ori." 
+ includedField.getName()).build()).build()); for (Descriptors.FieldDescriptor f : n.getOri().getDescriptorForType().getFields()) { assertEquals(n.getOri().hasField(f), f.getName().equals(includedField.getName())); } } } } diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/StatsTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/StatsTest.java index e422fab..e8224c3 100644 --- a/java/src/test/java/org/softwareheritage/graph/rpc/StatsTest.java +++ b/java/src/test/java/org/softwareheritage/graph/rpc/StatsTest.java @@ -1,18 +1,25 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; public class StatsTest extends TraversalServiceTest { @Test public void testStats() { StatsResponse stats = client.stats(StatsRequest.getDefaultInstance()); assertEquals(stats.getNumNodes(), 21); assertEquals(stats.getNumEdges(), 23); assertEquals(stats.getIndegreeMin(), 0); assertEquals(stats.getIndegreeMax(), 3); assertEquals(stats.getOutdegreeMin(), 0); assertEquals(stats.getOutdegreeMax(), 3); } } diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java index b11c1fc..862e1ea 100644 --- a/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java +++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java @@ -1,58 +1,65 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import io.grpc.ManagedChannel; import io.grpc.Server; import io.grpc.inprocess.InProcessChannelBuilder; import io.grpc.inprocess.InProcessServerBuilder; import io.grpc.testing.GrpcCleanupRule; import org.junit.Rule; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.softwareheritage.graph.GraphTest; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.SwhBidirectionalGraph; import java.util.ArrayList; import java.util.Iterator; public class TraversalServiceTest extends GraphTest { @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); private static Server server; private static ManagedChannel channel; protected static SwhBidirectionalGraph g; protected static TraversalServiceGrpc.TraversalServiceBlockingStub client; @BeforeAll static void setup() throws Exception { String serverName = InProcessServerBuilder.generateName(); g = GraphServer.loadGraph(getGraphPath().toString()); server = InProcessServerBuilder.forName(serverName).directExecutor() .addService(new GraphServer.TraversalService(g.copy())).build().start(); channel = InProcessChannelBuilder.forName(serverName).directExecutor().build(); client = TraversalServiceGrpc.newBlockingStub(channel); } @AfterAll static void teardown() { channel.shutdownNow(); server.shutdownNow(); } public ArrayList getSWHIDs(Iterator it) { ArrayList res = new ArrayList<>(); it.forEachRemaining((Node n) -> { res.add(new SWHID(n.getSwhid())); }); return res; } public ArrayList 
getSWHIDs(Path p) { ArrayList res = new ArrayList<>(); p.getNodeList().forEach((Node n) -> { res.add(new SWHID(n.getSwhid())); }); return res; } } diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseLeavesTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseLeavesTest.java index 907aec9..6e8a7ee 100644 --- a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseLeavesTest.java +++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseLeavesTest.java @@ -1,93 +1,100 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import org.junit.jupiter.api.Test; import org.softwareheritage.graph.GraphTest; import org.softwareheritage.graph.SWHID; import java.util.ArrayList; public class TraverseLeavesTest extends TraversalServiceTest { private TraversalRequest.Builder getLeavesRequestBuilder(SWHID src) { return TraversalRequest.newBuilder().addSrc(src.toString()) .setReturnNodes(NodeFilter.newBuilder().setMaxTraversalSuccessors(0).build()); } @Test public void forwardFromSnp() { TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("snp", 20)).build(); ArrayList expectedLeaves = new ArrayList<>(); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); ArrayList actualLeaves = getSWHIDs(client.traverse(request)); GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves); } @Test public void forwardFromRel() { TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("rel", 19)).build(); ArrayList actualLeaves = getSWHIDs(client.traverse(request)); ArrayList expectedLeaves = new ArrayList<>(); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000015")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000014")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000011")); GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves); } @Test public void backwardFromLeaf() { TraversalRequest request1 = getLeavesRequestBuilder(fakeSWHID("cnt", 15)).setDirection(GraphDirection.BACKWARD) .build(); ArrayList actualLeaves1 = getSWHIDs(client.traverse(request1)); ArrayList expectedLeaves1 = new ArrayList<>(); expectedLeaves1.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000019")); GraphTest.assertEqualsAnyOrder(expectedLeaves1, actualLeaves1); TraversalRequest request2 = getLeavesRequestBuilder(fakeSWHID("cnt", 4)).setDirection(GraphDirection.BACKWARD) .build(); ArrayList actualLeaves2 = getSWHIDs(client.traverse(request2)); ArrayList expectedLeaves2 = new ArrayList<>(); expectedLeaves2.add(new SWHID(TEST_ORIGIN_ID)); expectedLeaves2.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000019")); 
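// In the backward graph a "leaf" is a node with no successors, i.e. a node with no
// incoming edge in the forward graph: from cnt 4 the traversal reaches both the
// origin (through snp 20) and rel 19 (through rev 18), and neither has a
// predecessor, so both are reported; cnt 15 above only reaches rel 19.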
GraphTest.assertEqualsAnyOrder(expectedLeaves2, actualLeaves2); } @Test public void forwardRevToRevOnly() { TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("rev", 18)).setEdges("rev:rev").build(); ArrayList actualLeaves = getSWHIDs(client.traverse(request)); ArrayList expectedLeaves = new ArrayList<>(); expectedLeaves.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000003")); GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves); } @Test public void forwardDirToAll() { TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("dir", 8)).setEdges("dir:*").build(); ArrayList actualLeaves = getSWHIDs(client.traverse(request)); ArrayList expectedLeaves = new ArrayList<>(); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves); } @Test public void backwardCntToDirDirToDir() { TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("cnt", 5)).setEdges("cnt:dir,dir:dir") .setDirection(GraphDirection.BACKWARD).build(); ArrayList actualLeaves = getSWHIDs(client.traverse(request)); ArrayList expectedLeaves = new ArrayList<>(); expectedLeaves.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000012")); GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves); } } diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNeighborsTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNeighborsTest.java index bb43920..94c92fa 100644 --- a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNeighborsTest.java +++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNeighborsTest.java @@ -1,130 +1,137 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import org.junit.jupiter.api.Test; import org.softwareheritage.graph.GraphTest; import org.softwareheritage.graph.SWHID; import java.util.ArrayList; public class TraverseNeighborsTest extends TraversalServiceTest { private TraversalRequest.Builder getNeighborsRequestBuilder(SWHID src) { return TraversalRequest.newBuilder().addSrc(src.toString()).setMinDepth(1).setMaxDepth(1); } @Test public void zeroNeighbor() { ArrayList expectedNodes = new ArrayList<>(); TraversalRequest request1 = getNeighborsRequestBuilder(new SWHID(TEST_ORIGIN_ID)) .setDirection(GraphDirection.BACKWARD).build(); ArrayList actuals1 = getSWHIDs(client.traverse(request1)); GraphTest.assertEqualsAnyOrder(expectedNodes, actuals1); TraversalRequest request2 = getNeighborsRequestBuilder(fakeSWHID("cnt", 4)).build(); ArrayList actuals2 = getSWHIDs(client.traverse(request2)); GraphTest.assertEqualsAnyOrder(expectedNodes, actuals2); TraversalRequest request3 = getNeighborsRequestBuilder(fakeSWHID("cnt", 15)).build(); ArrayList actuals3 = getSWHIDs(client.traverse(request3)); GraphTest.assertEqualsAnyOrder(expectedNodes, actuals3); TraversalRequest request4 = getNeighborsRequestBuilder(fakeSWHID("rel", 19)) .setDirection(GraphDirection.BACKWARD).build(); ArrayList actuals4 = getSWHIDs(client.traverse(request4)); 
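// rel 19 has no incoming edge in the test graph, so its backward neighborhood
// (min_depth = max_depth = 1) is empty, just like the origin's.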
GraphTest.assertEqualsAnyOrder(expectedNodes, actuals4); TraversalRequest request5 = getNeighborsRequestBuilder(fakeSWHID("dir", 8)).setEdges("snp:*,rev:*,rel:*") .build(); ArrayList actuals5 = getSWHIDs(client.traverse(request5)); GraphTest.assertEqualsAnyOrder(expectedNodes, actuals5); } @Test public void oneNeighbor() { TraversalRequest request1 = getNeighborsRequestBuilder(fakeSWHID("rev", 3)).build(); ArrayList actuals1 = getSWHIDs(client.traverse(request1)); ArrayList expectedNodes1 = new ArrayList<>(); expectedNodes1.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002")); GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1); TraversalRequest request2 = getNeighborsRequestBuilder(fakeSWHID("dir", 17)).setEdges("dir:cnt").build(); ArrayList actuals2 = getSWHIDs(client.traverse(request2)); ArrayList expectedNodes2 = new ArrayList<>(); expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000014")); GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2); TraversalRequest request3 = getNeighborsRequestBuilder(fakeSWHID("dir", 12)) .setDirection(GraphDirection.BACKWARD).build(); ArrayList actuals3 = getSWHIDs(client.traverse(request3)); ArrayList expectedNodes3 = new ArrayList<>(); expectedNodes3.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013")); GraphTest.assertEqualsAnyOrder(expectedNodes3, actuals3); TraversalRequest request4 = getNeighborsRequestBuilder(fakeSWHID("rev", 9)) .setDirection(GraphDirection.BACKWARD).setEdges("rev:rev").build(); ArrayList actuals4 = getSWHIDs(client.traverse(request4)); ArrayList expectedNodes4 = new ArrayList<>(); expectedNodes4.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013")); GraphTest.assertEqualsAnyOrder(expectedNodes4, actuals4); TraversalRequest request5 = getNeighborsRequestBuilder(fakeSWHID("snp", 20)) .setDirection(GraphDirection.BACKWARD).build(); ArrayList actuals5 = getSWHIDs(client.traverse(request5)); ArrayList expectedNodes5 = new ArrayList<>(); expectedNodes5.add(new SWHID(TEST_ORIGIN_ID)); GraphTest.assertEqualsAnyOrder(expectedNodes5, actuals5); } @Test public void twoNeighbors() { TraversalRequest request1 = getNeighborsRequestBuilder(fakeSWHID("snp", 20)).build(); ArrayList actuals1 = getSWHIDs(client.traverse(request1)); ArrayList expectedNodes1 = new ArrayList<>(); expectedNodes1.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010")); expectedNodes1.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000009")); GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1); TraversalRequest request2 = getNeighborsRequestBuilder(fakeSWHID("dir", 8)).setEdges("dir:cnt").build(); ArrayList actuals2 = getSWHIDs(client.traverse(request2)); ArrayList expectedNodes2 = new ArrayList<>(); expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2); TraversalRequest request3 = getNeighborsRequestBuilder(fakeSWHID("cnt", 1)) .setDirection(GraphDirection.BACKWARD).build(); ArrayList actuals3 = getSWHIDs(client.traverse(request3)); ArrayList expectedNodes3 = new ArrayList<>(); expectedNodes3.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000008")); expectedNodes3.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002")); GraphTest.assertEqualsAnyOrder(expectedNodes3, actuals3); TraversalRequest request4 = getNeighborsRequestBuilder(fakeSWHID("rev", 9)) 
.setDirection(GraphDirection.BACKWARD).setEdges("rev:snp,rev:rel").build(); ArrayList actuals4 = getSWHIDs(client.traverse(request4)); ArrayList expectedNodes4 = new ArrayList<>(); expectedNodes4.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020")); expectedNodes4.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010")); GraphTest.assertEqualsAnyOrder(expectedNodes4, actuals4); } @Test public void threeNeighbors() { TraversalRequest request1 = getNeighborsRequestBuilder(fakeSWHID("dir", 8)).build(); ArrayList actuals1 = getSWHIDs(client.traverse(request1)); ArrayList expectedNodes1 = new ArrayList<>(); expectedNodes1.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000006")); expectedNodes1.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); expectedNodes1.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1); TraversalRequest request2 = getNeighborsRequestBuilder(fakeSWHID("rev", 9)) .setDirection(GraphDirection.BACKWARD).build(); ArrayList actuals2 = getSWHIDs(client.traverse(request2)); ArrayList expectedNodes2 = new ArrayList<>(); expectedNodes2.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020")); expectedNodes2.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010")); expectedNodes2.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013")); GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2); } } diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesPropertiesTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesPropertiesTest.java index 2c55507..9a0ab38 100644 --- a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesPropertiesTest.java +++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesPropertiesTest.java @@ -1,110 +1,117 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import com.google.protobuf.Descriptors; import com.google.protobuf.FieldMask; import com.google.protobuf.Message; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import org.junit.jupiter.api.Test; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.SwhUnidirectionalGraph; import org.softwareheritage.graph.labels.DirEntry; import java.util.*; import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; public class TraverseNodesPropertiesTest extends TraversalServiceTest { private TraversalRequest.Builder getTraversalRequestBuilder(SWHID src) { return TraversalRequest.newBuilder().addSrc(src.toString()); } private void checkHasAllFields(Message m) { for (Descriptors.FieldDescriptor fd : m.getAllFields().keySet()) { assertTrue(m.hasField(fd)); } } private void checkHasAllFieldsOfType(Node node) { if (node.hasCnt()) { checkHasAllFields(node.getCnt()); } if (node.hasRev()) { checkHasAllFields(node.getRev()); } if (node.hasRel()) { checkHasAllFields(node.getRel()); } if (node.hasOri()) { checkHasAllFields(node.getOri()); } } private void checkSuccessors(SwhUnidirectionalGraph g, Node node) { HashMap graphSuccessors = new HashMap<>(); ArcLabelledNodeIterator.LabelledArcIterator it = 
g.labelledSuccessors(g.getNodeId(new SWHID(node.getSwhid()))); long succ; while ((succ = it.nextLong()) != -1) { graphSuccessors.put(g.getSWHID(succ).toString(), (DirEntry[]) it.label().get()); } assertEquals(node.getSuccessorList().stream().map(Successor::getSwhid).collect(Collectors.toSet()), graphSuccessors.keySet()); for (Successor successor : node.getSuccessorList()) { DirEntry[] expectedArray = graphSuccessors.get(successor.getSwhid()); HashMap expectedLabels = new HashMap<>(); for (DirEntry dirEntry : expectedArray) { expectedLabels.put(new String(g.getLabelName(dirEntry.filenameId)), dirEntry.permission); } for (EdgeLabel edgeLabel : successor.getLabelList()) { assertTrue(expectedLabels.containsKey(edgeLabel.getName().toStringUtf8())); if (edgeLabel.getPermission() > 0) { assertEquals(edgeLabel.getPermission(), expectedLabels.get(edgeLabel.getName().toStringUtf8())); } } } } @Test public void forwardFromRoot() { ArrayList response = new ArrayList<>(); client.traverse(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).build()).forEachRemaining(response::add); for (Node node : response) { checkHasAllFieldsOfType(node); checkSuccessors(g.getForwardGraph(), node); } } @Test public void backwardFromLeaf() { ArrayList response = new ArrayList<>(); client.traverse(getTraversalRequestBuilder(fakeSWHID("cnt", 4)).setDirection(GraphDirection.BACKWARD).build()) .forEachRemaining(response::add); for (Node node : response) { checkHasAllFieldsOfType(node); checkSuccessors(g.getBackwardGraph(), node); } } @Test public void forwardFromRootMaskedLabels() { ArrayList response = new ArrayList<>(); client.traverse(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)) .setMask(FieldMask.newBuilder().addPaths("successor.swhid").addPaths("swhid").build()).build()) .forEachRemaining(response::add); for (Node node : response) { HashSet graphSuccessors = new HashSet<>(); ArcLabelledNodeIterator.LabelledArcIterator it = g .labelledSuccessors(g.getNodeId(new SWHID(node.getSwhid()))); long succ; while ((succ = it.nextLong()) != -1) { graphSuccessors.add(g.getSWHID(succ).toString()); } assertEquals(node.getSuccessorList().stream().map(Successor::getSwhid).collect(Collectors.toSet()), graphSuccessors); } } } diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesTest.java index 7865f36..fe88c5f 100644 --- a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesTest.java +++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesTest.java @@ -1,250 +1,257 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.rpc; import io.grpc.Status; import io.grpc.StatusRuntimeException; import org.junit.jupiter.api.Test; import org.softwareheritage.graph.GraphTest; import org.softwareheritage.graph.SWHID; import java.util.ArrayList; import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; public class TraverseNodesTest extends TraversalServiceTest { private TraversalRequest.Builder getTraversalRequestBuilder(SWHID src) { return TraversalRequest.newBuilder().addSrc(src.toString()); } @Test public void testSrcErrors() { StatusRuntimeException thrown; thrown = 
assertThrows(StatusRuntimeException.class, () -> client.traverse(TraversalRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build()) .forEachRemaining((n) -> { })); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); thrown = assertThrows(StatusRuntimeException.class, () -> client .traverse(TraversalRequest.newBuilder() .addSrc("swh:1:lol:0000000000000000000000000000000000000001").build()) .forEachRemaining((n) -> { })); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); thrown = assertThrows(StatusRuntimeException.class, () -> client .traverse(TraversalRequest.newBuilder() .addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build()) .forEachRemaining((n) -> { })); assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); } @Test public void forwardFromRoot() { ArrayList actual = getSWHIDs( client.traverse(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).build())); List expected = List.of(fakeSWHID("cnt", 1), fakeSWHID("cnt", 4), fakeSWHID("cnt", 5), fakeSWHID("cnt", 7), fakeSWHID("dir", 2), fakeSWHID("dir", 6), fakeSWHID("dir", 8), fakeSWHID("rel", 10), fakeSWHID("rev", 3), fakeSWHID("rev", 9), fakeSWHID("snp", 20), new SWHID(TEST_ORIGIN_ID)); GraphTest.assertEqualsAnyOrder(expected, actual); } @Test public void forwardFromMiddle() { ArrayList actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("dir", 12)).build())); List expected = List.of(fakeSWHID("cnt", 1), fakeSWHID("cnt", 4), fakeSWHID("cnt", 5), fakeSWHID("cnt", 7), fakeSWHID("cnt", 11), fakeSWHID("dir", 6), fakeSWHID("dir", 8), fakeSWHID("dir", 12)); GraphTest.assertEqualsAnyOrder(expected, actual); } @Test public void forwardRelRev() { ArrayList actual = getSWHIDs( client.traverse(getTraversalRequestBuilder(fakeSWHID("rel", 10)).setEdges("rel:rev,rev:rev").build())); List expected = List.of(fakeSWHID("rel", 10), fakeSWHID("rev", 9), fakeSWHID("rev", 3)); GraphTest.assertEqualsAnyOrder(expected, actual); } @Test public void forwardFilterReturnedNodesDir() { ArrayList actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("rel", 10)) .setReturnNodes(NodeFilter.newBuilder().setTypes("dir").build()).build())); List expected = List.of(fakeSWHID("dir", 2), fakeSWHID("dir", 8), fakeSWHID("dir", 6)); GraphTest.assertEqualsAnyOrder(expected, actual); } @Test public void backwardFromRoot() { ArrayList actual = getSWHIDs(client.traverse( getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).setDirection(GraphDirection.BACKWARD).build())); List expected = List.of(new SWHID(TEST_ORIGIN_ID)); GraphTest.assertEqualsAnyOrder(expected, actual); } @Test public void backwardFromMiddle() { ArrayList actual = getSWHIDs(client.traverse( getTraversalRequestBuilder(fakeSWHID("dir", 12)).setDirection(GraphDirection.BACKWARD).build())); List expected = List.of(fakeSWHID("dir", 12), fakeSWHID("rel", 19), fakeSWHID("rev", 13), fakeSWHID("rev", 18)); GraphTest.assertEqualsAnyOrder(expected, actual); } @Test public void backwardFromLeaf() { ArrayList actual = getSWHIDs(client.traverse( getTraversalRequestBuilder(fakeSWHID("cnt", 4)).setDirection(GraphDirection.BACKWARD).build())); List expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("cnt", 4), fakeSWHID("dir", 6), fakeSWHID("dir", 8), fakeSWHID("dir", 12), fakeSWHID("rel", 10), fakeSWHID("rel", 19), fakeSWHID("rev", 9), fakeSWHID("rev", 13), fakeSWHID("rev", 18), fakeSWHID("snp", 20)); GraphTest.assertEqualsAnyOrder(expected, actual); } @Test public void 
    @Test
    public void forwardSnpToRev() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.traverse(getTraversalRequestBuilder(fakeSWHID("snp", 20)).setEdges("snp:rev").build()));
        List<SWHID> expected = List.of(fakeSWHID("rev", 9), fakeSWHID("snp", 20));
        GraphTest.assertEqualsAnyOrder(expected, actual);
    }

    @Test
    public void forwardRelToRevRevToRev() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.traverse(getTraversalRequestBuilder(fakeSWHID("rel", 10)).setEdges("rel:rev,rev:rev").build()));
        List<SWHID> expected = List.of(fakeSWHID("rel", 10), fakeSWHID("rev", 3), fakeSWHID("rev", 9));
        GraphTest.assertEqualsAnyOrder(expected, actual);
    }

    @Test
    public void forwardRevToAllDirToAll() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.traverse(getTraversalRequestBuilder(fakeSWHID("rev", 13)).setEdges("rev:*,dir:*").build()));
        List<SWHID> expected = List.of(fakeSWHID("cnt", 1), fakeSWHID("cnt", 4), fakeSWHID("cnt", 5),
                fakeSWHID("cnt", 7), fakeSWHID("cnt", 11), fakeSWHID("dir", 2), fakeSWHID("dir", 6),
                fakeSWHID("dir", 8), fakeSWHID("dir", 12), fakeSWHID("rev", 3), fakeSWHID("rev", 9),
                fakeSWHID("rev", 13));
        GraphTest.assertEqualsAnyOrder(expected, actual);
    }

    @Test
    public void forwardSnpToAllRevToAll() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.traverse(getTraversalRequestBuilder(fakeSWHID("snp", 20)).setEdges("snp:*,rev:*").build()));
        List<SWHID> expected = List.of(fakeSWHID("dir", 2), fakeSWHID("dir", 8), fakeSWHID("rel", 10),
                fakeSWHID("rev", 3), fakeSWHID("rev", 9), fakeSWHID("snp", 20));
        GraphTest.assertEqualsAnyOrder(expected, actual);
    }

    @Test
    public void forwardNoEdges() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.traverse(getTraversalRequestBuilder(fakeSWHID("snp", 20)).setEdges("").build()));
        List<SWHID> expected = List.of(fakeSWHID("snp", 20));
        GraphTest.assertEqualsAnyOrder(expected, actual);
    }

    @Test
    public void backwardRevToRevRevToRel() {
        ArrayList<SWHID> actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("rev", 3))
                .setEdges("rev:rev,rev:rel").setDirection(GraphDirection.BACKWARD).build()));
        List<SWHID> expected = List.of(fakeSWHID("rel", 10), fakeSWHID("rel", 19), fakeSWHID("rev", 3),
                fakeSWHID("rev", 9), fakeSWHID("rev", 13), fakeSWHID("rev", 18));
        GraphTest.assertEqualsAnyOrder(expected, actual);
    }

    @Test
    public void forwardFromRootNodesOnly() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.traverse(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).build()));
        List<SWHID> expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("cnt", 1), fakeSWHID("cnt", 4),
                fakeSWHID("cnt", 5), fakeSWHID("cnt", 7), fakeSWHID("dir", 2), fakeSWHID("dir", 6),
                fakeSWHID("dir", 8), fakeSWHID("rel", 10), fakeSWHID("rev", 3), fakeSWHID("rev", 9),
                fakeSWHID("snp", 20));
        GraphTest.assertEqualsAnyOrder(expected, actual);
    }

    @Test
    public void backwardRevToAllNodesOnly() {
        ArrayList<SWHID> actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("rev", 3))
                .setDirection(GraphDirection.BACKWARD).setEdges("rev:*").build()));
        List<SWHID> expected = List.of(fakeSWHID("rel", 10), fakeSWHID("rel", 19), fakeSWHID("rev", 3),
                fakeSWHID("rev", 9), fakeSWHID("rev", 13), fakeSWHID("rev", 18), fakeSWHID("snp", 20));
        GraphTest.assertEqualsAnyOrder(expected, actual);
    }
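A note on the BACKWARD cases above: a backward traversal is exactly a forward traversal over the transposed graph, which is why the property tests earlier in this patch check forward results against getForwardGraph() and backward results against getBackwardGraph(). As a quick illustration of the transposition itself (illustrative only; the service operates on precompressed graph views, not adjacency maps):

    import java.util.*;

    // Illustrative only: a forward BFS over transpose(adj) visits exactly
    // what a BACKWARD traversal over adj visits.
    public class Transpose {
        static Map<String, List<String>> transpose(Map<String, List<String>> adj) {
            Map<String, List<String>> transposed = new HashMap<>();
            adj.forEach((src, dsts) -> {
                for (String dst : dsts) {
                    transposed.computeIfAbsent(dst, k -> new ArrayList<>()).add(src);
                }
            });
            return transposed;
        }
    }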
getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("cnt", 5)) .addSrc(fakeSWHID("dir", 16).toString()).setMaxDepth(2).setDirection(GraphDirection.BACKWARD).build())); List expected = List.of(fakeSWHID("cnt", 5), fakeSWHID("dir", 16), fakeSWHID("dir", 6), fakeSWHID("dir", 8), fakeSWHID("dir", 17), fakeSWHID("rev", 18)); GraphTest.assertEqualsAnyOrder(expected, actual); } // Go from rel 19 with various max depths @Test public void maxDepth() { TraversalRequest.Builder builder = getTraversalRequestBuilder(fakeSWHID("rel", 19)); ArrayList actual; List expected; actual = getSWHIDs(client.traverse(builder.setMaxDepth(0).build())); expected = List.of(fakeSWHID("rel", 19)); GraphTest.assertEqualsAnyOrder(expected, actual); actual = getSWHIDs(client.traverse(builder.setMaxDepth(1).build())); expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18)); GraphTest.assertEqualsAnyOrder(expected, actual); actual = getSWHIDs(client.traverse(builder.setMaxDepth(2).build())); expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("dir", 17)); GraphTest.assertEqualsAnyOrder(expected, actual); actual = getSWHIDs(client.traverse(builder.setMaxDepth(3).build())); expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("dir", 17), fakeSWHID("rev", 9), fakeSWHID("dir", 12), fakeSWHID("dir", 16), fakeSWHID("cnt", 14)); GraphTest.assertEqualsAnyOrder(expected, actual); } // Go from rel 19 with various max edges @Test public void maxEdges() { TraversalRequest.Builder builder = getTraversalRequestBuilder(fakeSWHID("rel", 19)); ArrayList actual; List expected; actual = getSWHIDs(client.traverse(builder.setMaxEdges(1).build())); expected = List.of(fakeSWHID("rel", 19)); GraphTest.assertEqualsAnyOrder(expected, actual); actual = getSWHIDs(client.traverse(builder.setMaxEdges(3).build())); expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18)); GraphTest.assertEqualsAnyOrder(expected, actual); actual = getSWHIDs(client.traverse(builder.setMaxEdges(7).build())); expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("dir", 17), fakeSWHID("cnt", 14)); GraphTest.assertEqualsAnyOrder(expected, actual); actual = getSWHIDs(client.traverse(builder.setMaxEdges(12).build())); expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("dir", 17), fakeSWHID("rev", 9), fakeSWHID("dir", 12), fakeSWHID("dir", 16), fakeSWHID("cnt", 14), fakeSWHID("cnt", 15)); GraphTest.assertEqualsAnyOrder(expected, actual); } } diff --git a/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java index d368b03..ebc92a7 100644 --- a/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java +++ b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java @@ -1,86 +1,93 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.longs.LongArrays; import org.junit.jupiter.api.Test; import java.util.Random; import static org.junit.jupiter.api.Assertions.assertTrue; public class ForkJoinBigQuickSort2Test { private 
diff --git a/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java
index d368b03..ebc92a7 100644
--- a/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java
+++ b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java
@@ -1,86 +1,93 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;

import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.longs.LongArrays;
import org.junit.jupiter.api.Test;

import java.util.Random;

import static org.junit.jupiter.api.Assertions.assertTrue;

public class ForkJoinBigQuickSort2Test {
    private static long[] identity(final int n) {
        final long[] perm = new long[n];
        for (int i = perm.length; i-- != 0;)
            perm[i] = i;
        return perm;
    }

    private static void checkArraySorted(long[] x, long[] y) {
        checkArraySorted(x, y, 0, x.length);
    }

    private static void checkArraySorted(long[] x, long[] y, int from, int to) {
        for (int i = to - 1; i-- != from;)
            assertTrue(x[i] < x[i + 1] || x[i] == x[i + 1] && (y[i] < y[i + 1] || y[i] == y[i + 1]),
                    String.format("%d: <%d, %d>, <%d, %d>", i, x[i], y[i], x[i + 1], y[i + 1]));
    }

    private static void sortBig2(long[] x, long[] y, long from, long to) {
        ForkJoinBigQuickSort2.parallelQuickSort(BigArrays.wrap(x), BigArrays.wrap(y), from, to);
    }

    private static void sortBig2(long[] x, long[] y) {
        sortBig2(x, y, 0, x.length);
    }

    @Test
    public void testParallelQuickSort2() {
        final long[][] d = new long[2][];

        d[0] = new long[10];
        for (int i = d[0].length; i-- != 0;)
            d[0][i] = 3 - i % 3;
        d[1] = LongArrays.shuffle(identity(10), new Random(0));
        sortBig2(d[0], d[1]);
        checkArraySorted(d[0], d[1]);

        d[0] = new long[100000];
        for (int i = d[0].length; i-- != 0;)
            d[0][i] = 100 - i % 100;
        d[1] = LongArrays.shuffle(identity(100000), new Random(6));
        sortBig2(d[0], d[1]);
        checkArraySorted(d[0], d[1]);

        d[0] = new long[10];
        for (int i = d[0].length; i-- != 0;)
            d[0][i] = i % 3 - 2;
        Random random = new Random(0);
        d[1] = new long[d[0].length];
        for (int i = d[1].length; i-- != 0;)
            d[1][i] = random.nextInt();
        sortBig2(d[0], d[1]);
        checkArraySorted(d[0], d[1]);

        d[0] = new long[100000];
        d[1] = new long[100000];
        sortBig2(d[0], d[1]);
        checkArraySorted(d[0], d[1]);

        d[0] = new long[100000];
        random = new Random(0);
        for (int i = d[0].length; i-- != 0;)
            d[0][i] = random.nextInt();
        d[1] = new long[d[0].length];
        for (int i = d[1].length; i-- != 0;)
            d[1][i] = random.nextInt();
        sortBig2(d[0], d[1]);
        checkArraySorted(d[0], d[1]);

        for (int i = 100; i-- != 10;)
            d[0][i] = random.nextInt();
        for (int i = 100; i-- != 10;)
            d[1][i] = random.nextInt();
        sortBig2(d[0], d[1], 10, 100);
        checkArraySorted(d[0], d[1], 10, 100);
    }
}
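The inequality chain inside checkArraySorted is just a spelled-out lexicographic comparison of consecutive (x[i], y[i]) pairs. Restated with Long.compare, the same pairwise condition reads as follows (an equivalent formulation for clarity, not part of the patch):

    // Illustrative restatement of the invariant checked by checkArraySorted:
    // (x1, y1) <= (x2, y2) in lexicographic order.
    static boolean pairLessOrEqual(long x1, long y1, long x2, long y2) {
        int c = Long.compare(x1, x2);
        return c < 0 || (c == 0 && y1 <= y2);
    }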
diff --git a/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3Test.java b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3Test.java
index a559b4a..1f1fa38 100644
--- a/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3Test.java
+++ b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3Test.java
@@ -1,90 +1,97 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;

import it.unimi.dsi.fastutil.longs.LongArrays;
import org.junit.jupiter.api.Test;

import java.util.Random;

import static org.junit.jupiter.api.Assertions.assertTrue;

public class ForkJoinQuickSort3Test {
    private static long[] identity(final int n) {
        final long[] perm = new long[n];
        for (int i = perm.length; i-- != 0;)
            perm[i] = i;
        return perm;
    }

    private static void checkArraySorted(long[] x, long[] y, long[] z) {
        checkArraySorted(x, y, z, 0, x.length);
    }

    private static void checkArraySorted(long[] x, long[] y, long[] z, int from, int to) {
        for (int i = to - 1; i-- != from;)
            assertTrue(
                    x[i] < x[i + 1] || x[i] == x[i + 1] && (y[i] < y[i + 1] || y[i] == y[i + 1] && z[i] <= z[i + 1]),
                    String.format("%d: <%d, %d, %d>, <%d, %d, %d>", i, x[i], y[i], z[i], x[i + 1], y[i + 1],
                            z[i + 1]));
    }

    @Test
    public void testParallelQuickSort3() {
        final long[][] d = new long[3][];

        d[0] = new long[10];
        for (int i = d[0].length; i-- != 0;)
            d[0][i] = 3 - i % 3;
        d[1] = LongArrays.shuffle(identity(10), new Random(0));
        d[2] = LongArrays.shuffle(identity(10), new Random(1));
        ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]);
        checkArraySorted(d[0], d[1], d[2]);

        d[0] = new long[100000];
        for (int i = d[0].length; i-- != 0;)
            d[0][i] = 100 - i % 100;
        d[1] = LongArrays.shuffle(identity(100000), new Random(6));
        d[2] = LongArrays.shuffle(identity(100000), new Random(7));
        ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]);
        checkArraySorted(d[0], d[1], d[2]);

        d[0] = new long[10];
        for (int i = d[0].length; i-- != 0;)
            d[0][i] = i % 3 - 2;
        Random random = new Random(0);
        d[1] = new long[d[0].length];
        for (int i = d[1].length; i-- != 0;)
            d[1][i] = random.nextInt();
        d[2] = new long[d[0].length];
        for (int i = d[2].length; i-- != 0;)
            d[2][i] = random.nextInt();
        ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]);
        checkArraySorted(d[0], d[1], d[2]);

        d[0] = new long[100000];
        d[1] = new long[100000];
        d[2] = new long[100000];
        for (int i = d[0].length; i-- != 0;)
            d[2][i] = random.nextInt();
        ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]);
        checkArraySorted(d[0], d[1], d[2]);

        d[0] = new long[100000];
        random = new Random(0);
        for (int i = d[0].length; i-- != 0;)
            d[0][i] = random.nextInt();
        d[1] = new long[d[0].length];
        for (int i = d[1].length; i-- != 0;)
            d[1][i] = random.nextInt();
        d[2] = new long[d[0].length];
        for (int i = d[2].length; i-- != 0;)
            d[2][i] = random.nextInt();
        ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]);
        checkArraySorted(d[0], d[1], d[2]);

        for (int i = 100; i-- != 10;)
            d[0][i] = random.nextInt();
        for (int i = 100; i-- != 10;)
            d[1][i] = random.nextInt();
        for (int i = 100; i-- != 10;)
            d[2][i] = random.nextInt();
        ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2], 10, 100);
        checkArraySorted(d[0], d[1], d[2], 10, 100);
    }
}
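Like its big-array sibling above, ForkJoinQuickSort3 exists to co-sort parallel "column" arrays by a compound key, e.g. an edge list kept as separate source, destination, and label columns. A minimal usage sketch (the array contents are invented for illustration):

    // Illustrative usage: sort three parallel columns in lockstep by (src, dst, lbl).
    long[] src = {2, 1, 2, 1};
    long[] dst = {9, 9, 3, 4};
    long[] lbl = {0, 1, 2, 3};
    ForkJoinQuickSort3.parallelQuickSort(src, dst, lbl);
    // src is now {1, 1, 2, 2}; dst is {4, 9, 3, 9} and lbl is {3, 1, 2, 0},
    // i.e. the companion columns moved together with their rows.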
diff --git a/swh/graph/tests/test_http_client.py b/swh/graph/tests/test_http_client.py
index 93240ef..21021b3 100644
--- a/swh/graph/tests/test_http_client.py
+++ b/swh/graph/tests/test_http_client.py
@@ -1,373 +1,378 @@
+# Copyright (c) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
import hashlib

import pytest
from pytest import raises

from swh.core.api import RemoteException
from swh.graph.http_client import GraphArgumentException

TEST_ORIGIN_ID = "swh:1:ori:{}".format(
    hashlib.sha1(b"https://example.com/swh/graph").hexdigest()
)


def test_stats(graph_client):
    stats = graph_client.stats()
    assert stats["num_nodes"] == 21
    assert stats["num_edges"] == 23
    assert isinstance(stats["compression_ratio"], float)
    assert isinstance(stats["bits_per_node"], float)
    assert isinstance(stats["bits_per_edge"], float)
    assert isinstance(stats["avg_locality"], float)
    assert stats["indegree_min"] == 0
    assert stats["indegree_max"] == 3
    assert isinstance(stats["indegree_avg"], float)
    assert stats["outdegree_min"] == 0
    assert stats["outdegree_max"] == 3
    assert isinstance(stats["outdegree_avg"], float)


def test_leaves(graph_client):
    actual = list(graph_client.leaves(TEST_ORIGIN_ID))
    expected = [
        "swh:1:cnt:0000000000000000000000000000000000000001",
        "swh:1:cnt:0000000000000000000000000000000000000004",
        "swh:1:cnt:0000000000000000000000000000000000000005",
        "swh:1:cnt:0000000000000000000000000000000000000007",
    ]
    assert set(actual) == set(expected)


def test_neighbors(graph_client):
    actual = list(
        graph_client.neighbors(
            "swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
        )
    )
    expected = [
        "swh:1:snp:0000000000000000000000000000000000000020",
        "swh:1:rel:0000000000000000000000000000000000000010",
        "swh:1:rev:0000000000000000000000000000000000000013",
    ]
    assert set(actual) == set(expected)


def test_visit_nodes(graph_client):
    actual = list(
        graph_client.visit_nodes(
            "swh:1:rel:0000000000000000000000000000000000000010",
            edges="rel:rev,rev:rev",
        )
    )
    expected = [
        "swh:1:rel:0000000000000000000000000000000000000010",
        "swh:1:rev:0000000000000000000000000000000000000009",
        "swh:1:rev:0000000000000000000000000000000000000003",
    ]
    assert set(actual) == set(expected)


def test_visit_nodes_filtered(graph_client):
    actual = list(
        graph_client.visit_nodes(
            "swh:1:rel:0000000000000000000000000000000000000010",
            return_types="dir",
        )
    )
    expected = [
        "swh:1:dir:0000000000000000000000000000000000000002",
        "swh:1:dir:0000000000000000000000000000000000000008",
        "swh:1:dir:0000000000000000000000000000000000000006",
    ]
    assert set(actual) == set(expected)


def test_visit_nodes_filtered_star(graph_client):
    actual = list(
        graph_client.visit_nodes(
            "swh:1:rel:0000000000000000000000000000000000000010",
            return_types="*",
        )
    )
    expected = [
        "swh:1:rel:0000000000000000000000000000000000000010",
        "swh:1:rev:0000000000000000000000000000000000000009",
        "swh:1:rev:0000000000000000000000000000000000000003",
        "swh:1:dir:0000000000000000000000000000000000000002",
        "swh:1:cnt:0000000000000000000000000000000000000001",
        "swh:1:dir:0000000000000000000000000000000000000008",
        "swh:1:cnt:0000000000000000000000000000000000000007",
        "swh:1:dir:0000000000000000000000000000000000000006",
        "swh:1:cnt:0000000000000000000000000000000000000004",
        "swh:1:cnt:0000000000000000000000000000000000000005",
    ]
    assert set(actual) == set(expected)


def test_visit_edges(graph_client):
    actual = list(
        graph_client.visit_edges(
            "swh:1:rel:0000000000000000000000000000000000000010",
            edges="rel:rev,rev:rev,rev:dir",
        )
    )
    expected = [
        (
            "swh:1:rel:0000000000000000000000000000000000000010",
            "swh:1:rev:0000000000000000000000000000000000000009",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rev:0000000000000000000000000000000000000003",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:dir:0000000000000000000000000000000000000008",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000003",
            "swh:1:dir:0000000000000000000000000000000000000002",
        ),
    ]
    assert set(actual) == set(expected)


def test_visit_edges_limited(graph_client):
    actual = list(
        graph_client.visit_edges(
            "swh:1:rel:0000000000000000000000000000000000000010",
            max_edges=4,
            edges="rel:rev,rev:rev,rev:dir",
        )
    )
    expected = [
        (
            "swh:1:rel:0000000000000000000000000000000000000010",
            "swh:1:rev:0000000000000000000000000000000000000009",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rev:0000000000000000000000000000000000000003",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:dir:0000000000000000000000000000000000000008",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000003",
            "swh:1:dir:0000000000000000000000000000000000000002",
        ),
    ]
    # As there are four valid answers (up to reordering), we cannot check for
    # equality. Instead, we check the client returned all edges but one.
    assert set(actual).issubset(set(expected))
    assert len(actual) == 3


def test_visit_edges_diamond_pattern(graph_client):
    actual = list(
        graph_client.visit_edges(
            "swh:1:rev:0000000000000000000000000000000000000009",
            edges="*",
        )
    )
    expected = [
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:rev:0000000000000000000000000000000000000003",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000009",
            "swh:1:dir:0000000000000000000000000000000000000008",
        ),
        (
            "swh:1:rev:0000000000000000000000000000000000000003",
            "swh:1:dir:0000000000000000000000000000000000000002",
        ),
        (
            "swh:1:dir:0000000000000000000000000000000000000002",
            "swh:1:cnt:0000000000000000000000000000000000000001",
        ),
        (
            "swh:1:dir:0000000000000000000000000000000000000008",
            "swh:1:cnt:0000000000000000000000000000000000000001",
        ),
        (
            "swh:1:dir:0000000000000000000000000000000000000008",
            "swh:1:cnt:0000000000000000000000000000000000000007",
        ),
        (
            "swh:1:dir:0000000000000000000000000000000000000008",
            "swh:1:dir:0000000000000000000000000000000000000006",
        ),
        (
            "swh:1:dir:0000000000000000000000000000000000000006",
            "swh:1:cnt:0000000000000000000000000000000000000004",
        ),
        (
            "swh:1:dir:0000000000000000000000000000000000000006",
            "swh:1:cnt:0000000000000000000000000000000000000005",
        ),
    ]
    assert set(actual) == set(expected)


@pytest.mark.skip(reason="currently disabled due to T1969")
def test_walk(graph_client):
    args = ("swh:1:dir:0000000000000000000000000000000000000016", "rel")
    kwargs = {
        "edges": "dir:dir,dir:rev,rev:*",
        "direction": "backward",
        "traversal": "bfs",
    }

    actual = list(graph_client.walk(*args, **kwargs))
    expected = [
        "swh:1:dir:0000000000000000000000000000000000000016",
        "swh:1:dir:0000000000000000000000000000000000000017",
        "swh:1:rev:0000000000000000000000000000000000000018",
        "swh:1:rel:0000000000000000000000000000000000000019",
    ]
    assert set(actual) == set(expected)

    kwargs2 = kwargs.copy()
    kwargs2["limit"] = -1
    actual = list(graph_client.walk(*args, **kwargs2))
    expected = ["swh:1:rel:0000000000000000000000000000000000000019"]
    assert set(actual) == set(expected)

    kwargs2 = kwargs.copy()
    kwargs2["limit"] = 2
    actual = list(graph_client.walk(*args, **kwargs2))
    expected = [
        "swh:1:dir:0000000000000000000000000000000000000016",
        "swh:1:dir:0000000000000000000000000000000000000017",
    ]
    assert set(actual) == set(expected)


@pytest.mark.skip(reason="Random walk is deprecated")
def test_random_walk_dst_is_type(graph_client):
    """As the walk is random, we test a visit from a cnt node to a release
    that is reachable along every possible path in the backward graph, and
    only check the final node of the path (i.e., the release)
    """
    args = ("swh:1:cnt:0000000000000000000000000000000000000015", "rel")
    kwargs = {"direction": "backward"}
    expected_root = "swh:1:rel:0000000000000000000000000000000000000019"

    actual = list(graph_client.random_walk(*args, **kwargs))
    assert len(actual) > 1  # no release directly links to a content
    assert actual[0] == args[0]
    assert actual[-1] == expected_root

    kwargs2 = kwargs.copy()
    kwargs2["limit"] = -1
    actual = list(graph_client.random_walk(*args, **kwargs2))
    assert actual == [expected_root]

    kwargs2["limit"] = -2
    actual = list(graph_client.random_walk(*args, **kwargs2))
    assert len(actual) == 2
    assert actual[-1] == expected_root

    kwargs2["limit"] = 3
    actual = list(graph_client.random_walk(*args, **kwargs2))
    assert len(actual) == 3


@pytest.mark.skip(reason="Random walk is deprecated")
def test_random_walk_dst_is_node(graph_client):
    """Same as test_random_walk_dst_is_type, but we target the specific
    release node instead of a type
    """
    args = (
        "swh:1:cnt:0000000000000000000000000000000000000015",
        "swh:1:rel:0000000000000000000000000000000000000019",
    )
    kwargs = {"direction": "backward"}
    expected_root = "swh:1:rel:0000000000000000000000000000000000000019"

    actual = list(graph_client.random_walk(*args, **kwargs))
    assert len(actual) > 1  # no origin directly links to a content
    assert actual[0] == args[0]
    assert actual[-1] == expected_root

    kwargs2 = kwargs.copy()
    kwargs2["limit"] = -1
    actual = list(graph_client.random_walk(*args, **kwargs2))
    assert actual == [expected_root]

    kwargs2["limit"] = -2
    actual = list(graph_client.random_walk(*args, **kwargs2))
    assert len(actual) == 2
    assert actual[-1] == expected_root

    kwargs2["limit"] = 3
    actual = list(graph_client.random_walk(*args, **kwargs2))
    assert len(actual) == 3


def test_count(graph_client):
    actual = graph_client.count_leaves(TEST_ORIGIN_ID)
    assert actual == 4
    actual = graph_client.count_visit_nodes(
        "swh:1:rel:0000000000000000000000000000000000000010", edges="rel:rev,rev:rev"
    )
    assert actual == 3
    actual = graph_client.count_neighbors(
        "swh:1:rev:0000000000000000000000000000000000000009", direction="backward"
    )
    assert actual == 3


def test_param_validation(graph_client):
    with raises(GraphArgumentException) as exc_info:  # SWHID not found
        list(graph_client.leaves("swh:1:rel:00ffffffff000000000000000000000000000010"))
    if exc_info.value.response:
        assert exc_info.value.response.status_code == 404

    with raises(GraphArgumentException) as exc_info:  # malformed SWHID
        list(
            graph_client.neighbors("swh:1:rel:00ffffffff00000000zzzzzzz000000000000010")
        )
    if exc_info.value.response:
        assert exc_info.value.response.status_code == 400

    with raises(GraphArgumentException) as exc_info:  # malformed edge specification
        list(
            graph_client.visit_nodes(
                "swh:1:dir:0000000000000000000000000000000000000016",
                edges="dir:notanodetype,dir:rev,rev:*",
                direction="backward",
            )
        )
    if exc_info.value.response:
        assert exc_info.value.response.status_code == 400

    with raises(GraphArgumentException) as exc_info:  # malformed direction
        list(
            graph_client.visit_nodes(
                "swh:1:dir:0000000000000000000000000000000000000016",
                edges="dir:dir,dir:rev,rev:*",
                direction="notadirection",
            )
        )
    if exc_info.value.response:
        assert exc_info.value.response.status_code == 400


@pytest.mark.skip(reason="currently disabled due to T1969")
def test_param_validation_walk(graph_client):
    """Test validation of walk-specific parameters only"""
    with raises(RemoteException) as exc_info:  # malformed traversal order
        list(
            graph_client.walk(
                "swh:1:dir:0000000000000000000000000000000000000016",
                "rel",
                edges="dir:dir,dir:rev,rev:*",
                direction="backward",
                traversal="notatraversalorder",
            )
        )
    assert exc_info.value.response.status_code == 400