diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
index c266dbc..0c382f7 100644
--- a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
+++ b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
@@ -1,91 +1,98 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import java.util.ArrayList;
/**
* Edge restriction based on node types, used when visiting the graph.
*
* Software Heritage
* graph contains multiple node types (contents, directories, revisions, ...) and restricting
 * the traversal to specific node types is necessary for many querying operations and use
 * cases.
*
* @author The Software Heritage developers
*/
public class AllowedEdges {
    /**
     * 2D boolean matrix storing access rights for all combination of src/dst node types (first
     * dimension is source, second dimension is destination). When edge restriction is not enforced
     * this array is set to null for early bypass.
     */
    public boolean[][] restrictedTo;

    /**
     * Constructor.
     *
     * @param edgesFmt a formatted string describing allowed edges, e.g. {@code "rev:rev,rev:dir"}.
     *            A null, empty or {@code "*"} value means "no restriction".
     * @throws IllegalArgumentException if an edge specification cannot be parsed
     */
    public AllowedEdges(String edgesFmt) {
        int nbNodeTypes = Node.Type.values().length;
        this.restrictedTo = new boolean[nbNodeTypes][nbNodeTypes];
        // Special values (null, empty, "*")
        if (edgesFmt == null || edgesFmt.isEmpty()) {
            return;
        }
        if (edgesFmt.equals("*")) {
            // Allows for quick bypass (with simple null check) when no edge restriction
            restrictedTo = null;
            return;
        }

        // Format: "src1:dst1,src2:dst2,[...]"
        String[] edgeTypes = edgesFmt.split(",");
        for (String edgeType : edgeTypes) {
            String[] nodeTypes = edgeType.split(":");
            if (nodeTypes.length != 2) {
                throw new IllegalArgumentException("Cannot parse edge type: " + edgeType);
            }

            // Parameterized lists: the previous raw ArrayList could not be iterated with a
            // typed for-each (incompatible types: Object -> Node.Type).
            ArrayList<Node.Type> srcTypes = Node.Type.parse(nodeTypes[0]);
            ArrayList<Node.Type> dstTypes = Node.Type.parse(nodeTypes[1]);
            for (Node.Type srcType : srcTypes) {
                for (Node.Type dstType : dstTypes) {
                    restrictedTo[srcType.ordinal()][dstType.ordinal()] = true;
                }
            }
        }
    }

    /**
     * Checks if a given edge can be followed during graph traversal.
     *
     * @param srcType edge source type
     * @param dstType edge destination type
     * @return true if allowed and false otherwise
     */
    public boolean isAllowed(Node.Type srcType, Node.Type dstType) {
        if (restrictedTo == null)
            return true;
        return restrictedTo[srcType.ordinal()][dstType.ordinal()];
    }

    /**
     * Return a new AllowedEdges instance with reversed edge restrictions. e.g. "src1:dst1,src2:dst2"
     * becomes "dst1:src1,dst2:src2"
     *
     * @return a new AllowedEdges instance with reversed edge restrictions
     */
    public AllowedEdges reverse() {
        AllowedEdges reversed = new AllowedEdges(null);
        if (restrictedTo == null) {
            // Unrestricted ("*") stays unrestricted when reversed; the previous version
            // dereferenced the null matrix and threw a NullPointerException here.
            reversed.restrictedTo = null;
            return reversed;
        }
        reversed.restrictedTo = new boolean[restrictedTo.length][restrictedTo[0].length];
        for (int i = 0; i < restrictedTo.length; i++) {
            for (int j = 0; j < restrictedTo[0].length; j++) {
                reversed.restrictedTo[i][j] = restrictedTo[j][i];
            }
        }
        return reversed;
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java b/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java
index ddecfd4..b63edf2 100644
--- a/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java
+++ b/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java
@@ -1,50 +1,57 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
/**
* Node type restriction, useful to implement filtering of returned nodes during traversal.
*
* @author The Software Heritage developers
*/
public class AllowedNodes {
    /** Per-type allow flags, indexed by the node type's int code; null means "all types allowed". */
    public boolean[] restrictedTo;

    /**
     * Constructor.
     *
     * @param nodesFmt a formatted string describing allowed nodes
     */
    public AllowedNodes(String nodesFmt) {
        int typeCount = Node.Type.values().length;
        this.restrictedTo = new boolean[typeCount];
        // Special values (null, empty, "*")
        if (nodesFmt == null || nodesFmt.isEmpty()) {
            return;
        }
        if (nodesFmt.equals("*")) {
            // A null array lets isAllowed() bypass the lookup entirely when unrestricted
            restrictedTo = null;
            return;
        }

        // Format: "nodeType1,nodeType2,[...]"
        for (String typeSpec : nodesFmt.split(",")) {
            for (Node.Type allowedType : Node.Type.parse(typeSpec)) {
                this.restrictedTo[Node.Type.toInt(allowedType)] = true;
            }
        }
    }

    /**
     * Checks if a given node type is allowed.
     *
     * @param nodeType node type to check
     * @return true if allowed and false otherwise
     */
    public boolean isAllowed(Node.Type nodeType) {
        return restrictedTo == null || restrictedTo[Node.Type.toInt(nodeType)];
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/Node.java b/java/src/main/java/org/softwareheritage/graph/Node.java
index be3efde..9d46a76 100644
--- a/java/src/main/java/org/softwareheritage/graph/Node.java
+++ b/java/src/main/java/org/softwareheritage/graph/Node.java
@@ -1,139 +1,146 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import java.util.*;
/**
* A node in the Software Heritage graph.
*
* @author The Software Heritage developers
*/
public class Node {
    /**
     * Software Heritage graph node types, as described in the data model.
     */
    public enum Type {
        /** Content node */
        CNT,
        /** Directory node */
        DIR,
        /** Origin node */
        ORI,
        /** Release node */
        REL,
        /** Revision node */
        REV,
        /** Snapshot node */
        SNP;

        /**
         * Converts integer to corresponding SWH node type.
         *
         * @param intType node type represented as an integer
         * @return the corresponding {@link Node.Type} value, or null if the integer does not map
         *         to a known node type
         * @see org.softwareheritage.graph.Node.Type
         */
        public static Node.Type fromInt(int intType) {
            switch (intType) {
                case 0:
                    return CNT;
                case 1:
                    return DIR;
                case 2:
                    return ORI;
                case 3:
                    return REL;
                case 4:
                    return REV;
                case 5:
                    return SNP;
            }
            return null;
        }

        /**
         * Converts node types to the corresponding int value
         *
         * @param type node type as an enum
         * @return the corresponding int value
         * @throws IllegalArgumentException if the type has no mapping (unreachable for the current
         *             enum constants, kept as a guard for future additions)
         */
        public static int toInt(Node.Type type) {
            switch (type) {
                case CNT:
                    return 0;
                case DIR:
                    return 1;
                case ORI:
                    return 2;
                case REL:
                    return 3;
                case REV:
                    return 4;
                case SNP:
                    return 5;
            }
            throw new IllegalArgumentException("Unknown node type: " + type);
        }

        /**
         * Converts string to corresponding SWH node type.
         *
         * @param strType node type represented as a string (lowercase three-letter code)
         * @return the corresponding {@link Node.Type} value
         * @throws IllegalArgumentException if the string is not a known node type
         * @see org.softwareheritage.graph.Node.Type
         */
        public static Node.Type fromStr(String strType) {
            if (!strType.matches("cnt|dir|ori|rel|rev|snp")) {
                throw new IllegalArgumentException("Unknown node type: " + strType);
            }
            return Node.Type.valueOf(strType.toUpperCase());
        }

        /**
         * Converts byte array name to the int code of the corresponding SWH node type. Used for
         * performance-critical deserialization.
         *
         * @param name node type represented as a byte array (e.g. b"cnt")
         * @return the ordinal value of the corresponding {@link Node.Type}, or -1 if unknown
         * @see org.softwareheritage.graph.Node.Type
         */
        public static int byteNameToInt(byte[] name) {
            if (Arrays.equals(name, "cnt".getBytes())) {
                return 0;
            } else if (Arrays.equals(name, "dir".getBytes())) {
                return 1;
            } else if (Arrays.equals(name, "ori".getBytes())) {
                return 2;
            } else if (Arrays.equals(name, "rel".getBytes())) {
                return 3;
            } else if (Arrays.equals(name, "rev".getBytes())) {
                return 4;
            } else if (Arrays.equals(name, "snp".getBytes())) {
                return 5;
            } else
                return -1;
        }

        /**
         * Parses SWH node type possible values from formatted string (see the API syntax).
         *
         * @param strFmtType node types represented as a formatted string ("*" or a single type)
         * @return a list containing the {@link Node.Type} values
         * @see org.softwareheritage.graph.Node.Type
         */
        public static ArrayList<Node.Type> parse(String strFmtType) {
            // Parameterized return type: the previous raw ArrayList forced every caller into
            // unchecked territory and broke typed for-each loops over the result.
            ArrayList<Node.Type> types = new ArrayList<>();
            if (strFmtType.equals("*")) {
                List<Node.Type> nodeTypes = Arrays.asList(Node.Type.values());
                types.addAll(nodeTypes);
            } else {
                types.add(Node.Type.fromStr(strFmtType));
            }
            return types;
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/SWHID.java b/java/src/main/java/org/softwareheritage/graph/SWHID.java
index 16aff83..18951fc 100644
--- a/java/src/main/java/org/softwareheritage/graph/SWHID.java
+++ b/java/src/main/java/org/softwareheritage/graph/SWHID.java
@@ -1,118 +1,125 @@
+/*
+ * Copyright (c) 2019 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import com.fasterxml.jackson.annotation.JsonValue;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Hex;
/**
* A Software Heritage persistent identifier (SWHID), see persistent
* identifier documentation.
*
* @author The Software Heritage developers
*/
public class SWHID {
    /** Fixed hash length of the SWHID */
    public static final int HASH_LENGTH = 40;

    /** Full SWHID as a string */
    String swhid;
    /** SWHID node type */
    Node.Type type;

    /**
     * Constructor.
     *
     * @param swhid full SWHID as a string
     * @throws IllegalArgumentException if the string is not a well-formed version-1 SWHID
     */
    public SWHID(String swhid) {
        this.swhid = swhid;

        // SWHID format: 'swh:1:type:hash'
        String[] fields = swhid.split(":");
        boolean headerOk = fields.length == 4 && fields[0].equals("swh") && fields[1].equals("1");
        if (!headerOk) {
            throw new IllegalArgumentException("malformed SWHID: " + swhid);
        }
        this.type = Node.Type.fromStr(fields[2]);
        if (!fields[3].matches("[0-9a-f]{" + HASH_LENGTH + "}")) {
            throw new IllegalArgumentException("malformed SWHID: " + swhid);
        }
    }

    /**
     * Creates a SWHID from a compact binary representation.
     *
     * The binary format is specified in the Python module swh.graph.swhid:str_to_bytes .
     */
    public static SWHID fromBytes(byte[] input) {
        byte[] hash = new byte[20];
        System.arraycopy(input, 2, hash, 0, hash.length);

        String typeStr = Node.Type.fromInt(input[1]).toString().toLowerCase();
        return new SWHID(String.format("swh:%d:%s:%s", input[0], typeStr, Hex.encodeHexString(hash)));
    }

    @Override
    public boolean equals(Object otherObj) {
        if (otherObj == this) {
            return true;
        }
        if (!(otherObj instanceof SWHID)) {
            return false;
        }
        return swhid.equals(((SWHID) otherObj).getSWHID());
    }

    @Override
    public int hashCode() {
        return swhid.hashCode();
    }

    @Override
    public String toString() {
        return swhid;
    }

    /**
     * Converts SWHID to a compact binary representation.
     *
     * The binary format is specified in the Python module swh.graph.swhid:str_to_bytes .
     */
    public byte[] toBytes() {
        byte[] result = new byte[22];
        result[0] = (byte) 1; // namespace version
        result[1] = (byte) Node.Type.toInt(this.type); // SWHID type
        try {
            byte[] hash = Hex.decodeHex(this.swhid.substring(10)); // SHA1 hash
            System.arraycopy(hash, 0, result, 2, hash.length);
        } catch (DecoderException e) {
            throw new IllegalArgumentException("invalid hex sequence in SWHID: " + this.swhid);
        }
        return result;
    }

    /**
     * Returns full SWHID as a string.
     *
     * @return full SWHID string
     */
    @JsonValue
    public String getSWHID() {
        return swhid;
    }

    /**
     * Returns SWHID node type.
     *
     * @return SWHID corresponding {@link Node.Type}
     * @see org.softwareheritage.graph.Node.Type
     */
    public Node.Type getType() {
        return type;
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/Subgraph.java b/java/src/main/java/org/softwareheritage/graph/Subgraph.java
index 53ef937..591279c 100644
--- a/java/src/main/java/org/softwareheritage/graph/Subgraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/Subgraph.java
@@ -1,224 +1,231 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.NodeIterator;
import java.util.NoSuchElementException;
public class Subgraph extends ImmutableGraph {
    private final SwhBidirectionalGraph underlyingGraph;
    public final AllowedNodes allowedNodeTypes;

    /** Cached number of allowed nodes; -1 until first computed by {@link #numNodes()}. */
    private long nodeCount = -1;

    /**
     * Constructor.
     *
     * @param underlyingGraph full graph being filtered
     * @param allowedNodeTypes node type restriction defining which nodes belong to this subgraph
     */
    public Subgraph(SwhBidirectionalGraph underlyingGraph, AllowedNodes allowedNodeTypes) {
        this.underlyingGraph = underlyingGraph.copy();
        this.allowedNodeTypes = allowedNodeTypes;
    }

    /**
     * Return a flyweight copy of the graph.
     */
    @Override
    public Subgraph copy() {
        return new Subgraph(this.underlyingGraph.copy(), allowedNodeTypes);
    }

    @Override
    public boolean randomAccess() {
        return underlyingGraph.randomAccess();
    }

    /**
     * Return a transposed version of the graph.
     */
    public Subgraph transpose() {
        return new Subgraph(underlyingGraph.transpose(), allowedNodeTypes);
    }

    /**
     * Return a symmetric version of the graph.
     */
    public Subgraph symmetrize() {
        return new Subgraph(underlyingGraph.symmetrize(), allowedNodeTypes);
    }

    /**
     * Returns number of nodes in the graph.
     *
     * Computed lazily on first call by scanning all nodes of the underlying graph, then cached.
     *
     * @return number of nodes in the graph
     */
    @Override
    public long numNodes() {
        if (nodeCount == -1) {
            // Accumulate in a local starting at zero: the previous version incremented the -1
            // sentinel directly, which made the result off by one.
            long count = 0;
            for (long i = 0; i < underlyingGraph.numNodes(); ++i) {
                if (nodeExists(i))
                    ++count;
            }
            nodeCount = count;
        }
        return nodeCount;
    }

    /**
     * Returns number of edges in the graph.
     *
     * @return number of edges in the graph
     */
    @Override
    public long numArcs() {
        throw new UnsupportedOperationException("Cannot determine the number of arcs in a subgraph");
    }

    /** Returns the node count of the underlying (unfiltered) graph. */
    public long maxNodeNumber() {
        return underlyingGraph.numNodes();
    }

    /** Returns true if the given underlying-graph node belongs to this subgraph. */
    public boolean nodeExists(long node) {
        return allowedNodeTypes.isAllowed(underlyingGraph.getNodeType(node));
    }

    /**
     * Returns lazy iterator of successors of a node.
     *
     * @param nodeId node specified as a long id
     * @return lazy iterator of successors of the node, specified as a WebGraph LazyLongIterator;
     *         successors not in the subgraph are filtered out
     * @throws IllegalArgumentException if the node itself is not in the subgraph
     */
    @Override
    public LazyLongIterator successors(long nodeId) {
        if (!nodeExists(nodeId)) {
            throw new IllegalArgumentException("Node " + nodeId + " not in subgraph");
        }
        LazyLongIterator allSuccessors = underlyingGraph.successors(nodeId);
        return new LazyLongIterator() {
            @Override
            public long nextLong() {
                // Skip successors whose type is not allowed; -1 terminates the iteration.
                long neighbor;
                while ((neighbor = allSuccessors.nextLong()) != -1) {
                    if (nodeExists(neighbor)) {
                        return neighbor;
                    }
                }
                return -1;
            }

            @Override
            public long skip(final long n) {
                long i;
                for (i = 0; i < n && nextLong() != -1; i++)
                    ;
                return i;
            }
        };
    }

    /**
     * Returns the outdegree of a node.
     *
     * @param nodeId node specified as a long id
     * @return outdegree of a node (counting only successors inside the subgraph)
     */
    @Override
    public long outdegree(long nodeId) {
        long deg = 0;
        for (LazyLongIterator allSuccessors = successors(nodeId); allSuccessors.nextLong() != -1; ++deg)
            ;
        return deg;
    }

    @Override
    public NodeIterator nodeIterator() {
        return new NodeIterator() {
            final long n = numNodes();
            long i = -1;
            long done = 0;

            @Override
            public boolean hasNext() {
                // Strict comparison so exactly n nodes are returned. The previous "done <= n"
                // only appeared correct because it compensated for the numNodes() off-by-one.
                return done < n;
            }

            @Override
            public long nextLong() {
                if (!hasNext())
                    throw new NoSuchElementException();
                do {
                    ++i;
                    if (i >= underlyingGraph.numNodes())
                        throw new NoSuchElementException();
                } while (!nodeExists(i));
                ++done;
                return i;
            }

            @Override
            public long outdegree() {
                return Subgraph.this.outdegree(i);
            }

            @Override
            public LazyLongIterator successors() {
                return Subgraph.this.successors(i);
            }
        };
    }

    /**
     * Returns lazy iterator of predecessors of a node.
     *
     * Note: builds a fresh transposed view on each call.
     *
     * @param nodeId node specified as a long id
     * @return lazy iterator of predecessors of the node, specified as a WebGraph LazyLongIterator
     */
    public LazyLongIterator predecessors(long nodeId) {
        return this.transpose().successors(nodeId);
    }

    /**
     * Returns the indegree of a node.
     *
     * Note: builds a fresh transposed view on each call.
     *
     * @param nodeId node specified as a long id
     * @return indegree of a node
     */
    public long indegree(long nodeId) {
        return this.transpose().outdegree(nodeId);
    }

    /**
     * Converts {@link SWHID} node to long.
     *
     * @param swhid node specified as a {@link SWHID}
     * @return internal long node id
     * @see SWHID
     */
    public long getNodeId(SWHID swhid) {
        return underlyingGraph.getNodeId(swhid);
    }

    /**
     * Converts long id node to {@link SWHID}.
     *
     * @param nodeId node specified as a long id
     * @return external SWHID
     * @see SWHID
     */
    public SWHID getSWHID(long nodeId) {
        return underlyingGraph.getSWHID(nodeId);
    }

    /**
     * Returns node type.
     *
     * @param nodeId node specified as a long id
     * @return corresponding node type
     * @see Node.Type
     */
    public Node.Type getNodeType(long nodeId) {
        return underlyingGraph.getNodeType(nodeId);
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java
index 446dd65..04b2a8c 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java
@@ -1,180 +1,187 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
import it.unimi.dsi.big.webgraph.BidirectionalImmutableGraph;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.IOException;
import java.io.InputStream;
/**
* Class representing the compressed Software Heritage graph in both directions (forward and
* backward).
*
* This class uses the {@link BidirectionalImmutableGraph} class internally to implement the
* backward equivalent of graph operations ({@link SwhBidirectionalGraph#indegree(long)},
* {@link SwhBidirectionalGraph#predecessors(long)}, etc.) by holding a reference to two
* {@link SwhUnidirectionalGraph} (a forward graph and a backward graph).
*
* Both graphs share their graph properties in memory by storing references to the same
* {@link SwhGraphProperties} object.
*
*
* ┌──────────────┐
* │ImmutableGraph◄────────┐
* └────▲─────────┘ │extends
* │ │
* │ ┌──────────┴────────────────┐
* extends│ │BidirectionalImmutableGraph│
* │ └────────────▲──────────────┘
* │ │extends
* ┌──────────────┴───────┐ ┌──────┴──────────────┐
* │SwhUnidirectionalGraph│◄────┤SwhBidirectionalGraph│
* └──┬──────────────┬────┘ └────────┬───────────┬┘
* │ │ contains x2 │ │
* │ │ │ │
* │ implements│ │implements │
* │ ┌▼──────────┐ │ │
* │ │SwhGraph(I)◄────────┘ │
* contains │ └───────────┘ │contains
* │ │
* │ ┌──────────────────┐ │
* └────────────►SwhGraphProperties◄──────────────┘
* └──────────────────┘
*
*
* @author The Software Heritage developers
* @see SwhUnidirectionalGraph
*/
public class SwhBidirectionalGraph extends BidirectionalImmutableGraph implements SwhGraph {
    /** Property data of the graph (id/type mappings etc.) */
    public final SwhGraphProperties properties;

    /** Graph in its natural (forward) direction. */
    private final SwhUnidirectionalGraph forwardGraph;
    /** Transposed (backward) graph. */
    private final SwhUnidirectionalGraph backwardGraph;

    /**
     * Creates a bidirectional graph from its two directed halves.
     *
     * @param forwardGraph the graph in its natural direction
     * @param backwardGraph the transposed graph
     * @param properties property data shared by both directions
     */
    public SwhBidirectionalGraph(SwhUnidirectionalGraph forwardGraph, SwhUnidirectionalGraph backwardGraph,
            SwhGraphProperties properties) {
        super(forwardGraph, backwardGraph);
        this.forwardGraph = forwardGraph;
        this.backwardGraph = backwardGraph;
        this.properties = properties;
    }

    /**
     * Wraps a plain {@link BidirectionalImmutableGraph}, rebuilding SWH-aware halves around its
     * forward and backward components. Used by {@link #transpose()} and {@link #symmetrize()}.
     */
    private SwhBidirectionalGraph(BidirectionalImmutableGraph graph, SwhGraphProperties properties) {
        super(graph.forward, graph.backward);
        this.forwardGraph = new SwhUnidirectionalGraph(graph.forward, properties);
        this.backwardGraph = new SwhUnidirectionalGraph(graph.backward, properties);
        this.properties = properties;
    }

    /**
     * Loads both directions of the graph (the backward half from the "-transposed" basename) and
     * shares a single {@link SwhGraphProperties} instance between them.
     */
    public static SwhBidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl)
            throws IOException {
        SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadGraphOnly(method, path, is, pl);
        SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadGraphOnly(method, path + "-transposed", is, pl);
        SwhGraphProperties properties = SwhGraphProperties.load(path);
        forward.setProperties(properties);
        backward.setProperties(properties);
        return new SwhBidirectionalGraph(forward, backward, properties);
    }

    /**
     * Same as {@link #load(LoadMethod, String, InputStream, ProgressLogger)} but loads the
     * arc-labelled variants of both halves.
     */
    public static SwhBidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl)
            throws IOException {
        SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path, is, pl);
        SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path + "-transposed", is,
                pl);
        SwhGraphProperties properties = SwhGraphProperties.load(path);
        forward.setProperties(properties);
        backward.setProperties(properties);
        return new SwhBidirectionalGraph(forward, backward, properties);
    }

    // loadXXX methods from ImmutableGraph
    public static SwhBidirectionalGraph load(String path, ProgressLogger pl) throws IOException {
        return load(LoadMethod.STANDARD, path, null, pl);
    }

    public static SwhBidirectionalGraph load(String path) throws IOException {
        return load(LoadMethod.STANDARD, path, null, null);
    }

    public static SwhBidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException {
        return load(LoadMethod.MAPPED, path, null, pl);
    }

    public static SwhBidirectionalGraph loadMapped(String path) throws IOException {
        return load(LoadMethod.MAPPED, path, null, null);
    }

    public static SwhBidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException {
        return load(LoadMethod.OFFLINE, path, null, pl);
    }

    public static SwhBidirectionalGraph loadOffline(String path) throws IOException {
        return load(LoadMethod.OFFLINE, path, null, null);
    }

    // Labelled versions of the loadXXX methods from ImmutableGraph
    public static SwhBidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException {
        return loadLabelled(LoadMethod.STANDARD, path, null, pl);
    }

    public static SwhBidirectionalGraph loadLabelled(String path) throws IOException {
        return loadLabelled(LoadMethod.STANDARD, path, null, null);
    }

    public static SwhBidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException {
        return loadLabelled(LoadMethod.MAPPED, path, null, pl);
    }

    public static SwhBidirectionalGraph loadLabelledMapped(String path) throws IOException {
        return loadLabelled(LoadMethod.MAPPED, path, null, null);
    }

    public static SwhBidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException {
        return loadLabelled(LoadMethod.OFFLINE, path, null, pl);
    }

    public static SwhBidirectionalGraph loadLabelledOffline(String path) throws IOException {
        return loadLabelled(LoadMethod.OFFLINE, path, null, null);
    }

    /** Returns a flyweight copy sharing this graph's properties. */
    @Override
    public SwhBidirectionalGraph copy() {
        return new SwhBidirectionalGraph(forwardGraph.copy(), backwardGraph.copy(), this.properties);
    }

    /** Returns a view with forward and backward halves swapped. */
    @Override
    public SwhBidirectionalGraph transpose() {
        return new SwhBidirectionalGraph(super.transpose(), this.properties);
    }

    /** Returns a symmetric (undirected-equivalent) view of the graph. */
    @Override
    public SwhBidirectionalGraph symmetrize() {
        return new SwhBidirectionalGraph(super.symmetrize(), this.properties);
    }

    /** Returns the forward half of this graph. */
    public SwhUnidirectionalGraph getForwardGraph() {
        return this.forwardGraph;
    }

    /** Returns the backward (transposed) half of this graph. */
    public SwhUnidirectionalGraph getBackwardGraph() {
        return this.backwardGraph;
    }

    /**
     * Returns a *labelled* lazy iterator over the successors of a given node. The iteration terminates
     * when -1 is returned.
     */
    public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) {
        return forwardGraph.labelledSuccessors(x);
    }

    /**
     * Returns a *labelled* lazy iterator over the predecessors of a given node. The iteration
     * terminates when -1 is returned.
     */
    public ArcLabelledNodeIterator.LabelledArcIterator labelledPredecessors(long x) {
        // Predecessors in the forward graph are successors in the backward graph.
        return backwardGraph.labelledSuccessors(x);
    }

    /** Closes the shared properties object (both halves reference the same instance). */
    public void close() throws IOException {
        this.properties.close();
    }

    @Override
    public SwhGraphProperties getProperties() {
        return properties;
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
index aa92536..432de35 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
@@ -1,144 +1,151 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import java.io.IOException;
/**
* Common interface for SWH graph classes.
*
* This interface forwards all property loading/access methods to the SwhGraphProperties object
* returned by the getProperties() method of the implementing class. This allows API users to write
* graph.getNodeType() instead of graph.getProperties().getNodeType().
*/
public interface SwhGraph {
    /**
     * Cleans up graph resources after use.
     *
     * @throws IOException if releasing the underlying resources fails
     */
    void close() throws IOException;

    /**
     * Returns the SWH graph properties object of this graph.
     *
     * All the default methods below delegate to this object.
     *
     * @return graph properties
     */
    SwhGraphProperties getProperties();

    /** Returns the basename of the compressed graph. @see SwhGraphProperties#getPath() */
    default String getPath() {
        return getProperties().getPath();
    }

    /** Converts a {@link SWHID} to its internal long node id. @see SwhGraphProperties#getNodeId(SWHID) */
    default long getNodeId(SWHID swhid) {
        return getProperties().getNodeId(swhid);
    }

    /** Converts an internal long node id to its {@link SWHID}. @see SwhGraphProperties#getSWHID(long) */
    default SWHID getSWHID(long nodeId) {
        return getProperties().getSWHID(nodeId);
    }

    /** Returns the type of the given node. @see SwhGraphProperties#getNodeType(long) */
    default Node.Type getNodeType(long nodeId) {
        return getProperties().getNodeType(nodeId);
    }

    /** @see SwhGraphProperties#loadContentLength() */
    default void loadContentLength() throws IOException {
        getProperties().loadContentLength();
    }

    /** @see SwhGraphProperties#getContentLength(long) */
    default Long getContentLength(long nodeId) {
        return getProperties().getContentLength(nodeId);
    }

    /** @see SwhGraphProperties#loadPersonIds() */
    default void loadPersonIds() throws IOException {
        getProperties().loadPersonIds();
    }

    /** @see SwhGraphProperties#getAuthorId(long) */
    default Long getAuthorId(long nodeId) {
        return getProperties().getAuthorId(nodeId);
    }

    /** @see SwhGraphProperties#getCommitterId(long) */
    default Long getCommitterId(long nodeId) {
        return getProperties().getCommitterId(nodeId);
    }

    /** @see SwhGraphProperties#loadContentIsSkipped() */
    default void loadContentIsSkipped() throws IOException {
        getProperties().loadContentIsSkipped();
    }

    /** @see SwhGraphProperties#isContentSkipped(long) */
    default boolean isContentSkipped(long nodeId) {
        return getProperties().isContentSkipped(nodeId);
    }

    /** @see SwhGraphProperties#loadAuthorTimestamps() */
    default void loadAuthorTimestamps() throws IOException {
        getProperties().loadAuthorTimestamps();
    }

    /** @see SwhGraphProperties#getAuthorTimestamp(long) */
    default Long getAuthorTimestamp(long nodeId) {
        return getProperties().getAuthorTimestamp(nodeId);
    }

    /** @see SwhGraphProperties#getAuthorTimestampOffset(long) */
    default Short getAuthorTimestampOffset(long nodeId) {
        return getProperties().getAuthorTimestampOffset(nodeId);
    }

    /** @see SwhGraphProperties#loadCommitterTimestamps() */
    default void loadCommitterTimestamps() throws IOException {
        getProperties().loadCommitterTimestamps();
    }

    /** @see SwhGraphProperties#getCommitterTimestamp(long) */
    default Long getCommitterTimestamp(long nodeId) {
        return getProperties().getCommitterTimestamp(nodeId);
    }

    /** @see SwhGraphProperties#getCommitterTimestampOffset(long) */
    default Short getCommitterTimestampOffset(long nodeId) {
        return getProperties().getCommitterTimestampOffset(nodeId);
    }

    /** @see SwhGraphProperties#loadMessages() */
    default void loadMessages() throws IOException {
        getProperties().loadMessages();
    }

    /** @see SwhGraphProperties#getMessage(long) */
    default byte[] getMessage(long nodeId) {
        return getProperties().getMessage(nodeId);
    }

    /** @see SwhGraphProperties#getUrl(long) */
    default String getUrl(long nodeId) {
        return getProperties().getUrl(nodeId);
    }

    /** @see SwhGraphProperties#loadTagNames() */
    default void loadTagNames() throws IOException {
        getProperties().loadTagNames();
    }

    /** @see SwhGraphProperties#getTagName(long) */
    default byte[] getTagName(long nodeId) {
        return getProperties().getTagName(nodeId);
    }

    /** @see SwhGraphProperties#loadLabelNames() */
    default void loadLabelNames() throws IOException {
        getProperties().loadLabelNames();
    }

    /** @see SwhGraphProperties#getLabelName(long) */
    default byte[] getLabelName(long labelId) {
        return getProperties().getLabelName(labelId);
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
index 637daee..9de9762 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
@@ -1,323 +1,330 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import it.unimi.dsi.big.util.MappedFrontCodedStringBigList;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.bytes.ByteBigList;
import it.unimi.dsi.fastutil.bytes.ByteMappedBigList;
import it.unimi.dsi.fastutil.ints.IntBigList;
import it.unimi.dsi.fastutil.ints.IntMappedBigList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.longs.LongMappedBigList;
import it.unimi.dsi.fastutil.shorts.ShortBigList;
import it.unimi.dsi.fastutil.shorts.ShortMappedBigList;
import it.unimi.dsi.sux4j.util.EliasFanoLongBigList;
import org.apache.commons.configuration2.ex.ConfigurationException;
import org.softwareheritage.graph.maps.NodeIdMap;
import org.softwareheritage.graph.maps.NodeTypesMap;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Base64;
/**
 * This object contains SWH graph properties such as node labels.
*
* Some property mappings are necessary because Software Heritage uses string based persistent
* identifiers (SWHID) while WebGraph uses integers internally.
*
* The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph
* using SWHID) and the output (convert back to SWHID for users results).
*
* Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a
* long id → node type map is stored as well to avoid a full SWHID lookup.
*
* @see NodeIdMap
* @see NodeTypesMap
*/
public class SwhGraphProperties {
private final String path;
private final NodeIdMap nodeIdMap;
private final NodeTypesMap nodeTypesMap;
private LongBigList authorTimestamp;
private ShortBigList authorTimestampOffset;
private LongBigList committerTimestamp;
private ShortBigList committerTimestampOffset;
private LongBigList contentLength;
private LongArrayBitVector contentIsSkipped;
private IntBigList authorId;
private IntBigList committerId;
private ByteBigList messageBuffer;
private LongBigList messageOffsets;
private ByteBigList tagNameBuffer;
private LongBigList tagNameOffsets;
private MappedFrontCodedStringBigList edgeLabelNames;
    /**
     * Builds a property container for the graph stored at {@code path}.
     *
     * @param path basename of the compressed graph
     * @param nodeIdMap mapping between long node ids and SWHIDs
     * @param nodeTypesMap mapping between long node ids and node types
     */
    protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
        this.path = path;
        this.nodeIdMap = nodeIdMap;
        this.nodeTypesMap = nodeTypesMap;
    }
    /**
     * Loads the properties of the graph stored at the given basename.
     *
     * @param path basename of the compressed graph
     * @return a new property container backed by the on-disk node id and node type maps
     * @throws IOException if the maps cannot be loaded
     */
    public static SwhGraphProperties load(String path) throws IOException {
        return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path));
    }
/**
* Cleans up resources after use.
*/
public void close() throws IOException {
edgeLabelNames.close();
}
    /**
     * Return the basename of the compressed graph, as passed to {@link #load(String)}.
     */
    public String getPath() {
        return path;
    }
/**
* Converts {@link SWHID} node to long.
*
* @param swhid node specified as a {@link SWHID}
* @return internal long node id
* @see SWHID
*/
public long getNodeId(SWHID swhid) {
return nodeIdMap.getNodeId(swhid);
}
/**
* Converts long id node to {@link SWHID}.
*
* @param nodeId node specified as a long id
* @return external SWHID
* @see SWHID
*/
public SWHID getSWHID(long nodeId) {
return nodeIdMap.getSWHID(nodeId);
}
/**
* Returns node type.
*
* @param nodeId node specified as a long id
* @return corresponding node type
* @see Node.Type
*/
public Node.Type getNodeType(long nodeId) {
return nodeTypesMap.getType(nodeId);
}
private static LongBigList loadMappedLongs(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return LongMappedBigList.map(raf.getChannel());
}
}
private static IntBigList loadMappedInts(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return IntMappedBigList.map(raf.getChannel());
}
}
private static ShortBigList loadMappedShorts(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return ShortMappedBigList.map(raf.getChannel());
}
}
private static ByteBigList loadMappedBytes(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return ByteMappedBigList.map(raf.getChannel());
}
}
private static LongBigList loadEFLongs(String path) throws IOException {
try {
return (EliasFanoLongBigList) BinIO.loadObject(path);
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
private static byte[] getLine(ByteBigList byteArray, long start) {
long end = start;
while (end < byteArray.size64() && byteArray.getByte(end) != '\n') {
end++;
}
int length = (int) (end - start);
byte[] buffer = new byte[length];
byteArray.getElements(start, buffer, 0, length);
return buffer;
}
/** Load the sizes of the content nodes */
public void loadContentLength() throws IOException {
contentLength = loadMappedLongs(path + ".property.content.length.bin");
}
/** Get the size (in bytes) of the given content node */
public Long getContentLength(long nodeId) {
if (contentLength == null) {
throw new IllegalStateException("Content lengths not loaded");
}
long res = contentLength.getLong(nodeId);
return (res >= 0) ? res : null;
}
/** Load the IDs of the persons (authors and committers) */
public void loadPersonIds() throws IOException {
authorId = loadMappedInts(path + ".property.author_id.bin");
committerId = loadMappedInts(path + ".property.committer_id.bin");
}
/** Get a unique integer ID representing the author of the given revision or release node */
public Long getAuthorId(long nodeId) {
if (authorId == null) {
throw new IllegalStateException("Author IDs not loaded");
}
long res = authorId.getInt(nodeId);
return (res >= 0) ? res : null;
}
/** Get a unique integer ID representing the committer of the given revision node */
public Long getCommitterId(long nodeId) {
if (committerId == null) {
throw new IllegalStateException("Committer IDs not loaded");
}
long res = committerId.getInt(nodeId);
return (res >= 0) ? res : null;
}
/**
* Loads a boolean array indicating whether the given content node was skipped during archive
* ingestion
*/
public void loadContentIsSkipped() throws IOException {
try {
contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin");
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
/** Returns whether the given content node was skipped during archive ingestion */
public boolean isContentSkipped(long nodeId) {
if (contentIsSkipped == null) {
throw new IllegalStateException("Skipped content array not loaded");
}
return contentIsSkipped.getBoolean(nodeId);
}
/** Load the timestamps at which the releases and revisions were authored */
public void loadAuthorTimestamps() throws IOException {
authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin");
authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin");
}
/** Return the timestamp at which the given revision or release was authored */
public Long getAuthorTimestamp(long nodeId) {
if (authorTimestamp == null) {
throw new IllegalStateException("Author timestamps not loaded");
}
long res = authorTimestamp.getLong(nodeId);
return (res > Long.MIN_VALUE) ? res : null;
}
/** Return the timestamp offset at which the given revision or release was authored */
public Short getAuthorTimestampOffset(long nodeId) {
if (authorTimestampOffset == null) {
throw new IllegalStateException("Author timestamp offsets not loaded");
}
short res = authorTimestampOffset.getShort(nodeId);
return (res > Short.MIN_VALUE) ? res : null;
}
/** Load the timestamps at which the releases and revisions were committed */
public void loadCommitterTimestamps() throws IOException {
committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin");
committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin");
}
/** Return the timestamp at which the given revision was committed */
public Long getCommitterTimestamp(long nodeId) {
if (committerTimestamp == null) {
throw new IllegalStateException("Committer timestamps not loaded");
}
long res = committerTimestamp.getLong(nodeId);
return (res > Long.MIN_VALUE) ? res : null;
}
/** Return the timestamp offset at which the given revision was committed */
public Short getCommitterTimestampOffset(long nodeId) {
if (committerTimestampOffset == null) {
throw new IllegalStateException("Committer timestamp offsets not loaded");
}
short res = committerTimestampOffset.getShort(nodeId);
return (res > Short.MIN_VALUE) ? res : null;
}
/** Load the revision messages, the release messages and the origin URLs */
public void loadMessages() throws IOException {
messageBuffer = loadMappedBytes(path + ".property.message.bin");
messageOffsets = loadMappedLongs(path + ".property.message.offset.bin");
}
/** Get the message of the given revision or release node */
public byte[] getMessage(long nodeId) {
if (messageBuffer == null || messageOffsets == null) {
throw new IllegalStateException("Messages not loaded");
}
long startOffset = messageOffsets.getLong(nodeId);
if (startOffset == -1) {
return null;
}
return Base64.getDecoder().decode(getLine(messageBuffer, startOffset));
}
/** Get the URL of the given origin node */
public String getUrl(long nodeId) {
byte[] url = getMessage(nodeId);
return (url != null) ? new String(url) : null;
}
/** Load the release names */
public void loadTagNames() throws IOException {
tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin");
tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin");
}
/** Get the name of the given release node */
public byte[] getTagName(long nodeId) {
if (tagNameBuffer == null || tagNameOffsets == null) {
throw new IllegalStateException("Tag names not loaded");
}
long startOffset = tagNameOffsets.getLong(nodeId);
if (startOffset == -1) {
return null;
}
return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset));
}
/** Load the arc label names (directory entry names and snapshot branch names) */
public void loadLabelNames() throws IOException {
try {
edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl");
} catch (ConfigurationException e) {
throw new IOException(e);
}
}
/**
* Get the arc label name (either a directory entry name or snapshot branch name) associated with
* the given label ID
*/
public byte[] getLabelName(long labelId) {
if (edgeLabelNames == null) {
throw new IllegalStateException("Label names not loaded");
}
return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId));
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java
index 40610c1..3f865d0 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java
@@ -1,223 +1,230 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
import java.io.IOException;
import java.io.InputStream;
/**
* Class representing the compressed Software Heritage graph in a single direction.
*
* The compressed graph is stored using the WebGraph
* framework. This class contains an {@link ImmutableGraph} representing the graph itself, as well
* as a reference to the object containing the graph properties (e.g. node labels). Optionally,
* arc labels (properties stored on the graph edges) can also be loaded with the
* {@code loadLabelled*()} function family.
*
* @author The Software Heritage developers
* @see SwhGraphProperties
* @see SwhUnidirectionalGraph
*/
public class SwhUnidirectionalGraph extends ImmutableGraph implements SwhGraph {
    /** Underlying ImmutableGraph */
    private final ImmutableGraph graph;
    /** Labelled ImmutableGraph, null if labels are not loaded */
    private ArcLabelledImmutableGraph labelledGraph;
    /** Property data of the graph (id/type mappings etc.) */
    public SwhGraphProperties properties;

    public SwhUnidirectionalGraph(ImmutableGraph graph, SwhGraphProperties properties) {
        this.graph = graph;
        this.properties = properties;
    }

    protected SwhUnidirectionalGraph(ImmutableGraph graph, ArcLabelledImmutableGraph labelledGraph,
            SwhGraphProperties properties) {
        this.graph = graph;
        this.labelledGraph = labelledGraph;
        this.properties = properties;
    }

    /**
     * Load the (unlabelled) graph only, without the SWH properties.
     */
    public static SwhUnidirectionalGraph loadGraphOnly(LoadMethod method, String path, InputStream is,
            ProgressLogger pl) throws IOException {
        return new SwhUnidirectionalGraph(ImmutableGraph.load(method, path, is, pl), null);
    }

    /**
     * Load the labelled graph only, without the SWH properties.
     */
    public static SwhUnidirectionalGraph loadLabelledGraphOnly(LoadMethod method, String path, InputStream is,
            ProgressLogger pl) throws IOException {
        // The labelled graph files are stored next to the plain graph under the "-labelled" suffix.
        BitStreamArcLabelledImmutableGraph g = (BitStreamArcLabelledImmutableGraph) BitStreamArcLabelledImmutableGraph
                .load(method, path + "-labelled", is, pl);
        return new SwhUnidirectionalGraph(g.g, g, null);
    }

    /**
     * Load the SWH properties of the graph from a given path.
     */
    public void loadProperties(String path) throws IOException {
        properties = SwhGraphProperties.load(path);
    }

    /**
     * Setter for the SWH graph properties.
     *
     * @param properties The {@link SwhGraphProperties} object containing the graph properties
     */
    public void setProperties(SwhGraphProperties properties) {
        this.properties = properties;
    }

    /**
     * Load the unlabelled graph and its SWH properties.
     */
    public static SwhUnidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl)
            throws IOException {
        SwhUnidirectionalGraph g = loadGraphOnly(method, path, is, pl);
        g.loadProperties(path);
        return g;
    }

    /**
     * Load the labelled graph and its SWH properties.
     */
    public static SwhUnidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl)
            throws IOException {
        SwhUnidirectionalGraph g = loadLabelledGraphOnly(method, path, is, pl);
        g.loadProperties(path);
        return g;
    }

    // loadXXX methods of ImmutableGraph
    public static SwhUnidirectionalGraph load(String path, ProgressLogger pl) throws IOException {
        return load(LoadMethod.STANDARD, path, null, pl);
    }
    public static SwhUnidirectionalGraph load(String path) throws IOException {
        return load(LoadMethod.STANDARD, path, null, null);
    }
    public static SwhUnidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException {
        return load(LoadMethod.MAPPED, path, null, pl);
    }
    public static SwhUnidirectionalGraph loadMapped(String path) throws IOException {
        return load(LoadMethod.MAPPED, path, null, null);
    }
    public static SwhUnidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException {
        return load(LoadMethod.OFFLINE, path, null, pl);
    }
    public static SwhUnidirectionalGraph loadOffline(String path) throws IOException {
        return load(LoadMethod.OFFLINE, path, null, null);
    }

    // Labelled versions of the loadXXX methods from ImmutableGraph
    public static SwhUnidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException {
        return loadLabelled(LoadMethod.STANDARD, path, null, pl);
    }
    public static SwhUnidirectionalGraph loadLabelled(String path) throws IOException {
        return loadLabelled(LoadMethod.STANDARD, path, null, null);
    }
    public static SwhUnidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException {
        return loadLabelled(LoadMethod.MAPPED, path, null, pl);
    }
    public static SwhUnidirectionalGraph loadLabelledMapped(String path) throws IOException {
        return loadLabelled(LoadMethod.MAPPED, path, null, null);
    }
    public static SwhUnidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException {
        return loadLabelled(LoadMethod.OFFLINE, path, null, pl);
    }
    public static SwhUnidirectionalGraph loadLabelledOffline(String path) throws IOException {
        return loadLabelled(LoadMethod.OFFLINE, path, null, null);
    }

    @Override
    public SwhUnidirectionalGraph copy() {
        // Properties are shared between copies; only the underlying graphs are copied.
        return new SwhUnidirectionalGraph(this.graph.copy(),
                this.labelledGraph != null ? this.labelledGraph.copy() : null, this.properties);
    }

    @Override
    public boolean randomAccess() {
        return graph.randomAccess();
    }

    public void close() throws IOException {
        this.properties.close();
    }

    @Override
    public long numNodes() {
        return graph.numNodes();
    }

    @Override
    public long numArcs() {
        return graph.numArcs();
    }

    @Override
    public LazyLongIterator successors(long nodeId) {
        return graph.successors(nodeId);
    }

    /**
     * Returns a labelled node iterator for scanning the graph sequentially, starting from the
     * first node.
     *
     * @throws IllegalStateException if the labelled graph was not loaded
     */
    public ArcLabelledNodeIterator labelledNodeIterator() {
        if (labelledGraph == null) {
            throw new IllegalStateException("Calling labelledNodeIterator() but labels were not loaded.");
        }
        return labelledGraph.nodeIterator();
    }

    /**
     * Returns a labelled node iterator for scanning the graph sequentially, starting from a
     * given node.
     *
     * @throws IllegalStateException if the labelled graph was not loaded
     */
    public ArcLabelledNodeIterator labelledNodeIterator(long from) {
        if (labelledGraph == null) {
            throw new IllegalStateException("Calling labelledNodeIterator() but labels were not loaded.");
        }
        return labelledGraph.nodeIterator(from);
    }

    /**
     * Returns a labelled lazy iterator over the successors of a given node. The iteration
     * terminates when -1 is returned.
     *
     * @throws IllegalStateException if the labelled graph was not loaded
     */
    public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) {
        if (labelledGraph == null) {
            // Fixed copy-pasted message that previously referenced labelledNodeIterator().
            throw new IllegalStateException("Calling labelledSuccessors() but labels were not loaded.");
        }
        return labelledGraph.successors(x);
    }

    @Override
    public long outdegree(long nodeId) {
        return graph.outdegree(nodeId);
    }

    @Override
    public SwhGraphProperties getProperties() {
        return properties;
    }

    /** Returns the raw, unlabelled {@link ImmutableGraph} backing this graph. */
    public ImmutableGraph underlyingGraph() {
        return graph;
    }

    /** Returns the labelled graph backing this graph, or null if labels are not loaded. */
    public ArcLabelledImmutableGraph underlyingLabelledGraph() {
        return labelledGraph;
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java
index ee71713..1f12744 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java
@@ -1,185 +1,192 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import com.github.luben.zstd.ZstdInputStream;
import it.unimi.dsi.fastutil.bytes.ByteArrays;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
/**
 * A graph dataset in (zstd-compressed) CSV format.
 *
 * This format does not contain any properties apart from the SWHIDs of the nodes, and optionally
 * the labels of the edges and the permissions of the directory entries.
 *
 * The structure of the dataset is as follows: one directory per object type, each containing:
 *
 * <ul>
 * <li>a number of files <code>*.nodes.csv.zst</code> containing the SWHIDs of the objects stored in
 * the graph, one per line.</li>
 * <li>a number of files <code>*.edges.csv.zst</code> containing the edges of the graph, one per
 * line. The format of each edge is as follows:
 * <code>SRC_SWHID DST_SWHID [BASE64_LABEL] [INT_PERMISSION]</code>.</li>
 * </ul>
 */
public class CSVEdgeDataset implements GraphDataset {
    final static Logger logger = LoggerFactory.getLogger(CSVEdgeDataset.class);

    /** Root directory of the dataset, with one subdirectory per object type. */
    final private File datasetDir;

    public CSVEdgeDataset(String datasetPath) {
        this(new File(datasetPath));
    }

    public CSVEdgeDataset(File datasetDir) {
        if (!datasetDir.exists()) {
            throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist");
        }
        this.datasetDir = datasetDir;
    }

    /**
     * Reads every *.nodes.csv.zst and *.edges.csv.zst file in the dataset, calling the given
     * callbacks for each node and each edge encountered.
     */
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        File[] allTables = datasetDir.listFiles();
        if (allTables == null) {
            return;
        }
        for (File tableFile : allTables) {
            File[] allCsvFiles = tableFile.listFiles();
            if (allCsvFiles == null) {
                continue;
            }
            for (File csvFile : allCsvFiles) {
                if (csvFile.getName().endsWith(".edges.csv.zst")) {
                    readEdgesCsvZst(csvFile.getPath(), edgeCb);
                } else if (csvFile.getName().endsWith(".nodes.csv.zst")) {
                    readNodesCsvZst(csvFile.getPath(), nodeCb);
                }
            }
        }
    }

    /** Reads a zstd-compressed edge CSV file, calling {@code cb} for each edge. */
    public static void readEdgesCsvZst(String csvZstPath, GraphDataset.EdgeCallback cb) throws IOException {
        // try-with-resources: the input stream was previously leaked (never closed).
        try (InputStream csvInputStream = new ZstdInputStream(
                new BufferedInputStream(new FileInputStream(csvZstPath)))) {
            readEdgesCsv(csvInputStream, cb);
        }
    }

    /**
     * Reads an uncompressed edge CSV stream, calling {@code cb} for each edge. Each line has the
     * format: SRC_SWHID DST_SWHID [BASE64_LABEL] [INT_PERMISSION], whitespace-separated.
     * The stream is NOT closed by this method.
     */
    public static void readEdgesCsv(InputStream csvInputStream, GraphDataset.EdgeCallback cb) throws IOException {
        FastBufferedInputStream csvReader = new FastBufferedInputStream(csvInputStream);

        Charset charset = StandardCharsets.US_ASCII;
        byte[] array = new byte[1024];
        for (long line = 0;; line++) {
            int start = 0, len;
            // Read a full line, growing the buffer as needed.
            while ((len = csvReader.readLine(array, start, array.length - start,
                    FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) {
                start += len;
                array = ByteArrays.grow(array, array.length + 1);
            }
            if (len == -1)
                break; // EOF
            final int lineLength = start + len;

            // Skip whitespace at the start of the line.
            int offset = 0;
            while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
                offset++;
            if (offset == lineLength) {
                continue;
            }
            // NOTE(review): comment detection checks column 0, not `offset`; an indented '#'
            // line is parsed as data — confirm whether comments may only start at column 0.
            if (array[0] == '#')
                continue;

            // Scan source id.
            start = offset;
            while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
                offset++;
            final byte[] ss = Arrays.copyOfRange(array, start, offset);

            // Skip whitespace between identifiers.
            while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
                offset++;
            if (offset == lineLength) {
                logger.error("Error at line " + line + ": no target");
                continue;
            }

            // Scan target ID
            start = offset;
            while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
                offset++;
            final byte[] ts = Arrays.copyOfRange(array, start, offset);

            // Skip whitespace between identifiers.
            while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
                offset++;

            // Scan label (optional)
            byte[] ls = null;
            if (offset < lineLength) {
                start = offset;
                while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
                    offset++;
                ls = Arrays.copyOfRange(array, start, offset);
            }

            // Skip whitespace between identifiers.
            while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
                offset++;

            // Scan permission (optional)
            int permission = 0;
            if (offset < lineLength) {
                start = offset;
                while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
                    offset++;
                permission = Integer.parseInt(new String(array, start, offset - start, charset));
            }
            cb.onEdge(ss, ts, ls, permission);
        }
    }

    /** Reads a zstd-compressed node CSV file, calling {@code cb} for each node SWHID. */
    public static void readNodesCsvZst(String csvZstPath, GraphDataset.NodeCallback cb) throws IOException {
        // try-with-resources: the input stream was previously leaked (never closed).
        try (InputStream csvInputStream = new ZstdInputStream(
                new BufferedInputStream(new FileInputStream(csvZstPath)))) {
            readNodesCsv(csvInputStream, cb);
        }
    }

    /**
     * Reads an uncompressed node CSV stream (one SWHID per line), calling {@code cb} for each
     * node. The stream is NOT closed by this method.
     */
    public static void readNodesCsv(InputStream csvInputStream, GraphDataset.NodeCallback cb) throws IOException {
        FastBufferedInputStream csvReader = new FastBufferedInputStream(csvInputStream);

        byte[] array = new byte[1024];
        for (long line = 0;; line++) {
            int start = 0, len;
            // Read a full line, growing the buffer as needed.
            while ((len = csvReader.readLine(array, start, array.length - start,
                    FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) {
                start += len;
                array = ByteArrays.grow(array, array.length + 1);
            }
            if (len == -1)
                break; // EOF
            final int lineLength = start + len;

            // Skip whitespace at the start of the line.
            int offset = 0;
            while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
                offset++;
            if (offset == lineLength) {
                continue;
            }
            // NOTE(review): comment detection checks column 0, not `offset` — see readEdgesCsv.
            if (array[0] == '#')
                continue;

            // Scan source id.
            start = offset;
            while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
                offset++;
            final byte[] ss = Arrays.copyOfRange(array, start, offset);

            cb.onNode(ss);
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java b/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java
index ef13166..62d3460 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java
@@ -1,51 +1,58 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.io.BinIO;
import java.io.File;
import java.io.IOException;
/**
* CLI program used to compose two on-disk permutations.
*
* It takes two on-disk permutations as parameters, p1 and p2, and writes on disk (p1 o p2) at the
* given location. This is useful for multi-step compression (e.g., Unordered -> BFS -> LLP), as it
* can be used to merge all the intermediate permutations.
*/
public class ComposePermutations {
    /**
     * Parses the command-line arguments.
     *
     * Exits the process with status 1 on usage error, so a non-null result is guaranteed to the
     * caller.
     */
    private static JSAPResult parseArgs(String[] args) {
        JSAPResult config = null;
        try {
            SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
                    new UnflaggedOption("firstPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "The first permutation"),
                    new UnflaggedOption("secondPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "The second permutation"),
                    new UnflaggedOption("outputPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "The output permutation"),});

            config = jsap.parse(args);
            if (jsap.messagePrinted()) {
                System.exit(1);
            }
        } catch (JSAPException e) {
            e.printStackTrace();
            // Previously fell through with config == null, causing an NPE in main().
            System.exit(1);
        }
        return config;
    }

    /**
     * Loads the two on-disk permutations, composes them in place and stores the result at the
     * given output location.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        JSAPResult config = parseArgs(args);
        String firstPermFilename = config.getString("firstPermutation");
        String secondPermFilename = config.getString("secondPermutation");
        String outputPermFilename = config.getString("outputPermutation");

        long[][] firstPerm = BinIO.loadLongsBig(new File(firstPermFilename));
        long[][] secondPerm = BinIO.loadLongsBig(new File(secondPermFilename));

        // composePermutationsInPlace reuses the input arrays to avoid a third big allocation.
        long[][] outputPerm = Util.composePermutationsInPlace(firstPerm, secondPerm);

        BinIO.storeLongs(outputPerm, outputPermFilename);
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java
index e055f7d..9d58fff 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java
@@ -1,404 +1,411 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import com.github.luben.zstd.ZstdOutputStream;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.utils.Sort;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicLongArray;
/**
 * Read a graph dataset and extract all the unique node SWHIDs it contains, including the ones that
 * are not stored as actual objects in the graph, but only referred to by the edges.
 * Additionally, extract the set of all unique edge labels in the graph.
 *
 * <ul>
 * <li>The set of nodes is written in <code>${outputBasename}.nodes.csv.zst</code>, as a
 * zst-compressed sorted list of SWHIDs, one per line.</li>
 * <li>The set of edge labels is written in <code>${outputBasename}.labels.csv.zst</code>, as a
 * zst-compressed sorted list of labels encoded in base64, one per line.</li>
 * <li>The number of unique nodes referred to in the graph is written in a text file,
 * <code>${outputBasename}.nodes.count.txt</code></li>
 * <li>The number of unique edges referred to in the graph is written in a text file,
 * <code>${outputBasename}.edges.count.txt</code></li>
 * <li>The number of unique edge labels is written in a text file,
 * <code>${outputBasename}.labels.count.txt</code></li>
 * <li>Statistics on the number of nodes of each type are written in a text file,
 * <code>${outputBasename}.nodes.stats.txt</code></li>
 * <li>Statistics on the number of edges of each type are written in a text file,
 * <code>${outputBasename}.edges.stats.txt</code></li>
 * </ul>
 *
 * <p>
 * <b>Rationale:</b> Because the graph can contain holes, loose objects and dangling
 * objects, some nodes that are referred to as destinations in the edge relationships might not
 * actually be stored in the graph itself. However, to compress the graph using a graph compression
 * library, it is necessary to have a list of all the nodes in the graph, including the
 * ones that are simply referred to by the edges but not actually stored as concrete objects.
 * </p>
 *
 * <p>
 * This class reads the entire graph dataset, and uses <code>sort -u</code> to extract the set of
 * all the unique nodes and unique labels that will be needed as an input for the compression
 * process.
 * </p>
 */
public class ExtractNodes {
private final static Logger logger = LoggerFactory.getLogger(ExtractNodes.class);
// Create one thread per processor.
final static int numThreads = Runtime.getRuntime().availableProcessors();
// Allocate up to 20% of maximum memory for sorting subprocesses.
final static long sortBufferSize = (long) (Runtime.getRuntime().maxMemory() * 0.2 / numThreads / 2);
private static JSAPResult parseArgs(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the edges dataset"),
new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
"Basename of the output files"),
new FlaggedOption("format", JSAP.STRING_PARSER, "orc", JSAP.NOT_REQUIRED, 'f', "format",
"Format of the input dataset (orc, csv)"),
new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, String.valueOf(sortBufferSize) + "b",
JSAP.NOT_REQUIRED, 'S', "sort-buffer-size",
"Size of the memory buffer used by each sort process"),
new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir",
"Path to the temporary directory used by sort")});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
System.err.println("Usage error: " + e.getMessage());
System.exit(1);
}
return config;
}
public static void main(String[] args) throws IOException, InterruptedException {
JSAPResult parsedArgs = parseArgs(args);
String datasetPath = parsedArgs.getString("dataset");
String outputBasename = parsedArgs.getString("outputBasename");
String datasetFormat = parsedArgs.getString("format");
String sortBufferSize = parsedArgs.getString("sortBufferSize");
String sortTmpPath = parsedArgs.getString("sortTmpDir", null);
File sortTmpDir = new File(sortTmpPath);
sortTmpDir.mkdirs();
// Open edge dataset
GraphDataset dataset;
if (datasetFormat.equals("orc")) {
dataset = new ORCGraphDataset(datasetPath);
} else if (datasetFormat.equals("csv")) {
dataset = new CSVEdgeDataset(datasetPath);
} else {
throw new IllegalArgumentException("Unknown dataset format: " + datasetFormat);
}
extractNodes(dataset, outputBasename, sortBufferSize, sortTmpDir);
}
public static void extractNodes(GraphDataset dataset, String outputBasename, String sortBufferSize, File sortTmpDir)
throws IOException, InterruptedException {
// Read the dataset and write the nodes and labels to the sorting processes
AtomicLong edgeCount = new AtomicLong(0);
AtomicLongArray edgeCountByType = new AtomicLongArray(Node.Type.values().length * Node.Type.values().length);
int numThreads = Runtime.getRuntime().availableProcessors();
ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads);
Process[] nodeSorters = new Process[numThreads];
File[] nodeBatchPaths = new File[numThreads];
Process[] labelSorters = new Process[numThreads];
File[] labelBatches = new File[numThreads];
long[] progressCounts = new long[numThreads];
AtomicInteger nextThreadId = new AtomicInteger(0);
ThreadLocal threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement);
ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
pl.itemsName = "edges";
pl.start("Reading node/edge files and writing sorted batches.");
GraphDataset.NodeCallback nodeCallback = (node) -> {
int threadId = threadLocalId.get();
if (nodeSorters[threadId] == null) {
nodeBatchPaths[threadId] = File.createTempFile("nodes", ".txt", sortTmpDir);
nodeSorters[threadId] = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(),
List.of("-o", nodeBatchPaths[threadId].getPath()));
}
OutputStream nodeOutputStream = nodeSorters[threadId].getOutputStream();
nodeOutputStream.write(node);
nodeOutputStream.write('\n');
};
GraphDataset.NodeCallback labelCallback = (label) -> {
int threadId = threadLocalId.get();
if (labelSorters[threadId] == null) {
labelBatches[threadId] = File.createTempFile("labels", ".txt", sortTmpDir);
labelSorters[threadId] = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(),
List.of("-o", labelBatches[threadId].getPath()));
}
OutputStream labelOutputStream = labelSorters[threadId].getOutputStream();
labelOutputStream.write(label);
labelOutputStream.write('\n');
};
try {
forkJoinPool.submit(() -> {
try {
dataset.readEdges((node) -> {
nodeCallback.onNode(node);
}, (src, dst, label, perm) -> {
nodeCallback.onNode(src);
nodeCallback.onNode(dst);
if (label != null) {
labelCallback.onNode(label);
}
edgeCount.incrementAndGet();
// Extract type of src and dst from their SWHID: swh:1:XXX
byte[] srcTypeBytes = Arrays.copyOfRange(src, 6, 6 + 3);
byte[] dstTypeBytes = Arrays.copyOfRange(dst, 6, 6 + 3);
int srcType = Node.Type.byteNameToInt(srcTypeBytes);
int dstType = Node.Type.byteNameToInt(dstTypeBytes);
if (srcType != -1 && dstType != -1) {
edgeCountByType.incrementAndGet(srcType * Node.Type.values().length + dstType);
} else {
System.err.println("Invalid edge type: " + new String(srcTypeBytes) + " -> "
+ new String(dstTypeBytes));
System.exit(1);
}
int threadId = threadLocalId.get();
if (++progressCounts[threadId] > 1000) {
synchronized (pl) {
pl.update(progressCounts[threadId]);
}
progressCounts[threadId] = 0;
}
});
} catch (IOException e) {
throw new RuntimeException(e);
}
}).get();
} catch (ExecutionException e) {
throw new RuntimeException(e);
}
// Close all the sorters stdin
for (int i = 0; i < numThreads; i++) {
if (nodeSorters[i] != null) {
nodeSorters[i].getOutputStream().close();
}
if (labelSorters[i] != null) {
labelSorters[i].getOutputStream().close();
}
}
// Wait for sorting processes to finish
for (int i = 0; i < numThreads; i++) {
if (nodeSorters[i] != null) {
nodeSorters[i].waitFor();
}
if (labelSorters[i] != null) {
labelSorters[i].waitFor();
}
}
pl.done();
ArrayList nodeSortMergerOptions = new ArrayList<>(List.of("-m"));
ArrayList labelSortMergerOptions = new ArrayList<>(List.of("-m"));
for (int i = 0; i < numThreads; i++) {
if (nodeBatchPaths[i] != null) {
nodeSortMergerOptions.add(nodeBatchPaths[i].getPath());
}
if (labelBatches[i] != null) {
labelSortMergerOptions.add(labelBatches[i].getPath());
}
}
// Spawn node merge-sorting process
Process nodeSortMerger = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), nodeSortMergerOptions);
nodeSortMerger.getOutputStream().close();
OutputStream nodesFileOutputStream = new ZstdOutputStream(
new BufferedOutputStream(new FileOutputStream(outputBasename + ".nodes.csv.zst")));
NodesOutputThread nodesOutputThread = new NodesOutputThread(
new BufferedInputStream(nodeSortMerger.getInputStream()), nodesFileOutputStream);
nodesOutputThread.start();
// Spawn label merge-sorting process
Process labelSortMerger = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), labelSortMergerOptions);
labelSortMerger.getOutputStream().close();
OutputStream labelsFileOutputStream = new ZstdOutputStream(
new BufferedOutputStream(new FileOutputStream(outputBasename + ".labels.csv.zst")));
LabelsOutputThread labelsOutputThread = new LabelsOutputThread(
new BufferedInputStream(labelSortMerger.getInputStream()), labelsFileOutputStream);
labelsOutputThread.start();
pl.logger().info("Waiting for merge-sort and writing output files...");
nodeSortMerger.waitFor();
labelSortMerger.waitFor();
nodesOutputThread.join();
labelsOutputThread.join();
long[][] edgeCountByTypeArray = new long[Node.Type.values().length][Node.Type.values().length];
for (int i = 0; i < edgeCountByTypeArray.length; i++) {
for (int j = 0; j < edgeCountByTypeArray[i].length; j++) {
edgeCountByTypeArray[i][j] = edgeCountByType.get(i * Node.Type.values().length + j);
}
}
// Write node, edge and label counts/statistics
printEdgeCounts(outputBasename, edgeCount.get(), edgeCountByTypeArray);
printNodeCounts(outputBasename, nodesOutputThread.getNodeCount(), nodesOutputThread.getNodeTypeCounts());
printLabelCounts(outputBasename, labelsOutputThread.getLabelCount());
// Clean up sorted batches
for (int i = 0; i < numThreads; i++) {
if (nodeBatchPaths[i] != null) {
nodeBatchPaths[i].delete();
}
if (labelBatches[i] != null) {
labelBatches[i].delete();
}
}
}
/**
 * Writes edge statistics: the total edge count to {@code <basename>.edges.count.txt} and the
 * per-(src,dst)-type counts to {@code <basename>.edges.stats.txt}.
 *
 * @param basename basename of the output files
 * @param edgeCount total number of edges in the graph
 * @param edgeTypeCounts edge counts indexed by [source type][destination type]
 * @throws IOException if the output files cannot be written
 */
private static void printEdgeCounts(String basename, long edgeCount, long[][] edgeTypeCounts) throws IOException {
    // try-with-resources guarantees the writers are closed even if a write fails
    try (PrintWriter edgeCountWriter = new PrintWriter(basename + ".edges.count.txt")) {
        edgeCountWriter.println(edgeCount);
    }
    try (PrintWriter edgeTypesCountWriter = new PrintWriter(basename + ".edges.stats.txt")) {
        // TreeMap keeps the "src:dst" keys sorted, so the stats file is deterministic
        TreeMap<String, Long> edgeTypeCountsMap = new TreeMap<>();
        for (Node.Type src : Node.Type.values()) {
            for (Node.Type dst : Node.Type.values()) {
                long cnt = edgeTypeCounts[Node.Type.toInt(src)][Node.Type.toInt(dst)];
                if (cnt > 0) {
                    edgeTypeCountsMap.put(src.toString().toLowerCase() + ":" + dst.toString().toLowerCase(), cnt);
                }
            }
        }
        for (Map.Entry<String, Long> entry : edgeTypeCountsMap.entrySet()) {
            edgeTypesCountWriter.println(entry.getKey() + " " + entry.getValue());
        }
    }
}
/**
 * Writes node statistics: the total node count to {@code <basename>.nodes.count.txt} and the
 * per-type counts to {@code <basename>.nodes.stats.txt}.
 *
 * @param basename basename of the output files
 * @param nodeCount total number of nodes in the graph
 * @param nodeTypeCounts node counts indexed by node type
 * @throws IOException if the output files cannot be written
 */
private static void printNodeCounts(String basename, long nodeCount, long[] nodeTypeCounts) throws IOException {
    // try-with-resources guarantees the writers are closed even if a write fails
    try (PrintWriter nodeCountWriter = new PrintWriter(basename + ".nodes.count.txt")) {
        nodeCountWriter.println(nodeCount);
    }
    try (PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".nodes.stats.txt")) {
        // TreeMap keeps the type names sorted, so the stats file is deterministic
        TreeMap<String, Long> nodeTypeCountsMap = new TreeMap<>();
        for (Node.Type v : Node.Type.values()) {
            nodeTypeCountsMap.put(v.toString().toLowerCase(), nodeTypeCounts[Node.Type.toInt(v)]);
        }
        for (Map.Entry<String, Long> entry : nodeTypeCountsMap.entrySet()) {
            nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue());
        }
    }
}
/**
 * Writes the total label count to {@code <basename>.labels.count.txt}.
 *
 * @param basename basename of the output file
 * @param labelCount total number of unique labels
 * @throws IOException if the output file cannot be written
 */
private static void printLabelCounts(String basename, long labelCount) throws IOException {
    // try-with-resources guarantees the writer is closed even if the write fails
    try (PrintWriter labelCountWriter = new PrintWriter(basename + ".labels.count.txt")) {
        labelCountWriter.println(labelCount);
    }
}
/**
 * Thread draining the stdout of the node merge-sort process: copies each sorted SWHID line to
 * the output stream, while counting the total number of nodes and the number of nodes of each
 * type (parsed from the third {@code :}-separated field of the SWHID).
 */
private static class NodesOutputThread extends Thread {
    private final InputStream sortedNodesStream;
    private final OutputStream nodesOutputStream;

    private long nodeCount = 0;
    private final long[] nodeTypeCounts = new long[Node.Type.values().length];

    NodesOutputThread(InputStream sortedNodesStream, OutputStream nodesOutputStream) {
        this.sortedNodesStream = sortedNodesStream;
        this.nodesOutputStream = nodesOutputStream;
    }

    @Override
    public void run() {
        BufferedReader sortedLines = new BufferedReader(
                new InputStreamReader(sortedNodesStream, StandardCharsets.UTF_8));
        try {
            for (String swhid = sortedLines.readLine(); swhid != null; swhid = sortedLines.readLine()) {
                nodesOutputStream.write(swhid.getBytes(StandardCharsets.UTF_8));
                nodesOutputStream.write('\n');
                nodeCount++;
                try {
                    // SWHIDs look like "swh:1:<type>:<hash>"; field #2 is the node type
                    nodeTypeCounts[Node.Type.toInt(Node.Type.fromStr(swhid.split(":")[2]))]++;
                } catch (ArrayIndexOutOfBoundsException e) {
                    System.err.println("Error parsing SWHID: " + swhid);
                    System.exit(1);
                }
            }
            nodesOutputStream.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** @return total number of node lines copied so far */
    public long getNodeCount() {
        return nodeCount;
    }

    /** @return per-type node counts, indexed by {@code Node.Type.toInt} */
    public long[] getNodeTypeCounts() {
        return nodeTypeCounts;
    }
}
/**
 * Thread draining the stdout of the label merge-sort process: copies each sorted label line to
 * the output stream while counting the number of labels seen.
 */
private static class LabelsOutputThread extends Thread {
    private final InputStream sortedLabelsStream;
    private final OutputStream labelsOutputStream;

    private long labelCount = 0;

    LabelsOutputThread(InputStream sortedLabelsStream, OutputStream labelsOutputStream) {
        this.sortedLabelsStream = sortedLabelsStream;
        this.labelsOutputStream = labelsOutputStream;
    }

    @Override
    public void run() {
        BufferedReader sortedLines = new BufferedReader(
                new InputStreamReader(sortedLabelsStream, StandardCharsets.UTF_8));
        try {
            for (String label = sortedLines.readLine(); label != null; label = sortedLines.readLine()) {
                labelsOutputStream.write(label.getBytes(StandardCharsets.UTF_8));
                labelsOutputStream.write('\n');
                labelCount++;
            }
            labelsOutputStream.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** @return total number of label lines copied so far */
    public long getLabelCount() {
        return labelCount;
    }
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java
index 6bf20e4..fc5cc5b 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java
@@ -1,129 +1,136 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import com.github.luben.zstd.ZstdOutputStream;
import com.martiansoftware.jsap.*;
import org.softwareheritage.graph.utils.Sort;
import java.io.*;
import java.nio.charset.StandardCharsets;
/**
* Read a graph dataset and extract all the unique authors it contains.
*
*
 * This class reads the revision and release tables of the graph dataset, and uses
 * {@code sort -u} to extract the set of all the unique persons (name + email, potentially
 * pseudonymized) and stores them in a file.
*
*/
public class ExtractPersons {
    /**
     * Parses the command-line arguments.
     *
     * @param args raw command-line arguments
     * @return the parsed JSAP configuration (the process exits on usage errors)
     */
    private static JSAPResult parseArgs(String[] args) {
        JSAPResult config = null;
        try {
            // Fix: report this program's own name in the usage string (was ComposePermutations)
            SimpleJSAP jsap = new SimpleJSAP(ExtractPersons.class.getName(), "", new Parameter[]{
                    new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC dataset"),
                    new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "Basename of the output files"),
                    new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, "30%", JSAP.NOT_REQUIRED, 'S',
                            "sort-buffer-size", "Size of the memory buffer used by sort"),
                    new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir",
                            "Path to the temporary directory used by sort")});
            config = jsap.parse(args);
            if (jsap.messagePrinted()) {
                System.exit(1);
            }
        } catch (JSAPException e) {
            System.err.println("Usage error: " + e.getMessage());
            System.exit(1);
        }
        return config;
    }

    /**
     * Streams every base64-encoded person of the given column to the given output stream, one
     * per line, ready to be piped into {@code sort -u}.
     */
    private static void processAuthorColumn(ORCGraphDataset.SwhOrcTable table, String columnName, OutputStream stream)
            throws IOException {
        table.readBytes64Column(columnName, (swhid, personBase64) -> {
            stream.write(personBase64);
            stream.write('\n');
        });
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        JSAPResult parsedArgs = parseArgs(args);
        String datasetPath = parsedArgs.getString("dataset");
        String outputBasename = parsedArgs.getString("outputBasename");
        String sortBufferSize = parsedArgs.getString("sortBufferSize");
        String sortTmpDir = parsedArgs.getString("sortTmpDir", null);
        ORCGraphDataset dataset = new ORCGraphDataset(datasetPath);
        extractPersons(dataset, outputBasename, sortBufferSize, sortTmpDir);
    }

    /**
     * Reads the release and revision tables of the dataset, pipes all author/committer fields
     * through an external {@code sort} process and writes the resulting persons to
     * {@code <outputBasename>.persons.csv.zst}, plus a {@code .persons.count.txt} count file.
     *
     * @param dataset the ORC graph dataset to read
     * @param outputBasename basename of the output files
     * @param sortBufferSize memory buffer size passed to sort
     * @param sortTmpDir temporary directory used by sort, may be null
     */
    public static void extractPersons(ORCGraphDataset dataset, String outputBasename, String sortBufferSize,
            String sortTmpDir) throws IOException, InterruptedException {
        if (sortTmpDir != null) {
            // Fix: guard against NPE when no --temp-dir was given (its default is null)
            (new File(sortTmpDir)).mkdirs();
        }

        // Spawn person sorting process
        Process personSort = Sort.spawnSort(sortBufferSize, sortTmpDir);
        BufferedOutputStream personSortStdin = new BufferedOutputStream(personSort.getOutputStream());
        BufferedInputStream personSortStdout = new BufferedInputStream(personSort.getInputStream());
        OutputStream personsFileOutputStream = new ZstdOutputStream(
                new BufferedOutputStream(new FileOutputStream(outputBasename + ".persons.csv.zst")));
        PersonsOutputThread personsOutputThread = new PersonsOutputThread(personSortStdout, personsFileOutputStream);
        personsOutputThread.start();

        processAuthorColumn(dataset.getTable("release"), "author", personSortStdin);
        processAuthorColumn(dataset.getTable("revision"), "author", personSortStdin);
        processAuthorColumn(dataset.getTable("revision"), "committer", personSortStdin);

        // Wait for sorting processes to finish
        personSortStdin.close();
        personSort.waitFor();
        personsOutputThread.join();

        // Write person count statistics
        printPersonsCounts(outputBasename, personsOutputThread.getPersonCount());
    }

    /**
     * Writes the total person count to {@code <basename>.persons.count.txt}.
     */
    private static void printPersonsCounts(String basename, long labelCount) throws IOException {
        // try-with-resources guarantees the writer is closed even if the write fails
        try (PrintWriter personCountWriter = new PrintWriter(basename + ".persons.count.txt")) {
            personCountWriter.println(labelCount);
        }
    }

    /**
     * Thread draining the stdout of the sort process: copies each sorted person line to the
     * output stream while counting the number of unique persons.
     */
    private static class PersonsOutputThread extends Thread {
        private final InputStream sortedPersonsStream;
        private final OutputStream personsOutputStream;

        private long personCount = 0;

        PersonsOutputThread(InputStream sortedPersonsStream, OutputStream personsOutputStream) {
            this.sortedPersonsStream = sortedPersonsStream;
            this.personsOutputStream = personsOutputStream;
        }

        @Override
        public void run() {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(sortedPersonsStream, StandardCharsets.UTF_8));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    personsOutputStream.write(line.getBytes(StandardCharsets.UTF_8));
                    personsOutputStream.write('\n');
                    personCount++;
                }
                personsOutputStream.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        /** @return total number of person lines copied so far */
        public long getPersonCount() {
            return personCount;
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java
index ebd9adb..ae38cda 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java
@@ -1,60 +1,67 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import java.io.IOException;
/**
 * GraphDataset is a common interface to represent on-disk graph datasets in various formats,
 * usually extracted from the SWH archive with the swh-dataset tool.
 */
public interface GraphDataset {
    /** Callback invoked with the SWHID of a node, as raw bytes. */
    interface NodeCallback {
        void onNode(byte[] node) throws IOException;
    }

    /** Callback invoked with a (src, dst, label, permission) edge; label may be null. */
    interface EdgeCallback {
        void onEdge(byte[] src, byte[] dst, byte[] label, int permission) throws IOException;
    }

    /**
     * Read the graph dataset and call the callback methods for each node and edge encountered.
     *
     * <ul>
     * <li>The node callback is called for each object stored in the graph.</li>
     * <li>The edge callback is called for each relationship (between two nodes) stored in the
     * graph.</li>
     * </ul>
     *
     * <p>
     * Note that because the graph can contain holes, loose objects and dangling objects, the edge
     * callback may be called with parameters representing nodes that are not stored in the graph. This
     * is because some nodes that are referred to as destinations in the dataset might not be present in
     * the archive (e.g., a revision entry in a directory pointing to a revision that we have not
     * crawled yet).
     * </p>
     *
     * <p>
     * In order to generate a complete set of all the nodes that are referred to in the graph
     * dataset, see the {@link ExtractNodes} class.
     * </p>
     *
     * @param nodeCb callback for each node
     * @param edgeCb callback for each edge
     */
    void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException;

    /** Callback invoked with a SWHID, a timestamp and the timestamp's offset. */
    interface TimestampCallback {
        void onTimestamp(byte[] swhid, long timestamp, short offset) throws IOException;
    }

    /** Callback invoked with a SWHID and an associated long value. */
    interface LongCallback {
        void onLong(byte[] swhid, long value) throws IOException;
    }

    /** Callback invoked with a SWHID and an associated byte string. */
    interface BytesCallback {
        void onBytes(byte[] swhid, byte[] value) throws IOException;
    }

    /** Callback invoked with an edge whose endpoints and label are already encoded as longs. */
    interface HashedEdgeCallback {
        void onHashedEdge(long src, long dst, long label, int permission) throws IOException;
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java
index 9279c08..31531ec 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java
@@ -1,480 +1,487 @@
+/*
+ * Copyright (c) 2020-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.longs.LongHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.NodeIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.labels.DirEntry;
import org.softwareheritage.graph.labels.SwhLabel;
import org.softwareheritage.graph.maps.NodeIdMap;
import org.softwareheritage.graph.utils.ForkJoinBigQuickSort2;
import org.softwareheritage.graph.utils.ForkJoinQuickSort3;
import java.io.*;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.IntStream;
/**
 * Builds the arc-label files of a compressed graph: reads the labelled edges of the ORC graph
 * dataset, hashes (src, dst, label) triplets to longs, sorts them in parallel batches for both
 * the forward and the transposed graph, then merges the batches and writes BitStream label and
 * label-offset files alongside the graph.
 *
 * Fix: restored the generic type parameters that were stripped from the original (raw
 * {@code ThreadLocal}, {@code ObjectArrayList}, {@code List}, {@code Iterator}, ...); with raw
 * types {@code int threadId = threadLocalId.get()} does not even compile.
 */
public class LabelMapBuilder {
    final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class);

    // Create one thread per processor.
    final static int numThreads = Runtime.getRuntime().availableProcessors();
    // Allocate up to 40% of maximum memory.
    final static int DEFAULT_BATCH_SIZE = Math
            .min((int) (Runtime.getRuntime().maxMemory() * 0.4 / (numThreads * 8 * 3)), Arrays.MAX_ARRAY_SIZE);

    String orcDatasetPath;
    String graphPath;
    String outputGraphPath;
    String tmpDir;
    int batchSize;

    long numNodes;
    long numArcs;

    NodeIdMap nodeIdMap;
    Object2LongFunction<byte[]> filenameMph;
    long numFilenames;
    int totalLabelWidth;

    /**
     * @param orcDatasetPath path to the ORC graph dataset
     * @param graphPath basename of the compressed input graph
     * @param outputGraphPath basename of the output graph (defaults to graphPath when null)
     * @param batchSize number of triplets held in memory in each batch
     * @param tmpDir temporary directory for the batch files
     */
    public LabelMapBuilder(String orcDatasetPath, String graphPath, String outputGraphPath, int batchSize,
            String tmpDir) throws IOException {
        this.orcDatasetPath = orcDatasetPath;
        this.graphPath = graphPath;
        this.outputGraphPath = (outputGraphPath == null) ? graphPath : outputGraphPath;
        this.batchSize = batchSize;
        this.tmpDir = tmpDir;

        ImmutableGraph graph = ImmutableGraph.loadOffline(graphPath);
        this.numArcs = graph.numArcs();
        this.numNodes = graph.numNodes();

        this.nodeIdMap = new NodeIdMap(graphPath);

        filenameMph = NodeIdMap.loadMph(graphPath + ".labels.mph");
        numFilenames = getMPHSize(filenameMph);
        totalLabelWidth = DirEntry.labelWidth(numFilenames);
    }

    private static JSAPResult parse_args(String[] args) {
        JSAPResult config = null;
        try {
            SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{
                    new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"),
                    new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output graph"),
                    new FlaggedOption("outputGraphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o',
                            "output-graph", "Basename of the output graph, same as --graph if not specified"),
                    new FlaggedOption("batchSize", JSAP.INTEGER_PARSER, String.valueOf(DEFAULT_BATCH_SIZE),
                            JSAP.NOT_REQUIRED, 'b', "batch-size", "Number of triplets held in memory in each batch"),
                    new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 'T', "temp-dir",
                            "Temporary directory path"),});

            config = jsap.parse(args);
            if (jsap.messagePrinted()) {
                System.exit(1);
            }
        } catch (JSAPException e) {
            e.printStackTrace();
        }
        return config;
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        JSAPResult config = parse_args(args);
        String orcDataset = config.getString("dataset");
        String graphPath = config.getString("graphPath");
        String outputGraphPath = config.getString("outputGraphPath");
        int batchSize = config.getInt("batchSize");
        String tmpDir = config.getString("tmpDir");

        LabelMapBuilder builder = new LabelMapBuilder(orcDataset, graphPath, outputGraphPath, batchSize, tmpDir);
        builder.computeLabelMap();
    }

    /** Returns the number of keys of a minimal perfect hash function. */
    static long getMPHSize(Object2LongFunction<?> mph) {
        return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size();
    }

    /**
     * Top-level pipeline: generate sorted batches for both edge directions, then merge each set
     * of batches and write the label files of the forward and transposed graphs.
     */
    void computeLabelMap() throws IOException {
        File tempDirFile = new File(tmpDir);
        ObjectArrayList<File> forwardBatches = new ObjectArrayList<>();
        ObjectArrayList<File> backwardBatches = new ObjectArrayList<>();
        genSortedBatches(forwardBatches, backwardBatches, tempDirFile);

        BatchEdgeLabelLineIterator forwardBatchHeapIterator = new BatchEdgeLabelLineIterator(forwardBatches);
        writeLabels(forwardBatchHeapIterator, graphPath, outputGraphPath);
        for (File batch : forwardBatches) {
            batch.delete();
        }

        BatchEdgeLabelLineIterator backwardBatchHeapIterator = new BatchEdgeLabelLineIterator(backwardBatches);
        writeLabels(backwardBatchHeapIterator, graphPath + "-transposed", outputGraphPath + "-transposed");
        for (File batch : backwardBatches) {
            batch.delete();
        }

        logger.info("Done");
    }

    /**
     * Reads all labelled edges, accumulating them per-thread into in-memory triplet arrays that
     * are sorted and flushed to disk as batch files whenever they fill up (in both the forward
     * and the reversed direction).
     */
    void genSortedBatches(ObjectArrayList<File> forwardBatches, ObjectArrayList<File> backwardBatches,
            File tempDirFile) throws IOException {
        logger.info("Initializing batch arrays.");
        long[][] srcArrays = new long[numThreads][batchSize];
        long[][] dstArrays = new long[numThreads][batchSize];
        long[][] labelArrays = new long[numThreads][batchSize];
        int[] indexes = new int[numThreads];
        long[] progressCounts = new long[numThreads];

        ProgressLogger plSortingBatches = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
        plSortingBatches.itemsName = "edges";
        plSortingBatches.expectedUpdates = this.numArcs;
        plSortingBatches.start("Reading edges and writing sorted batches.");

        // Each worker thread gets a stable slot id into the per-thread arrays above
        AtomicInteger nextThreadId = new AtomicInteger(0);
        ThreadLocal<Integer> threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement);

        readHashedEdgeLabels((src, dst, label, perms) -> {
            // System.err.println("0. Input " + src + " " + dst + " " + label + " " + perms);
            int threadId = threadLocalId.get();
            int idx = indexes[threadId]++;
            srcArrays[threadId][idx] = src;
            dstArrays[threadId][idx] = dst;
            labelArrays[threadId][idx] = DirEntry.toEncoded(label, perms);
            // Progress is batched per-thread to limit contention on the shared logger
            if (++progressCounts[threadId] > 1000) {
                synchronized (plSortingBatches) {
                    plSortingBatches.update(progressCounts[threadId]);
                }
                progressCounts[threadId] = 0;
            }

            if (idx == batchSize - 1) {
                processBidirectionalBatches(batchSize, srcArrays[threadId], dstArrays[threadId],
                        labelArrays[threadId], tempDirFile, forwardBatches, backwardBatches);
                indexes[threadId] = 0;
            }
        });

        // Flush the remaining partial batch of each thread
        IntStream.range(0, numThreads).parallel().forEach(t -> {
            int idx = indexes[t];
            if (idx > 0) {
                try {
                    processBidirectionalBatches(idx, srcArrays[t], dstArrays[t], labelArrays[t], tempDirFile,
                            forwardBatches, backwardBatches);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        });

        // Trigger the GC to free up the large arrays
        for (int i = 0; i < numThreads; i++) {
            srcArrays[i] = null;
            dstArrays[i] = null;
            labelArrays[i] = null;
        }

        logger.info("Created " + forwardBatches.size() + " forward batches and " + backwardBatches.size()
                + " backward batches.");
    }

    /**
     * Reads the labelled edges of the dataset in a thread pool, mapping SWHIDs to node ids and
     * filenames to label ids, and passes the hashed triplets to the callback. Unlabelled edges
     * are skipped.
     */
    void readHashedEdgeLabels(GraphDataset.HashedEdgeCallback cb) throws IOException {
        ORCGraphDataset dataset = new ORCGraphDataset(orcDatasetPath);
        ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads);
        try {
            forkJoinPool.submit(() -> {
                try {
                    dataset.readEdges((node) -> {
                    }, (src, dst, label, perms) -> {
                        if (label == null) {
                            return;
                        }
                        long srcNode = nodeIdMap.getNodeId(src);
                        long dstNode = nodeIdMap.getNodeId(dst);
                        long labelId = filenameMph.getLong(label);
                        cb.onHashedEdge(srcNode, dstNode, labelId, perms);
                    });
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }).get();
        } catch (InterruptedException | ExecutionException e) {
            throw new RuntimeException(e);
        }
    }

    /** Writes one batch in (src, dst) order and one in (dst, src) order for the transposed graph. */
    void processBidirectionalBatches(final int n, final long[] source, final long[] target, final long[] labels,
            final File tempDir, final List<File> forwardBatches, final List<File> backwardBatches)
            throws IOException {
        processBatch(n, source, target, labels, tempDir, forwardBatches);
        processBatch(n, target, source, labels, tempDir, backwardBatches);
    }

    /**
     * Sorts the first n (source, target, label) triplets and writes them, delta-compressed and
     * deduplicated, to a new batch file that is appended to {@code batches}.
     */
    void processBatch(final int n, final long[] source, final long[] target, final long[] labels, final File tempDir,
            final List<File> batches) throws IOException {
        if (n == 0) {
            return;
        }
        ForkJoinQuickSort3.parallelQuickSort(source, target, labels, 0, n);

        final File batchFile = File.createTempFile("batch", ".bitstream", tempDir);
        batchFile.deleteOnExit();
        batches.add(batchFile);
        final OutputBitStream batch = new OutputBitStream(batchFile);

        // Compute unique triplets
        int u = 1;
        for (int i = n - 1; i-- != 0;) {
            if (source[i] != source[i + 1] || target[i] != target[i + 1] || labels[i] != labels[i + 1]) {
                u++;
            }
        }
        batch.writeDelta(u);

        // Write batch
        long prevSource = source[0];
        batch.writeLongDelta(prevSource);
        batch.writeLongDelta(target[0]);
        batch.writeLongDelta(labels[0]);
        // System.err.println("1. Wrote " + prevSource + " " + target[0] + " " + labels[0]);

        for (int i = 1; i < n; i++) {
            if (source[i] != prevSource) {
                // Default case, we write (source - prevsource, target, label)
                batch.writeLongDelta(source[i] - prevSource);
                batch.writeLongDelta(target[i]);
                batch.writeLongDelta(labels[i]);
                prevSource = source[i];
            } else if (target[i] != target[i - 1] || labels[i] != labels[i - 1]) {
                // Case where source is identical with prevsource, but target or label differ.
                // We write (0, target - prevtarget, label)
                batch.writeLongDelta(0);
                batch.writeLongDelta(target[i] - target[i - 1]);
                batch.writeLongDelta(labels[i]);
            } else {
                continue;
            }
            // System.err.println("1. Wrote " + source[i] + " " + target[i] + " " + labels[i]);
        }
        batch.close();
    }

    /**
     * Iterates the given graph in parallel with the merged sorted label lines, writing a label
     * (possibly empty) for every arc of the graph, plus per-node label offsets and the
     * "-labelled.properties" descriptor file.
     */
    void writeLabels(EdgeLabelLineIterator mapLines, String graphBasename, String outputGraphBasename)
            throws IOException {
        // Loading the graph to iterate
        ImmutableGraph graph = ImmutableGraph.loadMapped(graphBasename);

        // Get the sorted output and write the labels and label offsets
        ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
        plLabels.itemsName = "edges";
        plLabels.expectedUpdates = this.numArcs;
        plLabels.start("Writing the labels to the label file: " + outputGraphBasename + "-labelled.*");

        OutputBitStream labels = new OutputBitStream(
                new File(outputGraphBasename + "-labelled" + BitStreamArcLabelledImmutableGraph.LABELS_EXTENSION));
        OutputBitStream offsets = new OutputBitStream(new File(
                outputGraphBasename + "-labelled" + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION));
        offsets.writeGamma(0);

        EdgeLabelLine line = new EdgeLabelLine(-1, -1, -1, -1);

        NodeIterator it = graph.nodeIterator();
        boolean started = false;
        ArrayList<DirEntry> labelBuffer = new ArrayList<>(128);
        while (it.hasNext()) {
            long srcNode = it.nextLong();

            long bits = 0;
            LazyLongIterator s = it.successors();
            long dstNode;
            while ((dstNode = s.nextLong()) >= 0) {
                // Collect all the label lines matching this (srcNode, dstNode) arc
                while (line != null && line.srcNode <= srcNode && line.dstNode <= dstNode) {
                    if (line.srcNode == srcNode && line.dstNode == dstNode) {
                        labelBuffer.add(new DirEntry(line.filenameId, line.permission));
                    }

                    if (!mapLines.hasNext())
                        break;
                    line = mapLines.next();
                    if (!started) {
                        plLabels.start("Writing label map to file...");
                        started = true;
                    }
                }

                SwhLabel l = new SwhLabel("edgelabel", totalLabelWidth, labelBuffer.toArray(new DirEntry[0]));
                labelBuffer.clear();
                bits += l.toBitStream(labels, -1);
                plLabels.lightUpdate();
            }
            offsets.writeLongGamma(bits);
        }

        labels.close();
        offsets.close();
        plLabels.done();
        graph = null;

        PrintWriter pw = new PrintWriter(new FileWriter(outputGraphBasename + "-labelled.properties"));
        pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName());
        pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + SwhLabel.class.getName()
                + "(DirEntry," + totalLabelWidth + ")");
        pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = "
                + Paths.get(outputGraphBasename).getFileName());
        pw.close();
    }

    /** A single decoded (srcNode, dstNode, filenameId, permission) label line. */
    public static class EdgeLabelLine {
        public long srcNode;
        public long dstNode;
        public long filenameId;
        public int permission;

        public EdgeLabelLine(long labelSrcNode, long labelDstNode, long labelFilenameId, int labelPermission) {
            this.srcNode = labelSrcNode;
            this.dstNode = labelDstNode;
            this.filenameId = labelFilenameId;
            this.permission = labelPermission;
        }
    }

    /** Iterator over the sorted label lines of a graph. */
    public abstract static class EdgeLabelLineIterator implements Iterator<EdgeLabelLine> {
        @Override
        public abstract boolean hasNext();

        @Override
        public abstract EdgeLabelLine next();
    }

    /**
     * Merges several sorted batch files into a single sorted stream of label lines, using an
     * indirect priority queue keyed on the current source node of each batch.
     */
    public static class BatchEdgeLabelLineIterator extends EdgeLabelLineIterator {
        private static final int STD_BUFFER_SIZE = 128 * 1024;

        private final InputBitStream[] batchIbs;
        private final int[] inputStreamLength;
        private final long[] refArray;
        private final LongHeapSemiIndirectPriorityQueue queue;
        private final long[] prevTarget;

        /** The last returned node (-1 if no node has been returned yet). */
        private long lastNode;
        private long[][] lastNodeSuccessors = LongBigArrays.EMPTY_BIG_ARRAY;
        private long[][] lastNodeLabels = LongBigArrays.EMPTY_BIG_ARRAY;
        private long lastNodeOutdegree;
        private long lastNodeCurrentSuccessor;

        public BatchEdgeLabelLineIterator(final List<File> batches) throws IOException {
            this.batchIbs = new InputBitStream[batches.size()];
            this.refArray = new long[batches.size()];
            this.prevTarget = new long[batches.size()];
            this.queue = new LongHeapSemiIndirectPriorityQueue(refArray);
            this.inputStreamLength = new int[batches.size()];

            for (int i = 0; i < batches.size(); i++) {
                batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE);
                this.inputStreamLength[i] = batchIbs[i].readDelta();
                this.refArray[i] = batchIbs[i].readLongDelta();
                queue.enqueue(i);
            }

            this.lastNode = -1;
            this.lastNodeOutdegree = 0;
            this.lastNodeCurrentSuccessor = 0;
        }

        public boolean hasNextNode() {
            return !queue.isEmpty();
        }

        private void readNextNode() throws IOException {
            assert hasNext();

            int i;
            lastNode++;
            lastNodeOutdegree = 0;
            lastNodeCurrentSuccessor = 0;

            /*
             * We extract elements from the queue as long as their target is equal to last. If during the
             * process we exhaust a batch, we close it.
             */
            while (!queue.isEmpty() && refArray[i = queue.first()] == lastNode) {
                lastNodeSuccessors = BigArrays.grow(lastNodeSuccessors, lastNodeOutdegree + 1);
                lastNodeLabels = BigArrays.grow(lastNodeLabels, lastNodeOutdegree + 1);

                long target = prevTarget[i] += batchIbs[i].readLongDelta();
                long label = batchIbs[i].readLongDelta();
                BigArrays.set(lastNodeSuccessors, lastNodeOutdegree, target);
                BigArrays.set(lastNodeLabels, lastNodeOutdegree, label);

                // System.err.println("2. Read " + lastNode + " " + target + " " + label);

                if (--inputStreamLength[i] == 0) {
                    queue.dequeue();
                    batchIbs[i].close();
                    batchIbs[i] = null;
                } else {
                    // We read a new source and update the queue.
                    final long sourceDelta = batchIbs[i].readLongDelta();
                    if (sourceDelta != 0) {
                        refArray[i] += sourceDelta;
                        prevTarget[i] = 0;
                        queue.changed();
                    }
                }
                lastNodeOutdegree++;
            }

            // Neither quicksort nor heaps are stable, so we reestablish order here.
            // LongBigArrays.radixSort(lastNodeSuccessors, lastNodeLabels, 0, lastNodeOutdegree);
            ForkJoinBigQuickSort2.parallelQuickSort(lastNodeSuccessors, lastNodeLabels, 0, lastNodeOutdegree);
        }

        @Override
        public boolean hasNext() {
            return lastNodeCurrentSuccessor < lastNodeOutdegree || hasNextNode();
        }

        @Override
        public EdgeLabelLine next() {
            if (lastNode == -1 || lastNodeCurrentSuccessor >= lastNodeOutdegree) {
                try {
                    do {
                        readNextNode();
                    } while (hasNextNode() && lastNodeOutdegree == 0);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }

            long src = lastNode;
            long dst = BigArrays.get(lastNodeSuccessors, lastNodeCurrentSuccessor);
            long compressedLabel = BigArrays.get(lastNodeLabels, lastNodeCurrentSuccessor);
            long labelName = DirEntry.labelNameFromEncoded(compressedLabel);
            int permission = DirEntry.permissionFromEncoded(compressedLabel);
            // System.err.println("3. Output (encoded): " + src + " " + dst + " " + compressedLabel);
            // System.err.println("4. Output (decoded): " + src + " " + dst + " " + labelName + " " +
            // permission);

            lastNodeCurrentSuccessor++;
            return new EdgeLabelLine(src, dst, labelName, permission);
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java
index 80e0c7e..74ef2f3 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java
@@ -1,194 +1,201 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import com.github.luben.zstd.ZstdInputStream;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SWHID;
import org.softwareheritage.graph.maps.NodeIdMap;
import org.softwareheritage.graph.maps.NodeTypesMap;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;
import java.util.concurrent.TimeUnit;
/**
 * Create maps needed at runtime by the graph service, in particular:
 *
 * <ul>
 * <li>WebGraph long node id &rarr; SWHID</li>
 * <li>WebGraph long node id &rarr; SWH node type (enum)</li>
 * </ul>
 *
 * @author The Software Heritage developers
 */
public class NodeMapBuilder {

    final static String SORT_BUFFER_SIZE = "40%";

    final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class);

    /**
     * Main entrypoint.
     *
     * @param args command line arguments
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV");
            System.exit(1);
        }
        String graphPath = args[0];
        String tmpDir = args[1];

        logger.info("starting maps generation...");
        precomputeNodeIdMap(graphPath, tmpDir);
        logger.info("maps generation completed");
    }

    /**
     * Computes and dumps on disk mapping files.
     *
     * Reads on stdin a zstd-compressed list of SWHIDs (one per line), hashes each through the
     * graph's MPH, permutes the result with the BFS .order file, and produces both the
     * node-id-to-SWHID binary map and the node-id-to-node-type bit vector.
     *
     * @param graphPath path of the compressed graph
     * @param tmpDir temporary directory handed to the external sort(1) process
     * @throws IOException if any of the map files cannot be read or written
     */
    static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException {
        ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
        ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
        plSWHID2Node.itemsName = "nodes";
        plNode2SWHID.itemsName = "nodes";

        // first half of SWHID->node mapping: SWHID -> WebGraph MPH (long)
        // Fix: restore the <byte[]> type argument (lost in a previous edit); the MPH is
        // queried with byte[] keys below.
        Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(graphPath + ".mph");
        long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
        plSWHID2Node.expectedUpdates = nbIds;
        plNode2SWHID.expectedUpdates = nbIds;

        // second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long)
        long[][] bfsMap = LongBigArrays.newBigArray(nbIds);
        logger.info("loading BFS order file...");
        long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap);
        logger.info("BFS order file loaded");
        if (loaded != nbIds) {
            logger.error("graph contains " + nbIds + " nodes, but read " + loaded);
            System.exit(2);
        }

        /*
         * Read on stdin a list of SWHIDs, hash them with MPH, then permute them according to the .order
         * file
         */
        FastBufferedReader buffer = new FastBufferedReader(
                new InputStreamReader(new ZstdInputStream(new BufferedInputStream(System.in))));
        LineIterator swhidIterator = new LineIterator(buffer);

        /*
         * The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically
         * sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the
         * sorting job to /usr/bin/sort via pipes
         */
        ProcessBuilder processBuilder = new ProcessBuilder();
        // sort numerically on the second field (the node id)
        processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE,
                "--temporary-directory", tmpDir);
        Process sort = processBuilder.start();
        BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream());
        BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream());

        // for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap
        try (BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream(
                new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) {
            /*
             * background handler for sort output, it will be fed SWHID/node pairs, and will itself fill
             * nodeToSwhidMap as soon as data from sort is ready.
             */
            SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID);
            outputHandler.start();

            /*
             * Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to
             * efficiently check edge restrictions.
             */
            final int nbBitsPerNodeType = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2));
            LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds);
            LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType);

            plSWHID2Node.start("Hashing SWHIDs to fill sort input");
            for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) {
                String swhidStr = swhidIterator.next().toString();
                SWHID swhid = new SWHID(swhidStr);
                // MPH hash, then BFS permutation, gives the final WebGraph node id
                long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII));
                long nodeId = BigArrays.get(bfsMap, mphId);
                sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII));
                nodeTypesMap.set(nodeId, swhid.getType().ordinal());
                plSWHID2Node.lightUpdate();
            }
            plSWHID2Node.done();
            // closing sort's stdin signals EOF so it can start emitting sorted output
            sort_stdin.close();

            // write type map
            logger.info("storing type map");
            BinIO.storeObject(nodeTypesMap, graphPath + NodeTypesMap.NODE_TO_TYPE);
            logger.info("type map stored");

            // wait for nodeToSwhidMap filling
            try {
                logger.info("waiting for node2swhid map...");
                int sortExitCode = sort.waitFor();
                if (sortExitCode != 0) {
                    logger.error("sort returned non-zero exit code: " + sortExitCode);
                    System.exit(2);
                }
                outputHandler.join();
            } catch (InterruptedException e) {
                logger.error("processing of sort output failed with: " + e);
                System.exit(2);
            }
        }
    }

    /**
     * Background thread reading the external sort output ("SWHID\tNODE_ID" lines, sorted by node
     * id) and writing each SWHID, in node-id order, to the binary node->SWHID map.
     */
    private static class SortOutputHandler extends Thread {
        private final Scanner input;
        private final OutputStream output;
        private final ProgressLogger pl;

        SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) {
            this.input = new Scanner(input, StandardCharsets.US_ASCII);
            this.output = output;
            this.pl = pl;
        }

        public void run() {
            boolean sortDone = false;
            logger.info("node2swhid: waiting for sort output...");
            while (input.hasNextLine()) {
                if (!sortDone) {
                    // First output line means sort has finished consuming its input.
                    sortDone = true;
                    this.pl.start("filling node2swhid map");
                }
                String line = input.nextLine(); // format: SWHID NODE_ID
                SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID
                try {
                    output.write(swhid.toBytes());
                } catch (IOException e) {
                    logger.error("writing to node->SWHID map failed with: " + e);
                }
                this.pl.lightUpdate();
            }
            this.pl.done();
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java
index 9ba0e38..d16b5ae 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java
@@ -1,711 +1,718 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.common.primitives.Bytes;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.*;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
/**
* A graph dataset in ORC format.
*
* This format of dataset is a full export of the graph, including all the edge and node properties.
*
* For convenience purposes, this class also provides a main method to print all the edges of the
* graph, so that the output can be piped to
* {@link it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph}.
*
* Reading edges from ORC files using this class is about ~2.5 times slower than reading them
* directly from a plaintext format.
*/
public class ORCGraphDataset implements GraphDataset {
final static Logger logger = LoggerFactory.getLogger(ORCGraphDataset.class);
final static public int ORC_BATCH_SIZE = 16 * 1024;
private File datasetDir;
// No-op constructor leaving datasetDir unset; presumably for subclasses that
// initialize it themselves -- TODO confirm against subclass usage.
protected ORCGraphDataset() {
}
/** Open the dataset rooted at the given path. */
public ORCGraphDataset(String datasetPath) {
    this(new File(datasetPath));
}
/**
 * Open the dataset rooted at the given directory.
 *
 * @throws IllegalArgumentException if the directory does not exist
 */
public ORCGraphDataset(File datasetDir) {
    if (!datasetDir.exists()) {
        throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist");
    }
    this.datasetDir = datasetDir;
}
/**
 * Return the given table as a {@link SwhOrcTable}. The return value can be down-casted to the type
 * of the specific table it represents. Returns null when the table directory does not exist or
 * the name is not a known table.
 */
public SwhOrcTable getTable(String tableName) {
    File tableDir = new File(datasetDir, tableName);
    if (!tableDir.exists()) {
        return null;
    }
    // Dispatch on the table name; unknown names behave like missing directories.
    if (tableName.equals("skipped_content")) {
        return new SkippedContentOrcTable(tableDir);
    } else if (tableName.equals("content")) {
        return new ContentOrcTable(tableDir);
    } else if (tableName.equals("directory")) {
        return new DirectoryOrcTable(tableDir);
    } else if (tableName.equals("directory_entry")) {
        return new DirectoryEntryOrcTable(tableDir);
    } else if (tableName.equals("revision")) {
        return new RevisionOrcTable(tableDir);
    } else if (tableName.equals("revision_history")) {
        return new RevisionHistoryOrcTable(tableDir);
    } else if (tableName.equals("release")) {
        return new ReleaseOrcTable(tableDir);
    } else if (tableName.equals("snapshot_branch")) {
        return new SnapshotBranchOrcTable(tableDir);
    } else if (tableName.equals("snapshot")) {
        return new SnapshotOrcTable(tableDir);
    } else if (tableName.equals("origin_visit_status")) {
        return new OriginVisitStatusOrcTable(tableDir);
    } else if (tableName.equals("origin_visit")) {
        return new OriginVisitOrcTable(tableDir);
    } else if (tableName.equals("origin")) {
        return new OriginOrcTable(tableDir);
    }
    return null;
}
/** Return all the tables in this dataset as a map of {@link SwhOrcTable}, keyed by table name. */
public Map<String, SwhOrcTable> allTables() {
    // Fix: restore the <String, SwhOrcTable> type arguments lost in a previous edit;
    // callers iterate over values() as SwhOrcTable.
    HashMap<String, SwhOrcTable> tables = new HashMap<>();
    File[] tableDirs = datasetDir.listFiles();
    if (tableDirs == null) {
        // datasetDir is not a listable directory: no tables.
        return tables;
    }
    for (File tableDir : tableDirs) {
        SwhOrcTable table = getTable(tableDir.getName());
        if (table != null) {
            tables.put(tableDir.getName(), table);
        }
    }
    return tables;
}
/** Read all nodes and edges of the dataset, table by table, feeding them to the callbacks. */
public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
    // Fix: restore the map's type arguments (lost in a previous edit); iterating a raw
    // Map's values() as SwhOrcTable does not compile.
    Map<String, SwhOrcTable> tables = allTables();
    for (SwhOrcTable table : tables.values()) {
        table.readEdges(nodeCb, edgeCb);
    }
}
/**
 * A class representing an ORC table, stored on disk as a set of ORC files all in the same
 * directory.
 */
public static class ORCTable {
    private final File tableDir;

    public ORCTable(File tableDir) {
        if (!tableDir.exists()) {
            throw new IllegalArgumentException("Table " + tableDir.getName() + " does not exist");
        }
        this.tableDir = tableDir;
    }

    public static ORCTable load(File tableDir) {
        return new ORCTable(tableDir);
    }

    /**
     * Utility function for byte columns. Return as a byte array the value of the given row in the
     * column vector, or null if the cell is null.
     */
    public static byte[] getBytesRow(BytesColumnVector columnVector, int row) {
        if (columnVector.isRepeating) {
            // All rows share the value stored at index 0.
            row = 0;
        }
        if (columnVector.isNull[row]) {
            return null;
        }
        return Arrays.copyOfRange(columnVector.vector[row], columnVector.start[row],
                columnVector.start[row] + columnVector.length[row]);
    }

    /**
     * Utility function for long columns. Return as a Long the value of the given row in the column
     * vector, or null if the cell is null.
     */
    public static Long getLongRow(LongColumnVector columnVector, int row) {
        if (columnVector.isRepeating) {
            row = 0;
        }
        if (columnVector.isNull[row]) {
            return null;
        }
        return columnVector.vector[row];
    }

    /** Handler invoked for each batch of rows read from the table. */
    interface ReadOrcBatchHandler {
        // Fix: restore the Map<String, Integer> type arguments (lost in a previous edit);
        // implementors index batch.cols with columnMap.get(...), which requires Integer.
        void accept(VectorizedRowBatch batch, Map<String, Integer> columnMap) throws IOException;
    }

    /**
     * Read the table, calling the given handler for each new batch of rows. Optionally, if columns is
     * not null, will only scan the columns present in this set instead of the entire table.
     *
     * If this method is called from within a ForkJoinPool, the ORC table will be read in parallel using
     * that thread pool. Otherwise, the ORC files will be read sequentially.
     */
    public void readOrcTable(ReadOrcBatchHandler batchHandler, Set<String> columns) throws IOException {
        File[] listing = tableDir.listFiles();
        if (listing == null) {
            throw new IOException("No files found in " + tableDir.getName());
        }
        ForkJoinPool forkJoinPool = ForkJoinTask.getPool();
        if (forkJoinPool == null) {
            // Sequential case
            for (File file : listing) {
                readOrcFile(file.getPath(), batchHandler, columns);
            }
        } else {
            // Parallel case
            ArrayList<File> listingArray = new ArrayList<>(Arrays.asList(listing));
            listingArray.parallelStream().forEach(file -> {
                try {
                    readOrcFile(file.getPath(), batchHandler, columns);
                } catch (IOException e) {
                    // parallelStream lambdas cannot throw checked exceptions; rewrap.
                    throw new RuntimeException(e);
                }
            });
        }
    }

    private void readOrcFile(String path, ReadOrcBatchHandler batchHandler, Set<String> columns)
            throws IOException {
        try (Reader reader = OrcFile.createReader(new Path(path), OrcFile.readerOptions(new Configuration()))) {
            TypeDescription schema = reader.getSchema();
            Reader.Options options = reader.options();
            if (columns != null) {
                // Restrict the scan to the requested columns only.
                options.include(createColumnsToRead(schema, columns));
            }
            Map<String, Integer> columnMap = getColumnMap(schema);
            try (RecordReader records = reader.rows(options)) {
                VectorizedRowBatch batch = reader.getSchema().createRowBatch(ORC_BATCH_SIZE);
                while (records.nextBatch(batch)) {
                    batchHandler.accept(batch, columnMap);
                }
            }
        }
    }

    /** Map each top-level field name of the schema to its column position. */
    private static Map<String, Integer> getColumnMap(TypeDescription schema) {
        Map<String, Integer> columnMap = new HashMap<>();
        List<String> fieldNames = schema.getFieldNames();
        for (int i = 0; i < fieldNames.size(); i++) {
            columnMap.put(fieldNames.get(i), i);
        }
        return columnMap;
    }

    /** Build the boolean inclusion mask (indexed by column ID) for the requested columns. */
    private static boolean[] createColumnsToRead(TypeDescription schema, Set<String> columns) {
        boolean[] columnsToRead = new boolean[schema.getMaximumId() + 1];
        List<String> fieldNames = schema.getFieldNames();
        List<TypeDescription> columnTypes = schema.getChildren();
        for (int i = 0; i < fieldNames.size(); i++) {
            if (columns.contains(fieldNames.get(i))) {
                logger.debug("Adding column " + fieldNames.get(i) + " with ID " + i + " to the read list");
                TypeDescription type = columnTypes.get(i);
                // Include the column itself plus all of its nested sub-columns.
                for (int id = type.getId(); id <= type.getMaximumId(); id++) {
                    columnsToRead[id] = true;
                }
            }
        }
        return columnsToRead;
    }
}
/** Base class for SWH-specific ORC tables. */
public static class SwhOrcTable {
    // Underlying generic ORC table reader.
    protected ORCTable orcTable;

    // SWHID prefixes, one per node type, prepended to raw identifiers to build full SWHIDs.
    protected static final byte[] cntPrefix = "swh:1:cnt:".getBytes();
    protected static final byte[] dirPrefix = "swh:1:dir:".getBytes();
    protected static final byte[] revPrefix = "swh:1:rev:".getBytes();
    protected static final byte[] relPrefix = "swh:1:rel:".getBytes();
    protected static final byte[] snpPrefix = "swh:1:snp:".getBytes();
    protected static final byte[] oriPrefix = "swh:1:ori:".getBytes();

    /** Name of the column holding the row identifier; overridden by tables without an "id" column. */
    protected String getIdColumn() {
        return "id";
    }

    /** SWHID prefix for this table's node type; must be overridden by tables that emit nodes. */
    protected byte[] getSwhidPrefix() {
        throw new UnsupportedOperationException();
    }

    /** Build a full SWHID from a raw row identifier. */
    protected byte[] idToSwhid(byte[] id) {
        return Bytes.concat(getSwhidPrefix(), id);
    }

    // No-op constructor for subclasses that set orcTable themselves.
    protected SwhOrcTable() {
    }

    public SwhOrcTable(File tableDir) {
        orcTable = new ORCTable(tableDir);
    }

    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        // No nodes or edges to read in the table by default.
    }

    /** Origin node identifiers are the hex-encoded sha1 of the origin URL. */
    protected static byte[] urlToOriginId(byte[] url) {
        return DigestUtils.sha1Hex(url).getBytes();
    }

    /** Stream every row's SWHID to the callback. */
    public void readIdColumn(NodeCallback cb) throws IOException {
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
            for (int row = 0; row < batch.size; row++) {
                byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row));
                cb.onNode(id);
            }
        }, Set.of(getIdColumn()));
    }

    /** Stream (SWHID, long value) pairs to the callback, skipping rows where the value is null. */
    public void readLongColumn(String longColumn, LongCallback cb) throws IOException {
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
            LongColumnVector dateVector = (LongColumnVector) batch.cols[columnMap.get(longColumn)];
            for (int row = 0; row < batch.size; row++) {
                byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row));
                Long date = ORCTable.getLongRow(dateVector, row);
                if (date != null) {
                    cb.onLong(id, date);
                }
            }
        }, Set.of(getIdColumn(), longColumn));
    }

    /**
     * Stream (SWHID, timestamp, offset) triples to the callback, skipping rows where the
     * offset is null.
     */
    public void readTimestampColumn(String dateColumn, String dateOffsetColumn, TimestampCallback cb)
            throws IOException {
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
            TimestampColumnVector dateVector = (TimestampColumnVector) batch.cols[columnMap.get(dateColumn)];
            LongColumnVector dateOffsetVector = (LongColumnVector) batch.cols[columnMap.get(dateOffsetColumn)];
            for (int row = 0; row < batch.size; row++) {
                byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row));
                long date = dateVector.getTimestampAsLong(row); // rounded to seconds
                Long dateOffset = ORCTable.getLongRow(dateOffsetVector, row);
                if (dateOffset != null) {
                    cb.onTimestamp(id, date, dateOffset.shortValue());
                }
            }
        }, Set.of(getIdColumn(), dateColumn, dateOffsetColumn));
    }

    /** Stream (SWHID, base64-encoded column value) pairs to the callback. */
    public void readBytes64Column(String longColumn, BytesCallback cb) throws IOException {
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
            BytesColumnVector valueVector = (BytesColumnVector) batch.cols[columnMap.get(longColumn)];
            for (int row = 0; row < batch.size; row++) {
                byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row));
                // NOTE(review): getBytesRow() returns null for null cells, which would NPE in
                // encode(); presumably this column is non-nullable -- confirm against the schema.
                byte[] value = Base64.getEncoder().encode(ORCTable.getBytesRow(valueVector, row));
                cb.onBytes(id, value);
            }
        }, Set.of(getIdColumn(), longColumn));
    }
}
/** ORC table for skipped contents: contributes content nodes only, no edges. */
public static class SkippedContentOrcTable extends SwhOrcTable {
    public SkippedContentOrcTable(File tableDir) {
        super(tableDir);
    }

    // Rows are identified by their sha1_git rather than an "id" column.
    @Override
    protected String getIdColumn() {
        return "sha1_git";
    }

    @Override
    protected byte[] getSwhidPrefix() {
        return cntPrefix;
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        readIdColumn(nodeCb);
    }
}
/** ORC table for contents: contributes content nodes only, no edges. */
public static class ContentOrcTable extends SwhOrcTable {
    public ContentOrcTable(File tableDir) {
        super(tableDir);
    }

    // Rows are identified by their sha1_git rather than an "id" column.
    @Override
    protected String getIdColumn() {
        return "sha1_git";
    }

    @Override
    protected byte[] getSwhidPrefix() {
        return cntPrefix;
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        readIdColumn(nodeCb);
    }
}
/** ORC table for directories: contributes directory nodes only (edges come from directory_entry). */
public static class DirectoryOrcTable extends SwhOrcTable {
    public DirectoryOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    protected byte[] getSwhidPrefix() {
        return dirPrefix;
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        readIdColumn(nodeCb);
    }
}
/** ORC table for directory entries: yields labelled dir->{cnt,dir,rev} edges. */
public static class DirectoryEntryOrcTable extends SwhOrcTable {
    public DirectoryEntryOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        // Recognized values of the "type" column; entries of any other type are skipped.
        byte[] cntType = "file".getBytes();
        byte[] dirType = "dir".getBytes();
        byte[] revType = "rev".getBytes();
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector srcVector = (BytesColumnVector) batch.cols[columnMap.get("directory_id")];
            BytesColumnVector dstVector = (BytesColumnVector) batch.cols[columnMap.get("target")];
            BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("type")];
            BytesColumnVector labelVector = (BytesColumnVector) batch.cols[columnMap.get("name")];
            LongColumnVector permissionVector = (LongColumnVector) batch.cols[columnMap.get("perms")];

            for (int row = 0; row < batch.size; row++) {
                // Pick the SWHID prefix matching the entry's target type.
                byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row);
                byte[] targetPrefix;
                if (Arrays.equals(targetType, cntType)) {
                    targetPrefix = cntPrefix;
                } else if (Arrays.equals(targetType, dirType)) {
                    targetPrefix = dirPrefix;
                } else if (Arrays.equals(targetType, revType)) {
                    targetPrefix = revPrefix;
                } else {
                    continue;
                }

                byte[] src = Bytes.concat(dirPrefix, ORCTable.getBytesRow(srcVector, row));
                byte[] dst = Bytes.concat(targetPrefix, ORCTable.getBytesRow(dstVector, row));
                // Entry names are emitted base64-encoded.
                byte[] label = Base64.getEncoder().encode(ORCTable.getBytesRow(labelVector, row));
                Long permission = ORCTable.getLongRow(permissionVector, row);
                // Null permission is mapped to 0.
                edgeCb.onEdge(src, dst, label, permission != null ? permission.intValue() : 0);
            }
        }, Set.of("directory_id", "target", "type", "name", "perms"));
    }
}
/** ORC table for revisions: contributes revision nodes and rev->dir edges. */
public static class RevisionOrcTable extends SwhOrcTable {
    public RevisionOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    protected byte[] getSwhidPrefix() {
        return revPrefix;
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector revisionIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")];
            BytesColumnVector directoryIdVector = (BytesColumnVector) batch.cols[columnMap.get("directory")];
            for (int row = 0; row < batch.size; row++) {
                byte[] revisionId = Bytes.concat(revPrefix, ORCTable.getBytesRow(revisionIdVector, row));
                byte[] directoryId = Bytes.concat(dirPrefix, ORCTable.getBytesRow(directoryIdVector, row));
                // Each row yields both the revision node and its edge to the root directory.
                nodeCb.onNode(revisionId);
                edgeCb.onEdge(revisionId, directoryId, null, -1);
            }
        }, Set.of("id", "directory"));
    }
}
/** ORC table for revision parents: yields rev->parent-rev edges (no new nodes). */
public static class RevisionHistoryOrcTable extends SwhOrcTable {
    public RevisionHistoryOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector revisionIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")];
            BytesColumnVector parentIdVector = (BytesColumnVector) batch.cols[columnMap.get("parent_id")];
            for (int row = 0; row < batch.size; row++) {
                byte[] parentId = Bytes.concat(revPrefix, ORCTable.getBytesRow(parentIdVector, row));
                byte[] revisionId = Bytes.concat(revPrefix, ORCTable.getBytesRow(revisionIdVector, row));
                // Edge direction: child revision points to its parent.
                edgeCb.onEdge(revisionId, parentId, null, -1);
            }
        }, Set.of("id", "parent_id"));
    }
}
/** ORC table for releases: contributes release nodes and rel->{cnt,dir,rev,rel} edges. */
public static class ReleaseOrcTable extends SwhOrcTable {
    public ReleaseOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    protected byte[] getSwhidPrefix() {
        return relPrefix;
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        // Recognized values of the "target_type" column; any other type is skipped.
        byte[] cntType = "content".getBytes();
        byte[] dirType = "directory".getBytes();
        byte[] revType = "revision".getBytes();
        byte[] relType = "release".getBytes();
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector releaseIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")];
            BytesColumnVector targetIdVector = (BytesColumnVector) batch.cols[columnMap.get("target")];
            BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("target_type")];
            for (int row = 0; row < batch.size; row++) {
                byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row);

                // Pick the SWHID prefix matching the release's target type.
                byte[] targetPrefix;
                if (Arrays.equals(targetType, cntType)) {
                    targetPrefix = cntPrefix;
                } else if (Arrays.equals(targetType, dirType)) {
                    targetPrefix = dirPrefix;
                } else if (Arrays.equals(targetType, revType)) {
                    targetPrefix = revPrefix;
                } else if (Arrays.equals(targetType, relType)) {
                    targetPrefix = relPrefix;
                } else {
                    continue;
                }

                byte[] releaseId = Bytes.concat(relPrefix, ORCTable.getBytesRow(releaseIdVector, row));
                byte[] targetId = Bytes.concat(targetPrefix, ORCTable.getBytesRow(targetIdVector, row));
                // Each row yields both the release node and its edge to the target.
                nodeCb.onNode(releaseId);
                edgeCb.onEdge(releaseId, targetId, null, -1);
            }
        }, Set.of("id", "target", "target_type"));
    }
}
/** ORC table for snapshots: contributes snapshot nodes only (edges come from snapshot_branch). */
public static class SnapshotOrcTable extends SwhOrcTable {
    public SnapshotOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    protected byte[] getSwhidPrefix() {
        return snpPrefix;
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        readIdColumn(nodeCb);
    }
}
/** ORC table for snapshot branches: yields labelled snp->{cnt,dir,rev,rel} edges. */
public static class SnapshotBranchOrcTable extends SwhOrcTable {
    public SnapshotBranchOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        // Recognized values of the "target_type" column; other types (e.g. aliases) are skipped.
        byte[] cntType = "content".getBytes();
        byte[] dirType = "directory".getBytes();
        byte[] revType = "revision".getBytes();
        byte[] relType = "release".getBytes();
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector snapshotIdVector = (BytesColumnVector) batch.cols[columnMap.get("snapshot_id")];
            BytesColumnVector targetIdVector = (BytesColumnVector) batch.cols[columnMap.get("target")];
            BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("target_type")];
            BytesColumnVector branchNameVector = (BytesColumnVector) batch.cols[columnMap.get("name")];
            for (int row = 0; row < batch.size; row++) {
                byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row);

                // Pick the SWHID prefix matching the branch's target type.
                byte[] targetPrefix;
                if (Arrays.equals(targetType, cntType)) {
                    targetPrefix = cntPrefix;
                } else if (Arrays.equals(targetType, dirType)) {
                    targetPrefix = dirPrefix;
                } else if (Arrays.equals(targetType, revType)) {
                    targetPrefix = revPrefix;
                } else if (Arrays.equals(targetType, relType)) {
                    targetPrefix = relPrefix;
                } else {
                    continue;
                }

                byte[] snapshotId = Bytes.concat(snpPrefix, ORCTable.getBytesRow(snapshotIdVector, row));
                byte[] targetId = Bytes.concat(targetPrefix, ORCTable.getBytesRow(targetIdVector, row));
                // Branch names are emitted base64-encoded as the edge label.
                byte[] branchName = Base64.getEncoder().encode(ORCTable.getBytesRow(branchNameVector, row));
                nodeCb.onNode(snapshotId);
                edgeCb.onEdge(snapshotId, targetId, branchName, -1);
            }
        }, Set.of("snapshot_id", "name", "target", "target_type"));
    }
}
/** ORC table for origin visit statuses: yields ori->snp edges. */
public static class OriginVisitStatusOrcTable extends SwhOrcTable {
    public OriginVisitStatusOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector originUrlVector = (BytesColumnVector) batch.cols[columnMap.get("origin")];
            BytesColumnVector snapshotIdVector = (BytesColumnVector) batch.cols[columnMap.get("snapshot")];
            for (int row = 0; row < batch.size; row++) {
                // Origin node id is derived by hashing the origin URL.
                byte[] originId = urlToOriginId(ORCTable.getBytesRow(originUrlVector, row));
                byte[] snapshot_id = ORCTable.getBytesRow(snapshotIdVector, row);
                // Visits without a resulting snapshot produce no edge.
                if (snapshot_id == null || snapshot_id.length == 0) {
                    continue;
                }
                edgeCb.onEdge(Bytes.concat(oriPrefix, originId), Bytes.concat(snpPrefix, snapshot_id), null, -1);
            }
        }, Set.of("origin", "snapshot"));
    }
}
/** ORC table for origin visits: contributes no nodes or edges (inherits the no-op readEdges). */
public static class OriginVisitOrcTable extends SwhOrcTable {
    public OriginVisitOrcTable(File tableDir) {
        super(tableDir);
    }
}
/** ORC table for origins: node id is the "swh:1:ori:" prefix plus the sha1 of the origin URL. */
public static class OriginOrcTable extends SwhOrcTable {
    public OriginOrcTable(File tableDir) {
        super(tableDir);
    }

    @Override
    protected byte[] getSwhidPrefix() {
        return oriPrefix;
    }

    // Origins have no intrinsic hash: the identifier is derived from the URL.
    @Override
    protected byte[] idToSwhid(byte[] id) {
        return Bytes.concat(getSwhidPrefix(), urlToOriginId(id));
    }

    @Override
    protected String getIdColumn() {
        return "url";
    }

    @Override
    public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
        readIdColumn(nodeCb);
    }

    /** Stream (origin SWHID, base64-encoded URL) pairs for every origin in the table. */
    public void readURLs(BytesCallback cb) throws IOException {
        orcTable.readOrcTable((batch, columnMap) -> {
            BytesColumnVector urlVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
            for (int row = 0; row < batch.size; row++) {
                byte[] id = idToSwhid(ORCTable.getBytesRow(urlVector, row));
                byte[] url = Base64.getEncoder().encode(ORCTable.getBytesRow(urlVector, row));
                cb.onBytes(id, url);
            }
        }, Set.of(getIdColumn()));
    }
}
/**
 * Export an ORC graph to the CSV edge dataset format as two different files,
 * <code>nodes.csv.zst</code> and <code>edges.csv.zst</code>.
 *
 * @param orcDataset path to the ORC dataset directory
 * @param csvDatasetBasename basename of the two output files
 */
public static void exportToCsvDataset(String orcDataset, String csvDatasetBasename) throws IOException {
    ORCGraphDataset dataset = new ORCGraphDataset(orcDataset);
    File nodesFile = new File(csvDatasetBasename + ".nodes.csv.zst");
    File edgesFile = new File(csvDatasetBasename + ".edges.csv.zst");
    // Fix: use try-with-resources -- the streams were previously never closed, leaving the
    // zstd frames unflushed and the output files truncated.
    try (FastBufferedOutputStream nodesOut = new FastBufferedOutputStream(
            new ZstdOutputStream(new FileOutputStream(nodesFile)));
            FastBufferedOutputStream edgesOut = new FastBufferedOutputStream(
                    new ZstdOutputStream(new FileOutputStream(edgesFile)))) {
        dataset.readEdges((node) -> {
            nodesOut.write(node);
            nodesOut.write('\n');
        }, (src, dst, label, perms) -> {
            // Edge line format: "src dst [label] [perms]", single-space separated.
            edgesOut.write(src);
            edgesOut.write(' ');
            edgesOut.write(dst);
            if (label != null) {
                edgesOut.write(' ');
                edgesOut.write(label);
                // Fix: removed a stray trailing space here that produced a double separator
                // before perms (and a trailing space when perms == -1).
            }
            if (perms != -1) {
                edgesOut.write(' ');
                edgesOut.write(Long.toString(perms).getBytes());
            }
            edgesOut.write('\n');
        });
    }
}
/**
 * Print all the edges of the graph to stdout. Can be piped to
 * {@link it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph} to import the graph dataset and convert
 * it to a {@link it.unimi.dsi.big.webgraph.BVGraph}.
 */
public static void printSimpleEdges(String orcDataset) throws IOException {
    final ORCGraphDataset dataset = new ORCGraphDataset(orcDataset);
    final FastBufferedOutputStream stdout = new FastBufferedOutputStream(System.out);
    // Nodes are ignored; only "src dst" pairs are emitted.
    dataset.readEdges(node -> {
    }, (src, dst, label, perms) -> {
        stdout.write(src);
        stdout.write(' ');
        stdout.write(dst);
        stdout.write('\n');
    });
    // Flush (but do not close) standard output so every buffered edge is emitted.
    stdout.flush();
}
/** Command-line entrypoint: prints the edges of the ORC dataset at args[0] to stdout. */
public static void main(String[] args) throws IOException {
    printSimpleEdges(args[0]);
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java b/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java
index 05531f5..9320d98 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java
@@ -1,252 +1,259 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.IntStream;
import it.unimi.dsi.big.webgraph.BVGraph;
import it.unimi.dsi.big.webgraph.ImmutableSequentialGraph;
import it.unimi.dsi.big.webgraph.NodeIterator;
import it.unimi.dsi.big.webgraph.Transform;
import it.unimi.dsi.fastutil.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.logging.ProgressLogger;
public class ScatteredArcsORCGraph extends ImmutableSequentialGraph {
    private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredArcsORCGraph.class);

    /** The default number of threads. */
    public static final int DEFAULT_NUM_THREADS = Runtime.getRuntime().availableProcessors();

    /**
     * The default batch size: roughly 40% of the maximum heap split across two long arrays
     * (source/target) per thread, capped at the maximum array size.
     */
    public static final int DEFAULT_BATCH_SIZE = Math
            .min((int) (Runtime.getRuntime().maxMemory() * 0.4 / (DEFAULT_NUM_THREADS * 8 * 2)), Arrays.MAX_ARRAY_SIZE);

    /** The batch graph used to return node iterators. */
    private final Transform.BatchGraph batchGraph;

    /**
     * Creates a scattered-arcs ORC graph.
     *
     * @param dataset the SWH ORC graph dataset to read the edges from
     * @param function an explicitly provided function from the string representation of nodes to node
     *            numbers
     * @param n the number of nodes of the graph
     * @param numThreads the number of threads to use
     * @param batchSize the number of longs in a batch; two arrays of longs of this size will be
     *            allocated by each thread
     * @param tempDir a temporary directory for the batches, or {@code null} for
     *            {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice
     * @param pl a progress logger, or {@code null}
     * @throws IOException if an I/O error occurs while reading the dataset or writing the batches
     */
    public ScatteredArcsORCGraph(final ORCGraphDataset dataset, final Object2LongFunction<byte[]> function,
            final long n, final int numThreads, final int batchSize, final File tempDir, final ProgressLogger pl)
            throws IOException {
        final ObjectArrayList<File> batches = new ObjectArrayList<>();
        ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads);

        // Per-thread buffers: each worker fills its own src/dst array pair and flushes them
        // as a sorted on-disk batch whenever the pair is full.
        long[][] srcArrays = new long[numThreads][batchSize];
        long[][] dstArrays = new long[numThreads][batchSize];
        int[] indexes = new int[numThreads];
        long[] progressCounts = new long[numThreads];
        AtomicInteger pairs = new AtomicInteger(0);

        // Assign a stable, dense id in [0, numThreads) to each worker of the pool, so each
        // worker indexes its own slot of the arrays above without locking.
        AtomicInteger nextThreadId = new AtomicInteger(0);
        ThreadLocal<Integer> threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement);

        if (pl != null) {
            pl.itemsName = "arcs";
            pl.start("Creating sorted batches...");
        }

        try {
            forkJoinPool.submit(() -> {
                try {
                    // First callback (nodes) is a no-op: only edges are of interest here.
                    dataset.readEdges((node) -> {
                    }, (src, dst, label, perms) -> {
                        // Translate both endpoints to node numbers via the MPH function.
                        long s = function.getLong(src);
                        long t = function.getLong(dst);
                        int threadId = threadLocalId.get();
                        int idx = indexes[threadId]++;
                        srcArrays[threadId][idx] = s;
                        dstArrays[threadId][idx] = t;
                        if (idx == batchSize - 1) {
                            // Buffer full: sort it and spill it to a temporary batch file.
                            pairs.addAndGet(Transform.processBatch(batchSize, srcArrays[threadId], dstArrays[threadId],
                                    tempDir, batches));
                            indexes[threadId] = 0;
                        }
                        // Update the progress logger in chunks to limit lock contention on pl.
                        if (pl != null && ++progressCounts[threadId] > 1000) {
                            synchronized (pl) {
                                pl.update(progressCounts[threadId]);
                            }
                            progressCounts[threadId] = 0;
                        }
                    });
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }).get();
        } catch (InterruptedException | ExecutionException e) {
            throw new RuntimeException(e);
        }

        // Flush the partially-filled buffer of every thread as a final batch.
        IntStream.range(0, numThreads).parallel().forEach(t -> {
            int idx = indexes[t];
            if (idx > 0) {
                try {
                    pairs.addAndGet(Transform.processBatch(idx, srcArrays[t], dstArrays[t], tempDir, batches));
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        });

        // Drop references to the large arrays so they can be garbage-collected.
        for (int i = 0; i < numThreads; i++) {
            srcArrays[i] = null;
            dstArrays[i] = null;
        }

        if (pl != null) {
            pl.done();
            pl.logger().info("Created " + batches.size() + " batches.");
        }

        batchGraph = new Transform.BatchGraph(n, pairs.get(), batches);
    }

    @Override
    public long numNodes() {
        if (batchGraph == null)
            throw new UnsupportedOperationException(
                    "The number of nodes is unknown (you need to generate all the batches first).");
        return batchGraph.numNodes();
    }

    @Override
    public long numArcs() {
        if (batchGraph == null)
            throw new UnsupportedOperationException(
                    "The number of arcs is unknown (you need to generate all the batches first).");
        return batchGraph.numArcs();
    }

    @Override
    public NodeIterator nodeIterator(final long from) {
        return batchGraph.nodeIterator(from);
    }

    @Override
    public boolean hasCopiableIterators() {
        return batchGraph.hasCopiableIterators();
    }

    @Override
    public ScatteredArcsORCGraph copy() {
        return this;
    }

    /**
     * Command-line entry point: converts a scattered list of arcs from an ORC graph dataset into a
     * BVGraph stored under the given basename.
     */
    @SuppressWarnings("unchecked")
    public static void main(final String[] args)
            throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException {
        final SimpleJSAP jsap = new SimpleJSAP(ScatteredArcsORCGraph.class.getName(),
                "Converts a scattered list of arcs from an ORC graph dataset into a BVGraph.",
                new Parameter[]{
                        new FlaggedOption("logInterval", JSAP.LONG_PARSER,
                                Long.toString(ProgressLogger.DEFAULT_LOG_INTERVAL), JSAP.NOT_REQUIRED, 'l',
                                "log-interval", "The minimum time interval between activity logs in milliseconds."),
                        new FlaggedOption("numThreads", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_NUM_THREADS),
                                JSAP.NOT_REQUIRED, 't', "threads", "The number of threads to use."),
                        new FlaggedOption("batchSize", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_BATCH_SIZE),
                                JSAP.NOT_REQUIRED, 's', "batch-size", "The maximum size of a batch, in arcs."),
                        new FlaggedOption("tempDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T',
                                "temp-dir", "A directory for all temporary batch files."),
                        new FlaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f',
                                "function",
                                "A serialised function from strings to longs that will be used to translate identifiers to node numbers."),
                        new FlaggedOption("comp", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'c', "comp",
                                "A compression flag (may be specified several times).")
                                        .setAllowMultipleDeclarations(true),
                        new FlaggedOption("windowSize", JSAP.INTEGER_PARSER,
                                String.valueOf(BVGraph.DEFAULT_WINDOW_SIZE), JSAP.NOT_REQUIRED, 'w', "window-size",
                                "Reference window size (0 to disable)."),
                        new FlaggedOption("maxRefCount", JSAP.INTEGER_PARSER,
                                String.valueOf(BVGraph.DEFAULT_MAX_REF_COUNT), JSAP.NOT_REQUIRED, 'm', "max-ref-count",
                                "Maximum number of backward references (-1 for ∞)."),
                        new FlaggedOption("minIntervalLength", JSAP.INTEGER_PARSER,
                                String.valueOf(BVGraph.DEFAULT_MIN_INTERVAL_LENGTH), JSAP.NOT_REQUIRED, 'i',
                                "min-interval-length", "Minimum length of an interval (0 to disable)."),
                        new FlaggedOption("zetaK", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_ZETA_K),
                                JSAP.NOT_REQUIRED, 'k', "zeta-k", "The k parameter for zeta-k codes."),
                        new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
                                JSAP.NOT_GREEDY, "The path to the ORC graph dataset."),
                        new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
                                JSAP.NOT_GREEDY, "The basename of the output graph"),});

        final JSAPResult jsapResult = jsap.parse(args);
        if (jsap.messagePrinted())
            System.exit(1);

        String basename = jsapResult.getString("basename");
        String orcDatasetPath = jsapResult.getString("dataset");
        ORCGraphDataset orcDataset = new ORCGraphDataset(orcDatasetPath);

        // Combine the (possibly repeated) compression flags, looked up by reflection on BVGraph.
        int flags = 0;
        for (final String compressionFlag : jsapResult.getStringArray("comp")) {
            try {
                flags |= BVGraph.class.getField(compressionFlag).getInt(BVGraph.class);
            } catch (final Exception notFound) {
                throw new JSAPException("Compression method " + compressionFlag + " unknown.");
            }
        }

        final int windowSize = jsapResult.getInt("windowSize");
        final int zetaK = jsapResult.getInt("zetaK");
        int maxRefCount = jsapResult.getInt("maxRefCount");
        if (maxRefCount == -1)
            maxRefCount = Integer.MAX_VALUE;
        final int minIntervalLength = jsapResult.getInt("minIntervalLength");

        if (!jsapResult.userSpecified("function")) {
            throw new IllegalArgumentException("Function must be specified.");
        }
        final Object2LongFunction<byte[]> function = (Object2LongFunction<byte[]>) BinIO
                .loadObject(jsapResult.getString("function"));

        // Prefer the 64-bit size when the function provides one.
        long n = function instanceof Size64 ? ((Size64) function).size64() : function.size();

        File tempDir = null;
        if (jsapResult.userSpecified("tempDir")) {
            tempDir = new File(jsapResult.getString("tempDir"));
        }

        final ProgressLogger pl = new ProgressLogger(LOGGER, jsapResult.getLong("logInterval"), TimeUnit.MILLISECONDS);
        final int batchSize = jsapResult.getInt("batchSize");
        final int numThreads = jsapResult.getInt("numThreads");
        final ScatteredArcsORCGraph graph = new ScatteredArcsORCGraph(orcDataset, function, n, numThreads, batchSize,
                tempDir, pl);
        BVGraph.store(graph, basename, windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl);
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
index e55d8a4..f06ba59 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
@@ -1,273 +1,280 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.ints.IntBigArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.shorts.ShortBigArrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.maps.NodeIdMap;
import org.softwareheritage.graph.compress.ORCGraphDataset.*;
import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
/**
* This class is used to extract the node properties from the graph dataset, and write them to a set
* of property files.
*
* Note: because the nodes are not sorted by type, we have an incentive to minimize the number of
* "holes" in offset arrays. This is why many unrelated properties are cobbled together in the same
* files (e.g. commit messages, tag messages and origin URLs are all in a "message" property file).
* Once we migrate to a TypedImmutableGraph as the underlying storage of the graph, we can split all
* the different properties in their own files.
*/
public class WriteNodeProperties {
    final static Logger logger = LoggerFactory.getLogger(WriteNodeProperties.class);

    /** The ORC dataset the properties are read from. */
    private final ORCGraphDataset dataset;
    /** Basename of the compressed graph; used as the prefix of every output property file. */
    private final String graphBasename;
    /** Maps SWHIDs to node ids of the compressed graph. */
    private final NodeIdMap nodeIdMap;
    /** Total number of nodes in the graph; sizes all the property arrays. */
    private final long numNodes;

    public WriteNodeProperties(String dataset, String graphBasename, NodeIdMap nodeIdMap) {
        this.dataset = new ORCGraphDataset(dataset);
        this.graphBasename = graphBasename;
        this.nodeIdMap = nodeIdMap;
        this.numNodes = nodeIdMap.size64();
    }

    /** Names of all the supported property writers; used to validate the --properties option. */
    public static String[] PROPERTY_WRITERS = new String[]{"timestamps", "content_length", "content_is_skipped",
            "person_ids", "messages", "tag_names",};

    /**
     * Parses command-line arguments, printing usage and exiting on error.
     */
    private static JSAPResult parseArgs(String[] args) {
        JSAPResult config = null;
        try {
            // Fixed: the usage line previously advertised ComposePermutations.class.getName(),
            // copied over from another tool.
            SimpleJSAP jsap = new SimpleJSAP(WriteNodeProperties.class.getName(), "", new Parameter[]{
                    new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"),
                    new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "Basename of the output graph"),
                    new FlaggedOption("properties", JSAP.STRING_PARSER, "*", JSAP.NOT_REQUIRED, 'p', "properties",
                            "Properties to write, comma separated (default: all). Possible choices: "
                                    + String.join(",", PROPERTY_WRITERS)),});
            config = jsap.parse(args);
            if (jsap.messagePrinted()) {
                System.exit(1);
            }
        } catch (JSAPException e) {
            System.err.println("Usage error: " + e.getMessage());
            System.exit(1);
        }
        return config;
    }

    /**
     * Entry point: runs each requested property writer ("*" selects them all).
     */
    public static void main(String[] argv) throws IOException, ClassNotFoundException, NoSuchMethodException,
            InvocationTargetException, IllegalAccessException {
        JSAPResult args = parseArgs(argv);
        String dataset = args.getString("dataset");
        String graphBasename = args.getString("graphBasename");
        NodeIdMap nodeIdMap = new NodeIdMap(graphBasename);

        Set<String> properties;
        if (args.getString("properties").equals("*")) {
            properties = Set.of(PROPERTY_WRITERS);
        } else {
            properties = new HashSet<>(Arrays.asList(args.getString("properties").split(",")));
        }

        WriteNodeProperties writer = new WriteNodeProperties(dataset, graphBasename, nodeIdMap);
        if (properties.contains("timestamps")) {
            writer.writeTimestamps();
        }
        if (properties.contains("content_length")) {
            writer.writeContentLength();
        }
        if (properties.contains("content_is_skipped")) {
            writer.writeContentIsSkipped();
        }
        if (properties.contains("person_ids")) {
            writer.writePersonIds();
        }
        if (properties.contains("messages")) {
            writer.writeMessages();
        }
        if (properties.contains("tag_names")) {
            writer.writeTagNames();
        }
    }

    /**
     * Writes the length of every content object to a big long array (-1 for non-content nodes),
     * reading both the "content" and "skipped_content" tables.
     */
    public void writeContentLength() throws IOException {
        logger.info("Writing content lengths");
        long[][] valueArray = LongBigArrays.newBigArray(numNodes);
        BigArrays.fill(valueArray, -1);

        for (String tableName : new String[]{"content", "skipped_content"}) {
            SwhOrcTable table = dataset.getTable(tableName);
            if (table == null) {
                // Table absent from the dataset: leave the corresponding entries at -1.
                continue;
            }
            table.readLongColumn("length", (swhid, value) -> {
                long id = nodeIdMap.getNodeId(swhid);
                BigArrays.set(valueArray, id, value);
            });
        }

        BinIO.storeLongs(valueArray, graphBasename + ".property.content.length.bin");
    }

    /**
     * Writes a bit vector marking which content nodes are skipped contents.
     */
    public void writeContentIsSkipped() throws IOException {
        LongArrayBitVector isSkippedBitVector = LongArrayBitVector.ofLength(numNodes);
        SwhOrcTable table = dataset.getTable("skipped_content");
        if (table != null) {
            table.readIdColumn((swhid) -> {
                long id = nodeIdMap.getNodeId(swhid);
                isSkippedBitVector.set(id);
            });
        }
        BinIO.storeObject(isSkippedBitVector, graphBasename + ".property.content.is_skipped.bin");
    }

    /**
     * Writes author timestamps (release + revision) and committer timestamps (revision) with their
     * UTC offsets. Missing values are encoded as Long.MIN_VALUE / Short.MIN_VALUE.
     */
    public void writeTimestamps() throws IOException {
        logger.info("Writing author/committer timestamps for release + revision");
        SwhOrcTable releaseTable = dataset.getTable("release");
        SwhOrcTable revisionTable = dataset.getTable("revision");

        long[][] timestampArray = LongBigArrays.newBigArray(numNodes);
        short[][] timestampOffsetArray = ShortBigArrays.newBigArray(numNodes);

        // Author timestamps
        BigArrays.fill(timestampArray, Long.MIN_VALUE);
        BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
        releaseTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
            long id = nodeIdMap.getNodeId(swhid);
            BigArrays.set(timestampArray, id, date);
            BigArrays.set(timestampOffsetArray, id, dateOffset);
        });
        revisionTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
            long id = nodeIdMap.getNodeId(swhid);
            BigArrays.set(timestampArray, id, date);
            BigArrays.set(timestampOffsetArray, id, dateOffset);
        });
        BinIO.storeLongs(timestampArray, graphBasename + ".property.author_timestamp.bin");
        BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.author_timestamp_offset.bin");

        // Committer timestamps (the arrays are reused, so re-fill them first)
        BigArrays.fill(timestampArray, Long.MIN_VALUE);
        BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
        revisionTable.readTimestampColumn("committer_date", "committer_offset", (swhid, date, dateOffset) -> {
            long id = nodeIdMap.getNodeId(swhid);
            BigArrays.set(timestampArray, id, date);
            BigArrays.set(timestampOffsetArray, id, dateOffset);
        });
        BinIO.storeLongs(timestampArray, graphBasename + ".property.committer_timestamp.bin");
        BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.committer_timestamp_offset.bin");
    }

    /**
     * Writes the person id of the author (release + revision) and committer (revision) of each
     * node, translated through the persons MPH. Missing values are encoded as -1.
     */
    public void writePersonIds() throws IOException {
        logger.info("Writing author/committer IDs for release + revision");
        Object2LongFunction<byte[]> personIdMap = NodeIdMap.loadMph(graphBasename + ".persons.mph");
        SwhOrcTable releaseTable = dataset.getTable("release");
        SwhOrcTable revisionTable = dataset.getTable("revision");

        int[][] personArray = IntBigArrays.newBigArray(numNodes);

        // Author IDs
        BigArrays.fill(personArray, -1);
        releaseTable.readBytes64Column("author", (swhid, personBase64) -> {
            long id = nodeIdMap.getNodeId(swhid);
            BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
        });
        revisionTable.readBytes64Column("author", (swhid, personBase64) -> {
            long id = nodeIdMap.getNodeId(swhid);
            BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
        });
        BinIO.storeInts(personArray, graphBasename + ".property.author_id.bin");

        // Committer IDs (array reused, re-fill first)
        BigArrays.fill(personArray, -1);
        revisionTable.readBytes64Column("committer", (swhid, personBase64) -> {
            long id = nodeIdMap.getNodeId(swhid);
            BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
        });
        BinIO.storeInts(personArray, graphBasename + ".property.committer_id.bin");
    }

    /**
     * Writes release messages, revision messages and origin URLs to a single newline-separated
     * property file, together with the per-node byte offset of each entry (-1 when absent).
     */
    public void writeMessages() throws IOException {
        logger.info("Writing messages for release + revision, and URLs for origins");

        long[][] messageOffsetArray = LongBigArrays.newBigArray(numNodes);
        BigArrays.fill(messageOffsetArray, -1);

        // try-with-resources: the stream was previously leaked if one of the reads threw.
        try (FastBufferedOutputStream messageStream = new FastBufferedOutputStream(
                new FileOutputStream(graphBasename + ".property.message.bin"))) {
            AtomicLong offset = new AtomicLong(0L);

            SwhOrcTable releaseTable = dataset.getTable("release");
            releaseTable.readBytes64Column("message", (swhid, messageBase64) -> {
                long id = nodeIdMap.getNodeId(swhid);
                messageStream.write(messageBase64);
                messageStream.write('\n');
                BigArrays.set(messageOffsetArray, id, offset.longValue());
                offset.addAndGet(messageBase64.length + 1);
            });

            SwhOrcTable revisionTable = dataset.getTable("revision");
            revisionTable.readBytes64Column("message", (swhid, messageBase64) -> {
                long id = nodeIdMap.getNodeId(swhid);
                messageStream.write(messageBase64);
                messageStream.write('\n');
                BigArrays.set(messageOffsetArray, id, offset.longValue());
                offset.addAndGet(messageBase64.length + 1);
            });

            OriginOrcTable originTable = (OriginOrcTable) dataset.getTable("origin");
            originTable.readURLs((swhid, messageBase64) -> {
                long id = nodeIdMap.getNodeId(swhid);
                messageStream.write(messageBase64);
                messageStream.write('\n');
                BigArrays.set(messageOffsetArray, id, offset.longValue());
                offset.addAndGet(messageBase64.length + 1);
            });
        }

        // TODO: check which one is optimal in terms of memory/disk usage, EF vs mapped file
        BinIO.storeLongs(messageOffsetArray, graphBasename + ".property.message.offset.bin");
    }

    /**
     * Writes release tag names to a newline-separated property file, together with the per-node
     * byte offset of each entry (-1 when absent).
     */
    public void writeTagNames() throws IOException {
        logger.info("Writing tag names for release");

        long[][] tagNameOffsetArray = LongBigArrays.newBigArray(numNodes);
        BigArrays.fill(tagNameOffsetArray, -1);

        // try-with-resources: the stream was previously leaked if the read threw.
        try (FastBufferedOutputStream tagNameStream = new FastBufferedOutputStream(
                new FileOutputStream(graphBasename + ".property.tag_name.bin"))) {
            AtomicLong offset = new AtomicLong(0L);
            SwhOrcTable releaseTable = dataset.getTable("release");
            releaseTable.readBytes64Column("name", (swhid, tagNameBase64) -> {
                long id = nodeIdMap.getNodeId(swhid);
                tagNameStream.write(tagNameBase64);
                tagNameStream.write('\n');
                BigArrays.set(tagNameOffsetArray, id, offset.longValue());
                offset.addAndGet(tagNameBase64.length + 1);
            });
        }

        BinIO.storeLongs(tagNameOffsetArray, graphBasename + ".property.tag_name.offset.bin");
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java
index bd5459f..9352853 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java
@@ -1,249 +1,256 @@
+/*
+ * Copyright (c) 2019 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.forks;
import com.google.common.primitives.Longs;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.logging.ProgressLogger;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.Node;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class ForkCC {
public Boolean includeRootDir;
private SwhBidirectionalGraph graph;
private Long emptySnapshot;
private LongArrayBitVector visited;
private LongArrayBitVector whitelist;
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(ForkCC.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't',
"whitelist", "Whitelist of origins"),
new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, 'R',
"includerootdir", "Include root directory (default: false)"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private static void printDistribution(ArrayList> components) {
TreeMap distribution = new TreeMap<>();
for (ArrayList component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry entry : distribution.entrySet()) {
System.out.format("%d %d\n", entry.getKey(), entry.getValue());
}
}
private static void printLargestComponent(ArrayList> components) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList component = components.get(indexLargest);
for (Long node : component) {
System.out.println(node);
}
}
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
this.graph = SwhBidirectionalGraph.loadMapped(graphBasename).symmetrize();
System.err.println("Graph loaded.");
this.emptySnapshot = null;
this.whitelist = null;
this.visited = null;
this.includeRootDir = null;
}
private boolean nodeIsEmptySnapshot(Long node) {
if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP
&& this.graph.outdegree(node) == 0) {
System.err.println("Found empty snapshot: " + node);
this.emptySnapshot = node;
}
return node.equals(this.emptySnapshot);
}
private Boolean shouldVisit(Long node) {
Node.Type nt = this.graph.getNodeType(node);
if (nt == Node.Type.CNT) {
return false;
}
if (nt == Node.Type.DIR && !includeRootDir)
return false;
if (this.nodeIsEmptySnapshot(node))
return false;
if (visited.getBoolean(node))
return false;
return true;
}
private ArrayList> compute(ProgressLogger pl) throws IOException {
final long n = graph.numNodes();
// Allow enough memory to behave like in-memory queue
int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
// Use a disk based queue to store BFS frontier
final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue");
final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
final byte[] byteBuf = new byte[Long.BYTES];
// WARNING: no 64-bit version of this data-structure, but it can support
// indices up to 2^37
visited = LongArrayBitVector.ofLength(n);
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting connected components visit...");
ArrayList> components = new ArrayList<>();
for (long i = 0; i < n; i++) {
if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR)
continue;
ArrayList component = new ArrayList<>();
queue.enqueue(Longs.toByteArray(i));
visited.set(i);
while (!queue.isEmpty()) {
queue.dequeue(byteBuf);
final long currentNode = Longs.fromByteArray(byteBuf);
Node.Type cur_nt = this.graph.getNodeType(currentNode);
if (cur_nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(currentNode))) {
// TODO: add a check that the origin has >=1 non-empty snapshot
component.add(currentNode);
}
final LazyLongIterator iterator = graph.successors(currentNode);
long succ;
while ((succ = iterator.nextLong()) != -1) {
if (!shouldVisit(succ))
continue;
if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV)
continue;
visited.set(succ);
queue.enqueue(Longs.toByteArray(succ));
}
pl.update();
}
if (component.size() > 0) {
components.add(component);
}
}
pl.done();
queue.close();
return components;
}
private static void printDistribution(ArrayList> components, Formatter out) {
TreeMap distribution = new TreeMap<>();
for (ArrayList component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry entry : distribution.entrySet()) {
out.format("%d %d\n", entry.getKey(), entry.getValue());
}
}
private static void printLargestComponent(ArrayList> components, Formatter out) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList component = components.get(indexLargest);
for (Long node : component) {
out.format("%d\n", node);
}
}
private static void printAllComponents(ArrayList> components, Formatter out) {
for (int i = 1; i < components.size(); ++i) {
ArrayList component = components.get(i);
for (Long node : component) {
out.format("%d ", node);
}
out.format("\n");
}
}
private void parseWhitelist(String path) {
System.err.println("Loading whitelist " + path + " ...");
this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes());
Scanner scanner;
try {
scanner = new Scanner(new File(path));
while (scanner.hasNextLong()) {
whitelist.set(scanner.nextLong());
}
System.err.println("Whitelist loaded.");
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
String whitelistPath = config.getString("whitelistPath");
boolean includeRootDir = config.getBoolean("includeRootDir");
String outdirPath = config.getString("outdir");
ForkCC forkCc = new ForkCC();
try {
forkCc.load_graph(graphPath);
forkCc.includeRootDir = includeRootDir;
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
if (whitelistPath != null) {
forkCc.parseWhitelist(whitelistPath);
}
ProgressLogger logger = new ProgressLogger();
// noinspection ResultOfMethodCallIgnored
new File(outdirPath).mkdirs();
try {
ArrayList> components = forkCc.compute(logger);
printDistribution(components, new Formatter(outdirPath + "/distribution.txt"));
printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt"));
printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt"));
} catch (IOException e) {
e.printStackTrace();
}
logger.done();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java
index aa57ae6..361cce8 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java
@@ -1,223 +1,230 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.forks;
import ch.qos.logback.classic.Level;
import ch.qos.logback.classic.Logger;
import com.google.common.primitives.Longs;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.Node;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
public class ForkCliques {
private SwhBidirectionalGraph graph;
private LongArrayBitVector whitelist;
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
this.graph = SwhBidirectionalGraph.loadMapped(graphBasename);
System.err.println("Graph loaded.");
this.whitelist = null;
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(ForkCliques.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't',
"whitelist", "Whitelist of origins"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private ArrayList dfsAt(Long baseNode) {
ArrayList res = new ArrayList<>();
final Deque stack = new ArrayDeque<>();
HashSet seen = new HashSet<>();
stack.push(baseNode);
while (!stack.isEmpty()) {
final Long currentNode = stack.pop();
final LazyLongIterator iterator = this.graph.predecessors(currentNode);
long succ;
while ((succ = iterator.nextLong()) != -1) {
if (!seen.contains(succ)) {
Node.Type nt = this.graph.getNodeType(succ);
if (nt == Node.Type.DIR || nt == Node.Type.CNT)
continue;
if (nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(succ))) {
res.add(succ);
} else {
stack.push(succ);
seen.add(succ);
}
}
}
}
Collections.sort(res);
return res;
}
private boolean isBaseRevision(Long node) {
if (this.graph.getNodeType(node) != Node.Type.REV)
return false;
final LazyLongIterator iterator = this.graph.successors(node);
long succ;
while ((succ = iterator.nextLong()) != -1) {
if (this.graph.getNodeType(succ) == Node.Type.REV)
return false;
}
return true;
}
static private String fingerprint(ArrayList cluster) {
MessageDigest digest;
try {
digest = MessageDigest.getInstance("SHA-256");
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
return null;
}
for (Long n : cluster)
digest.update(Longs.toByteArray(n));
return new String(digest.digest());
}
private ArrayList> compute(ProgressLogger pl) {
final long n = this.graph.numNodes();
HashSet fingerprints = new HashSet<>();
ArrayList> clusters = new ArrayList<>();
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting topological sort...");
for (long i = 0; i < n; i++) {
if (isBaseRevision(i)) {
ArrayList currentCluster = dfsAt(i);
String clusterFp = fingerprint(currentCluster);
if (!fingerprints.contains(clusterFp)) {
fingerprints.add(clusterFp);
clusters.add(currentCluster);
}
}
pl.update();
}
pl.done();
return clusters;
}
private static void printDistribution(ArrayList> components, Formatter out) {
TreeMap distribution = new TreeMap<>();
for (ArrayList component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry entry : distribution.entrySet()) {
out.format("%d %d\n", entry.getKey(), entry.getValue());
}
}
private static void printLargestComponent(ArrayList> components, Formatter out) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList component = components.get(indexLargest);
for (Long node : component) {
out.format("%d\n", node);
}
}
private static void printAllComponents(ArrayList> components, Formatter out) {
for (int i = 1; i < components.size(); ++i) {
ArrayList component = components.get(i);
for (Long node : component) {
out.format("%d ", node);
}
out.format("\n");
}
}
/**
 * Loads a whitelist of node ids (one long per token) from {@code path} into
 * a bit vector sized to the graph. On a missing file the whitelist is left
 * all-zero and the error is printed.
 *
 * @param path path to a whitespace-separated list of node ids
 */
private void parseWhitelist(String path) {
    System.err.println("Loading whitelist " + path + " ...");
    this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes());
    // try-with-resources: the original never closed the Scanner (file
    // descriptor leak).
    try (Scanner scanner = new Scanner(new File(path))) {
        while (scanner.hasNextLong()) {
            whitelist.set(scanner.nextLong());
        }
        System.err.println("Whitelist loaded.");
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
/**
 * Entry point: loads the graph (and optional whitelist), computes fork
 * cliques, and writes the distribution, the largest clique, and all cliques
 * under the output directory.
 *
 * Fix: the three Formatters were never closed, so buffered output could be
 * lost on exit; they are now managed by try-with-resources (Formatter.close
 * is idempotent). Stripped generics restored.
 */
public static void main(String[] args) {
    JSAPResult config = parse_args(args);
    String graphPath = config.getString("graphPath");
    String whitelistPath = config.getString("whitelistPath");
    String outdirPath = config.getString("outdir");
    ForkCliques forkCliques = new ForkCliques();
    try {
        forkCliques.load_graph(graphPath);
    } catch (IOException e) {
        System.out.println("Could not load graph: " + e);
        System.exit(2);
    }
    if (whitelistPath != null) {
        forkCliques.parseWhitelist(whitelistPath);
    }
    Logger rootLogger = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
    rootLogger.setLevel(Level.DEBUG);
    ProgressLogger logger = new ProgressLogger(rootLogger);
    ArrayList<ArrayList<Long>> components = forkCliques.compute(logger);
    // noinspection ResultOfMethodCallIgnored
    new File(outdirPath).mkdirs();
    try (Formatter distribution = new Formatter(outdirPath + "/distribution.txt");
            Formatter largest = new Formatter(outdirPath + "/largest_clique.txt");
            Formatter all = new Formatter(outdirPath + "/all_cliques.txt")) {
        printDistribution(components, distribution);
        printLargestComponent(components, largest);
        printAllComponents(components, all);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
    logger.done();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
index 8389962..5067c28 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
@@ -1,88 +1,95 @@
+/*
+ * Copyright (c) 2019 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.forks;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.Node;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Command-line experiment listing the ids of origin (ORI) nodes all of whose
 * successors have outdegree 0, i.e. origins that point only at empty
 * snapshots. One node id is printed per line on stdout.
 * (Stripped generic type parameters restored throughout.)
 */
public class ListEmptyOrigins {
    private SwhBidirectionalGraph graph;
    // Lazily discovered id of an empty snapshot node (SNP with outdegree 0).
    private Long emptySnapshot;

    /**
     * Parses command-line arguments; exits with status 1 on usage errors.
     *
     * @param args raw command-line arguments
     * @return the parsed JSAP configuration (null only if JSAP itself threw)
     */
    private static JSAPResult parse_args(String[] args) {
        JSAPResult config = null;
        try {
            SimpleJSAP jsap = new SimpleJSAP(ListEmptyOrigins.class.getName(), "",
                    new Parameter[]{new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT,
                            JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"),});
            config = jsap.parse(args);
            if (jsap.messagePrinted()) {
                System.exit(1);
            }
        } catch (JSAPException e) {
            e.printStackTrace();
        }
        return config;
    }

    public static void main(String[] args) {
        JSAPResult config = parse_args(args);
        String graphPath = config.getString("graphPath");
        ListEmptyOrigins leo = new ListEmptyOrigins();
        try {
            leo.load_graph(graphPath);
        } catch (IOException e) {
            System.out.println("Could not load graph: " + e);
            System.exit(2);
        }
        ArrayList<Long> badlist = leo.compute(leo.graph);
        for (Long bad : badlist) {
            System.out.println(bad);
        }
    }

    private void load_graph(String graphBasename) throws IOException {
        System.err.println("Loading graph " + graphBasename + " ...");
        this.graph = SwhBidirectionalGraph.loadMapped(graphBasename);
        System.err.println("Graph loaded.");
        this.emptySnapshot = null;
    }

    /**
     * Returns true iff {@code node} is the cached empty snapshot: the first
     * SNP node observed with outdegree 0 is remembered and compared against.
     * NOTE(review): prints a debug line to stderr on every call; consider
     * removing for large runs.
     */
    private boolean nodeIsEmptySnapshot(Long node) {
        System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node);
        if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP
                && this.graph.outdegree(node) == 0) {
            System.err.println("Found empty snapshot: " + node);
            this.emptySnapshot = node;
        }
        return node.equals(this.emptySnapshot);
    }

    /**
     * Scans every ORI node and collects those with no successor of positive
     * outdegree.
     *
     * @param graph the (forward) graph whose successors are scanned
     * @return ids of "empty" origin nodes
     */
    private ArrayList<Long> compute(ImmutableGraph graph) {
        final long n = graph.numNodes();
        ArrayList<Long> bad = new ArrayList<>();
        for (long i = 0; i < n; i++) {
            Node.Type nt = this.graph.getNodeType(i);
            if (nt != Node.Type.ORI)
                continue;
            final LazyLongIterator iterator = graph.successors(i);
            long succ;
            boolean found = false;
            // Early exit: a single non-empty successor clears this origin;
            // the original kept scanning (and calling outdegree) needlessly.
            while (!found && (succ = iterator.nextLong()) != -1) {
                if (this.graph.outdegree(succ) > 0) {
                    found = true;
                }
            }
            if (!found)
                bad.add(i);
        }
        return bad;
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
index 53bcc49..dd8d203 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
@@ -1,188 +1,195 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.topology;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.XoRoShiRo128PlusRandom;
import org.softwareheritage.graph.*;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import java.util.concurrent.*;
/**
 * Experiment measuring the distribution of path lengths in a node-type
 * restricted subgraph: from every "leaf" (node with outdegree 0 in the
 * subgraph), a backward BFS over predecessors records, for each "root"
 * (node with indegree 0) reached, the BFS distance at which it was found.
 * Results accumulate in a concurrent histogram: distance -> count.
 */
public class AveragePaths {
private final SwhBidirectionalGraph graph;
private final Subgraph subgraph;
// Histogram: BFS distance -> number of indegree-0 nodes found at that distance.
private final ConcurrentHashMap result;
private final String outdir;
/**
 * Loads the graph and restricts it to the allowed node types.
 *
 * @param graphBasename basename of the compressed graph
 * @param allowedNodes node-type constraint string for the subgraph
 * @param outdir directory where results are written
 * @throws IOException if the graph cannot be loaded
 */
public AveragePaths(String graphBasename, String allowedNodes, String outdir) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
this.graph = SwhBidirectionalGraph.loadMapped(graphBasename);
this.subgraph = new Subgraph(this.graph, new AllowedNodes(allowedNodes));
this.outdir = outdir;
System.err.println("Graph loaded.");
result = new ConcurrentHashMap<>();
}
// Parses command-line options; exits with status 1 on usage errors.
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(AveragePaths.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("nodeTypes", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 's',
"nodetypes", "Node type constraints"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),
new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "32", JSAP.NOT_REQUIRED, 't',
"numthreads", "Number of threads"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
/**
 * Producer/consumer driver: one producer thread enqueues every subgraph
 * leaf (outdegree 0) in shuffled order; numThreads workers dequeue nodes
 * and run bfsAt on each. The producer finishes by enqueueing one
 * END_OF_QUEUE sentinel per worker so every worker terminates.
 *
 * @param numThreads number of worker threads
 * @throws InterruptedException if awaitTermination is interrupted
 */
private void run(int numThreads) throws InterruptedException {
final long END_OF_QUEUE = -1L;
ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads);
ExecutorService service = Executors.newFixedThreadPool(numThreads + 1);
service.submit(() -> {
try {
// Thread-local copies: graph/subgraph iteration state is not shareable.
SwhBidirectionalGraph thread_graph = graph.copy();
Subgraph thread_subgraph = subgraph.copy();
long[][] randomPerm = Util.identity(thread_graph.numNodes());
LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom());
long n = thread_graph.numNodes();
ProgressLogger pl = new ProgressLogger();
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Filling processor queue...");
for (long j = 0; j < n; ++j) {
long node = BigArrays.get(randomPerm, j);
if (thread_subgraph.nodeExists(node) && thread_subgraph.outdegree(node) == 0) {
queue.put(node);
}
// Periodically dump a partial snapshot of the histogram.
if (j % 10000 == 0) {
printResult();
}
pl.update();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
// One sentinel per worker guarantees all workers exit their loop.
for (int i = 0; i < numThreads; ++i) {
try {
queue.put(END_OF_QUEUE);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
});
for (int i = 0; i < numThreads; ++i) {
service.submit(() -> {
try {
Subgraph thread_subgraph = subgraph.copy();
while (true) {
Long node = null;
try {
node = queue.take();
} catch (InterruptedException e) {
e.printStackTrace();
}
if (node == null || node == END_OF_QUEUE) {
return;
}
bfsAt(thread_subgraph, node);
}
} catch (Exception e) {
e.printStackTrace();
}
});
}
service.shutdown();
service.awaitTermination(365, TimeUnit.DAYS);
}
/**
 * Layered BFS over predecessors starting at srcNodeId. A FRONTIER_MARKER
 * sentinel separates BFS layers so the current distance can be tracked;
 * whenever an indegree-0 node ("root") is reached, the histogram entry for
 * the current distance is incremented.
 *
 * @param graph thread-local subgraph to traverse
 * @param srcNodeId leaf node the BFS starts from
 */
private void bfsAt(Subgraph graph, long srcNodeId) {
ArrayDeque queue = new ArrayDeque<>();
HashSet visited = new HashSet<>();
long FRONTIER_MARKER = -1;
queue.addLast(srcNodeId);
visited.add(srcNodeId);
long distance = 0;
queue.addLast(FRONTIER_MARKER);
while (!queue.isEmpty()) {
long currentNodeId = queue.removeFirst();
// System.err.println("curr: " + currentNodeId);
if (currentNodeId == FRONTIER_MARKER) {
if (queue.isEmpty()) // avoid infinite loops
break;
++distance;
queue.addLast(FRONTIER_MARKER);
continue;
}
if (graph.indegree(currentNodeId) == 0) {
result.merge(distance, 1L, Long::sum);
}
LazyLongIterator it = graph.predecessors(currentNodeId);
for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
if (!visited.contains(neighborNodeId)) {
queue.addLast(neighborNodeId);
visited.add(neighborNodeId);
}
}
}
}
/**
 * Writes the current (possibly partial) distance histogram, sorted by
 * distance, to outdir/distribution.txt. Called concurrently by the
 * producer while workers still run, so the snapshot is best-effort.
 *
 * @throws IOException if the output file cannot be written
 */
public void printResult() throws IOException {
new File(outdir).mkdirs();
PrintWriter f = new PrintWriter(new FileWriter(outdir + "/distribution.txt"));
TreeMap sortedDistribution = new TreeMap<>(result);
for (Map.Entry entry : sortedDistribution.entrySet()) {
f.println(entry.getKey() + " " + entry.getValue());
}
f.close();
}
public static void main(String[] args) throws IOException, InterruptedException {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
String outdir = config.getString("outdir");
String allowedNodes = config.getString("nodeTypes");
int numThreads = config.getInt("numThreads");
AveragePaths tp = new AveragePaths(graphPath, allowedNodes, outdir);
tp.run(numThreads);
tp.printResult();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java
index 0564463..9195560 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java
@@ -1,325 +1,332 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.topology;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.XoRoShiRo128PlusRandom;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.Node;
import java.io.*;
import java.util.*;
import java.util.concurrent.*;
/**
 * Experiment computing distributions of per-node triangle counts over the
 * symmetrized Software Heritage graph, split by node-type layer (full graph,
 * dir/cnt, rev only, rev/rel, ori/snp). Each histogram maps a triangle count
 * to the number of nodes with that count.
 */
public class ClusteringCoefficient {
private final SwhBidirectionalGraph graph;
private final String outdirPath;
// Histograms: triangle count -> number of nodes, one per node-type layer.
private final ConcurrentHashMap result_full;
private final ConcurrentHashMap result_dircnt;
private final ConcurrentHashMap result_rev;
private final ConcurrentHashMap result_revrel;
private final ConcurrentHashMap result_orisnp;
/**
 * Loads the graph and symmetrizes it (triangles are counted on the
 * undirected version).
 *
 * @param graphBasename basename of the compressed graph
 * @param outdirPath directory where results are written
 * @throws IOException if the graph cannot be loaded
 */
public ClusteringCoefficient(String graphBasename, String outdirPath) throws IOException {
this.outdirPath = outdirPath;
System.err.println("Loading graph " + graphBasename + " ...");
SwhBidirectionalGraph directedGraph = SwhBidirectionalGraph.loadMapped(graphBasename);
this.graph = directedGraph.symmetrize();
System.err.println("Graph loaded.");
result_full = new ConcurrentHashMap<>();
result_dircnt = new ConcurrentHashMap<>();
result_rev = new ConcurrentHashMap<>();
result_revrel = new ConcurrentHashMap<>();
result_orisnp = new ConcurrentHashMap<>();
}
// Parses command-line options; exits with status 1 on usage errors.
// NOTE(review): the program name passed to SimpleJSAP is
// AveragePaths.class.getName() — looks like a copy-paste slip; should
// presumably be ClusteringCoefficient.class.getName(). Affects only usage
// messages, not behavior.
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(AveragePaths.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),
new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "32", JSAP.NOT_REQUIRED, 't',
"numthreads", "Number of threads"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
/**
 * Producer/consumer driver: one producer enqueues every node in shuffled
 * order; numThreads workers run computeAt on each node. The producer ends
 * by enqueueing one END_OF_QUEUE sentinel per worker.
 *
 * @param numThreads number of worker threads
 * @throws InterruptedException if awaitTermination is interrupted
 */
private void run(int numThreads) throws InterruptedException {
final long END_OF_QUEUE = -1L;
ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads);
ExecutorService service = Executors.newFixedThreadPool(numThreads + 1);
service.submit(() -> {
try {
// Thread-local copy: graph iteration state is not shareable.
SwhBidirectionalGraph thread_graph = graph.copy();
long[][] randomPerm = Util.identity(thread_graph.numNodes());
LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom());
long n = thread_graph.numNodes();
ProgressLogger pl = new ProgressLogger();
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Filling processor queue...");
for (long j = 0; j < n; ++j) {
long node = BigArrays.get(randomPerm, j);
queue.put(node);
// Periodically dump partial snapshots of all histograms.
if (j % 10000 == 0) {
printResult();
}
pl.update();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
// One sentinel per worker guarantees all workers exit their loop.
for (int i = 0; i < numThreads; ++i) {
try {
queue.put(END_OF_QUEUE);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
});
for (int i = 0; i < numThreads; ++i) {
service.submit(() -> {
try {
SwhBidirectionalGraph thread_graph = graph.copy();
while (true) {
Long node = null;
try {
node = queue.take();
} catch (InterruptedException e) {
e.printStackTrace();
}
if (node == null || node == END_OF_QUEUE) {
return;
}
computeAt(thread_graph, node);
}
} catch (Exception e) {
e.printStackTrace();
}
});
}
service.shutdown();
service.awaitTermination(365, TimeUnit.DAYS);
}
/**
 * Counts, for one node, the edges between pairs of its neighbors (closed
 * triplets through the node), overall and per node-type layer, and folds
 * the counts into the shared histograms.
 * NOTE(review): each neighbor pair (u, v) is counted once per successor
 * edge u->v found, so a triangle can contribute more than 1 to the count —
 * confirm the intended normalization before comparing across datasets.
 *
 * @param graph thread-local symmetrized graph
 * @param node the node whose neighborhood is examined (skipped if degree < 2)
 */
private void computeAt(SwhBidirectionalGraph graph, long node) {
long d = graph.outdegree(node);
if (d < 2) {
return;
}
Node.Type nodeType = graph.getNodeType(node);
HashSet neighborhood = new HashSet<>();
long succ;
final LazyLongIterator iterator = graph.successors(node);
while ((succ = iterator.nextLong()) != -1) {
neighborhood.add(succ);
}
long triangles_full = 0;
long triangles_dircnt = 0;
long triangles_rev = 0;
long triangles_revrel = 0;
long triangles_orisnp = 0;
for (Long neighbor : neighborhood) {
Node.Type neighborNodeType = graph.getNodeType(neighbor);
final LazyLongIterator it = graph.successors(neighbor);
while ((succ = it.nextLong()) != -1) {
// succ is both a successor of `neighbor` and a neighbor of `node`:
// the triplet (node, neighbor, succ) is closed.
if (neighborhood.contains(succ)) {
Node.Type succNodeType = graph.getNodeType(succ);
triangles_full++;
if ((nodeType == Node.Type.DIR || nodeType == Node.Type.CNT)
&& (neighborNodeType == Node.Type.DIR || neighborNodeType == Node.Type.CNT)
&& (succNodeType == Node.Type.DIR || succNodeType == Node.Type.CNT)) {
triangles_dircnt++;
} else if ((nodeType == Node.Type.REV || nodeType == Node.Type.REL)
&& (neighborNodeType == Node.Type.REV || neighborNodeType == Node.Type.REL)
&& (succNodeType == Node.Type.REV || succNodeType == Node.Type.REL)) {
triangles_revrel++;
if (nodeType == Node.Type.REV && neighborNodeType == Node.Type.REV
&& succNodeType == Node.Type.REV)
triangles_rev++;
} else if ((nodeType == Node.Type.ORI || nodeType == Node.Type.SNP)
&& (neighborNodeType == Node.Type.ORI || neighborNodeType == Node.Type.SNP)
&& (succNodeType == Node.Type.ORI || succNodeType == Node.Type.SNP)) {
triangles_orisnp++;
}
}
}
}
result_full.merge(triangles_full, 1L, Long::sum);
result_dircnt.merge(triangles_dircnt, 1L, Long::sum);
result_rev.merge(triangles_rev, 1L, Long::sum);
result_revrel.merge(triangles_revrel, 1L, Long::sum);
result_orisnp.merge(triangles_orisnp, 1L, Long::sum);
}
/**
 * Writes one histogram to {@code distribPath}, sorted by key, one
 * "key value" pair per line.
 *
 * @param distribPath destination file path
 * @param distrib histogram to dump
 * @throws IOException if the file cannot be written
 */
public void printSortedDistribution(String distribPath, Map distrib) throws IOException {
PrintWriter f = new PrintWriter(new FileWriter(distribPath));
TreeMap sortedDistribution = new TreeMap<>(distrib);
for (Map.Entry entry : sortedDistribution.entrySet()) {
f.println(entry.getKey() + " " + entry.getValue());
}
f.close();
}
// Dumps all five layer histograms under outdirPath.
public void printResult() throws IOException {
new File(outdirPath).mkdirs();
printSortedDistribution(outdirPath + "/distribution-full.txt", result_full);
printSortedDistribution(outdirPath + "/distribution-dircnt.txt", result_dircnt);
printSortedDistribution(outdirPath + "/distribution-rev.txt", result_rev);
printSortedDistribution(outdirPath + "/distribution-relrev.txt", result_revrel);
printSortedDistribution(outdirPath + "/distribution-orisnp.txt", result_orisnp);
}
public static void main(String[] args) throws IOException, InterruptedException {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
String outdir = config.getString("outdir");
int numThreads = config.getInt("numThreads");
ClusteringCoefficient cc = new ClusteringCoefficient(graphPath, outdir);
cc.run(numThreads);
cc.printResult();
}
// Old unused functions
/**
 * Counts edges between pairs of successors of {@code node} (closed
 * triplets). Sequential predecessor of computeAt; kept for reference.
 */
private long oppositeEdges(ImmutableGraph graph, long node) {
HashSet neighborhood = new HashSet<>();
long succ;
final LazyLongIterator iterator = graph.successors(node);
while ((succ = iterator.nextLong()) != -1) {
neighborhood.add(succ);
}
long closed_triplets = 0;
for (Long neighbor : neighborhood) {
final LazyLongIterator it = graph.successors(neighbor);
while ((succ = it.nextLong()) != -1) {
if (neighborhood.contains(succ)) {
closed_triplets++;
}
}
}
return closed_triplets;
}
/**
 * Sequential exact local-clustering-coefficient computation (unused).
 * Writes the per-node lcc distribution to out_local and aggregate
 * coefficients to out_global.
 * NOTE(review): cum_lcc_c0 is never incremented, so "C0" is always 0;
 * nodes with degree < 2 are folded into cum_lcc_c1 instead.
 */
public void compute(ProgressLogger pl, Formatter out_local, Formatter out_global) {
final long n = this.graph.numNodes();
pl.expectedUpdates = n;
pl.itemsName = "nodes";
long nodes_d2 = 0;
double cum_lcc = 0;
double cum_lcc_c0 = 0;
double cum_lcc_c1 = 0;
HashMap distribution = new HashMap<>();
for (long node = 0; node < n; node++) {
long d = graph.outdegree(node);
if (d >= 2) {
double t = (d * (d - 1));
double m = oppositeEdges(graph, node);
double lcc = m / t;
distribution.merge(lcc, 1L, Long::sum);
cum_lcc += lcc;
nodes_d2++;
} else {
cum_lcc_c1++;
}
pl.update();
}
pl.done();
for (Map.Entry entry : distribution.entrySet()) {
out_local.format("%f %d\n", entry.getKey(), entry.getValue());
}
double gC = cum_lcc / nodes_d2;
double gC0 = cum_lcc_c0 / n;
double gC1 = cum_lcc_c1 / n;
out_global.format("C: %f\n", gC);
out_global.format("C0: %f\n", gC0);
out_global.format("C1: %f\n", gC1);
}
/**
 * Monte-Carlo estimate of the clustering coefficient (unused): repeatedly
 * samples a node and two distinct random successors, and checks whether
 * they are connected. Runs forever, printing running estimates.
 * NOTE(review): `trials % 100 == 0 || true` is always true, so a line is
 * printed on every trial, and the while(true) loop never terminates.
 */
public void compute_approx(Formatter out_global) {
final long n = this.graph.numNodes();
long trials = 0;
long triangles = 0;
while (true) {
long node = ThreadLocalRandom.current().nextLong(0, n);
long d = graph.outdegree(node);
if (d >= 2) {
Long u = null;
Long v = null;
long u_idx = ThreadLocalRandom.current().nextLong(0, d);
long v_idx = ThreadLocalRandom.current().nextLong(0, d - 1);
// Shift v_idx so the two sampled successor indices are distinct.
if (v_idx >= u_idx) {
v_idx++;
}
long succ;
final LazyLongIterator node_iterator = graph.successors(node);
for (long succ_idx = 0; (succ = node_iterator.nextLong()) != -1; succ_idx++) {
if (succ_idx == u_idx) {
u = succ;
}
if (succ_idx == v_idx) {
v = succ;
}
}
final LazyLongIterator u_iterator = graph.successors(u);
while ((succ = u_iterator.nextLong()) != -1) {
if (succ == v)
triangles++;
}
}
trials++;
if (trials % 100 == 0 || true) {
double gC = (double) triangles / (double) trials;
out_global.format("C: %f (triangles: %d, trials: %d)\n", gC, triangles, trials);
System.out.format("C: %f (triangles: %d, trials: %d)\n", gC, triangles, trials);
}
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java
index b351869..b6f6072 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java
@@ -1,200 +1,207 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.topology;
import com.google.common.primitives.Longs;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.logging.ProgressLogger;
import org.softwareheritage.graph.AllowedNodes;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.Subgraph;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
/**
 * Computes the distribution of connected-component sizes of the symmetrized
 * graph, restricted to a set of allowed node types. Sizes can optionally be
 * measured as the number of ORI nodes per component instead of all nodes.
 */
public class ConnectedComponents {
private Subgraph graph;
// Loads the graph, symmetrizes it, and restricts it to the allowed node types.
private void load_graph(String graphBasename, String nodeTypes) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
var underlyingGraph = SwhBidirectionalGraph.loadMapped(graphBasename);
var underlyingGraphSym = underlyingGraph.symmetrize();
graph = new Subgraph(underlyingGraphSym, new AllowedNodes(nodeTypes));
System.err.println("Graph loaded.");
}
// Parses command-line options; exits with status 1 on usage errors.
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(ConnectedComponents.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),
new Switch("byOrigins", JSAP.NO_SHORTFLAG, "by-origins",
"Compute size of components by number of origins"),
new FlaggedOption("nodeTypes", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n',
"nodetypes", "Allowed node types (comma-separated)"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
/**
 * BFS over every unvisited node of the subgraph, using a disk-backed byte
 * queue as the BFS frontier so the frontier is not bounded by heap size.
 *
 * @param pl progress logger updated once per visited node
 * @param byOrigin if true, a component's size counts only its ORI nodes
 * @return histogram: component size -> number of components of that size
 * @throws IOException if the temporary queue file cannot be created
 */
private HashMap /* ArrayList> */ compute(ProgressLogger pl, boolean byOrigin)
throws IOException {
final long n = graph.numNodes();
final long maxN = graph.maxNodeNumber();
// Allow enough memory to behave like in-memory queue
int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * maxN);
// Use a disk based queue to store BFS frontier
final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue");
final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
// Scratch buffer for one node id (queue stores raw big-endian longs).
final byte[] byteBuf = new byte[Long.BYTES];
// WARNING: no 64-bit version of this data-structure, but it can support
// indices up to 2^37
LongArrayBitVector visited = LongArrayBitVector.ofLength(maxN);
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting connected components visit...");
// ArrayList> components = new ArrayList<>();
HashMap componentDistribution = new HashMap<>();
var it = graph.nodeIterator();
while (it.hasNext()) {
long i = it.nextLong();
if (visited.getBoolean(i))
continue;
// ArrayList component = new ArrayList<>();
long componentNodes = 0;
queue.enqueue(Longs.toByteArray(i));
visited.set(i);
while (!queue.isEmpty()) {
queue.dequeue(byteBuf);
final long currentNode = Longs.fromByteArray(byteBuf);
// component.add(currentNode);
if (!byOrigin || graph.getNodeType(currentNode) == Node.Type.ORI)
componentNodes += 1;
final LazyLongIterator iterator = graph.successors(currentNode);
long succ;
while ((succ = iterator.nextLong()) != -1) {
if (visited.getBoolean(succ))
continue;
visited.set(succ);
queue.enqueue(Longs.toByteArray(succ));
}
pl.update();
}
/*
* if (component.size() > 0) { components.add(component); }
*/
if (componentNodes > 0)
componentDistribution.merge(componentNodes, 1L, Long::sum);
}
pl.done();
// return components;
return componentDistribution;
}
// Writes "size count" per line, sorted by size. Currently unused (the
// explicit-component code path in main is commented out).
private static void printDistribution(ArrayList> components, Formatter out) {
TreeMap distribution = new TreeMap<>();
for (ArrayList component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry entry : distribution.entrySet()) {
out.format("%d %d\n", entry.getKey(), entry.getValue());
}
out.close();
}
// Writes the node ids of the largest component, one per line. Unused.
private static void printLargestComponent(ArrayList> components, Formatter out) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList component = components.get(indexLargest);
for (Long node : component) {
out.format("%d\n", node);
}
out.close();
}
// Writes every component on its own line. Unused.
// NOTE(review): the loop starts at i = 1 and therefore skips
// components.get(0) — looks like an off-by-one copied from
// printLargestComponent; harmless today since all callers are commented out.
private static void printAllComponents(ArrayList> components, Formatter out) {
for (int i = 1; i < components.size(); ++i) {
ArrayList component = components.get(i);
for (Long node : component) {
out.format("%d ", node);
}
out.format("\n");
}
out.close();
}
/**
 * Entry point: loads and symmetrizes the graph, computes the component-size
 * distribution, and writes it to outdir/distribution.txt.
 */
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
String outdirPath = config.getString("outdir");
String nodeTypes = config.getString("nodeTypes");
boolean byOrigin = config.getBoolean("byOrigins");
ConnectedComponents connectedComponents = new ConnectedComponents();
try {
connectedComponents.load_graph(graphPath, nodeTypes);
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
ProgressLogger logger = new ProgressLogger();
// noinspection ResultOfMethodCallIgnored
new File(outdirPath).mkdirs();
try {
// ArrayList> components = connectedComponents.compute(logger);
// components.sort(Comparator.comparing(ArrayList::size).reversed());
// printDistribution(components, new Formatter(outdirPath + "/distribution.txt"));
// printLargestComponent(components, new Formatter(outdirPath + "/largest_component.txt"));
// printAllComponents(components, new Formatter(outdirPath + "/all_components.txt"));
HashMap componentDistribution = connectedComponents.compute(logger, byOrigin);
PrintWriter f = new PrintWriter(new FileWriter(outdirPath + "/distribution.txt"));
TreeMap sortedDistribution = new TreeMap<>(componentDistribution);
for (Map.Entry entry : sortedDistribution.entrySet()) {
f.println(entry.getKey() + " " + entry.getValue());
}
f.close();
} catch (IOException e) {
e.printStackTrace();
}
logger.done();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java
index a74a31b..54e53cb 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java
@@ -1,239 +1,246 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.topology;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.logging.ProgressLogger;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.Node;
public class InOutDegree {
// Utility class: private constructor prevents instantiation.
private InOutDegree() {
}
// One counter slot per node type, plus one extra slot for the total.
private static final int NODE_ARRAY_SIZE = Node.Type.values().length + 1;
// Index of the "all types" aggregate counter in per-type degree arrays.
private static final int TYPE_ALL = Node.Type.values().length;
// Per-type indices into the degree arrays returned by in/outdegreeTypes.
private static final int TYPE_CNT = Node.Type.toInt(Node.Type.CNT);
private static final int TYPE_DIR = Node.Type.toInt(Node.Type.DIR);
private static final int TYPE_REV = Node.Type.toInt(Node.Type.REV);
private static final int TYPE_REL = Node.Type.toInt(Node.Type.REL);
private static final int TYPE_SNP = Node.Type.toInt(Node.Type.SNP);
private static final int TYPE_ORI = Node.Type.toInt(Node.Type.ORI);
/**
 * Counts the successors of {@code node}, broken down by node type.
 *
 * @param graph the graph to query
 * @param node the node whose outgoing edges are counted
 * @return an array with one count per node type, plus the grand total in
 *         the TYPE_ALL slot
 */
public static long[] outdegreeTypes(final SwhBidirectionalGraph graph, long node) {
    final long[] counts = new long[NODE_ARRAY_SIZE];
    final var it = graph.successors(node);
    for (long succ = it.nextLong(); succ != -1; succ = it.nextLong()) {
        counts[Node.Type.toInt(graph.getNodeType(succ))]++;
        counts[TYPE_ALL]++;
    }
    return counts;
}
/**
 * Counts the predecessors of {@code node}, broken down by node type.
 * Indegrees are simply outdegrees on the transposed graph.
 *
 * @param graph the graph to query
 * @param node the node whose incoming edges are counted
 * @return an array with one count per node type, plus the total in TYPE_ALL
 */
public static long[] indegreeTypes(final SwhBidirectionalGraph graph, long node) {
    var transposed = graph.transpose();
    return outdegreeTypes(transposed, node);
}
/**
 * Writes a degree histogram to {@code filename}, one "degree count" pair per
 * line, sorted by degree. (Stripped generic type parameters restored.)
 *
 * @param distribution histogram mapping degree to number of nodes
 * @param filename destination file path
 * @throws IOException if the file cannot be written
 */
public static void writeDistribution(HashMap<Long, Long> distribution, String filename) throws IOException {
    // try-with-resources: the original leaked the PrintWriter if a write threw.
    try (PrintWriter f = new PrintWriter(new FileWriter(filename))) {
        TreeMap<Long, Long> sortedDistribution = new TreeMap<>(distribution);
        for (Map.Entry<Long, Long> entry : sortedDistribution.entrySet()) {
            f.println(entry.getKey() + " " + entry.getValue());
        }
    }
}
/**
 * Computes all per-type and per-layer in/out degree distributions of the graph
 * and writes each one as a text file ("degree count" per line) under
 * {@code resultsDir}.
 *
 * @param graph the bidirectional graph to scan
 * @param resultsDir output directory (created if missing)
 * @throws IOException if a distribution file cannot be written
 */
public static void run(final SwhBidirectionalGraph graph, String resultsDir) throws IOException {
    // Per-type distributions: degree value -> number of nodes with that degree.
    // (Generics restored: the raw HashMaps caused unchecked operations.)
    var cnt_in_dir = new HashMap<Long, Long>();
    var dir_in_dir = new HashMap<Long, Long>();
    var dir_in_rev = new HashMap<Long, Long>();
    var dir_in_all = new HashMap<Long, Long>();
    var dir_out_all = new HashMap<Long, Long>();
    var dir_out_dir = new HashMap<Long, Long>();
    var dir_out_cnt = new HashMap<Long, Long>();
    var dir_out_rev = new HashMap<Long, Long>();
    var rev_in_dir = new HashMap<Long, Long>();
    var rev_in_rel = new HashMap<Long, Long>();
    var rev_in_rev = new HashMap<Long, Long>();
    var rev_in_snp = new HashMap<Long, Long>();
    var rev_in_all = new HashMap<Long, Long>();
    var rev_out_rev = new HashMap<Long, Long>();
    var rel_in_snp = new HashMap<Long, Long>();
    var snp_in_ori = new HashMap<Long, Long>();
    var snp_out_all = new HashMap<Long, Long>();
    var snp_out_rel = new HashMap<Long, Long>();
    var snp_out_rev = new HashMap<Long, Long>();
    var ori_out_snp = new HashMap<Long, Long>();

    // Aggregated per layer
    var full_in = new HashMap<Long, Long>();
    var full_out = new HashMap<Long, Long>();
    var dircnt_in = new HashMap<Long, Long>();
    var dircnt_out = new HashMap<Long, Long>();
    var orisnp_in = new HashMap<Long, Long>();
    var orisnp_out = new HashMap<Long, Long>();
    var relrev_in = new HashMap<Long, Long>();
    var relrev_out = new HashMap<Long, Long>();
    var rev_in = rev_in_rev; // alias for single-type layer
    var rev_out = rev_out_rev;

    final ProgressLogger pl = new ProgressLogger();
    pl.itemsName = "nodes";
    pl.expectedUpdates = graph.numNodes();
    pl.start("Scanning...");

    long[] in;
    long[] out;
    for (long i = graph.numNodes(); i-- != 0;) {
        long d_in = graph.indegree(i);
        long d_out = graph.outdegree(i);
        full_in.merge(d_in, 1L, Long::sum);
        full_out.merge(d_out, 1L, Long::sum);
        switch (graph.getNodeType(i)) {
            case CNT:
                cnt_in_dir.merge(d_in, 1L, Long::sum);
                dircnt_in.merge(d_in, 1L, Long::sum);
                // Contents have no outgoing edges in the dir+cnt layer.
                dircnt_out.merge(0L, 1L, Long::sum);
                break;
            case DIR:
                in = indegreeTypes(graph, i);
                out = outdegreeTypes(graph, i);
                dir_in_all.merge(in[TYPE_ALL], 1L, Long::sum);
                dir_out_all.merge(out[TYPE_ALL], 1L, Long::sum);
                dir_in_dir.merge(in[TYPE_DIR], 1L, Long::sum);
                dir_in_rev.merge(in[TYPE_REV], 1L, Long::sum);
                dir_out_cnt.merge(out[TYPE_CNT], 1L, Long::sum);
                dir_out_dir.merge(out[TYPE_DIR], 1L, Long::sum);
                dir_out_rev.merge(out[TYPE_REV], 1L, Long::sum);
                dircnt_in.merge(in[TYPE_DIR] + in[TYPE_CNT], 1L, Long::sum);
                dircnt_out.merge(out[TYPE_DIR] + out[TYPE_CNT], 1L, Long::sum);
                break;
            case REV:
                in = indegreeTypes(graph, i);
                out = outdegreeTypes(graph, i);
                rev_in_all.merge(in[TYPE_ALL], 1L, Long::sum);
                rev_in_dir.merge(in[TYPE_DIR], 1L, Long::sum);
                rev_in_rev.merge(in[TYPE_REV], 1L, Long::sum);
                rev_in_rel.merge(in[TYPE_REL], 1L, Long::sum);
                rev_in_snp.merge(in[TYPE_SNP], 1L, Long::sum);
                rev_out_rev.merge(out[TYPE_REV], 1L, Long::sum);
                relrev_in.merge(in[TYPE_REL] + in[TYPE_REV], 1L, Long::sum);
                relrev_out.merge(out[TYPE_REL] + out[TYPE_REV], 1L, Long::sum);
                break;
            case REL:
                rel_in_snp.merge(d_in, 1L, Long::sum);
                // Releases have no incoming edges in the rel+rev layer.
                relrev_in.merge(0L, 1L, Long::sum);
                relrev_out.merge(d_out, 1L, Long::sum);
                break;
            case SNP:
                out = outdegreeTypes(graph, i);
                snp_in_ori.merge(d_in, 1L, Long::sum);
                snp_out_all.merge(out[TYPE_ALL], 1L, Long::sum);
                snp_out_rel.merge(out[TYPE_REL], 1L, Long::sum);
                snp_out_rev.merge(out[TYPE_REV], 1L, Long::sum);
                orisnp_in.merge(d_in, 1L, Long::sum);
                orisnp_out.merge(out[TYPE_REL] + out[TYPE_REV], 1L, Long::sum);
                break;
            case ORI:
                ori_out_snp.merge(d_out, 1L, Long::sum);
                // Origins have no incoming edges in the ori+snp layer.
                orisnp_in.merge(0L, 1L, Long::sum);
                orisnp_out.merge(d_out, 1L, Long::sum);
                break;
            default :
                pl.logger().warn("Invalid node type at pos {}", i);
                break;
        }
        pl.update();
    }
    pl.done();

    // Ignore mkdir() result: the directory may already exist.
    (new File(resultsDir)).mkdir();
    writeDistribution(full_in, resultsDir + "/full_in.txt");
    writeDistribution(full_out, resultsDir + "/full_out.txt");
    writeDistribution(dircnt_in, resultsDir + "/dir+cnt_in.txt");
    writeDistribution(dircnt_out, resultsDir + "/dir+cnt_out.txt");
    writeDistribution(relrev_in, resultsDir + "/rel+rev_in.txt");
    writeDistribution(relrev_out, resultsDir + "/rel+rev_out.txt");
    writeDistribution(orisnp_in, resultsDir + "/ori+snp_in.txt");
    writeDistribution(orisnp_out, resultsDir + "/ori+snp_out.txt");
    writeDistribution(rev_in, resultsDir + "/rev_in.txt");
    writeDistribution(rev_out, resultsDir + "/rev_out.txt");

    String resultsTypeDir = resultsDir + "/per_type";
    (new File(resultsTypeDir)).mkdir();
    writeDistribution(cnt_in_dir, resultsTypeDir + "/cnt_in_dir.txt");
    writeDistribution(dir_in_dir, resultsTypeDir + "/dir_in_dir.txt");
    writeDistribution(dir_in_rev, resultsTypeDir + "/dir_in_rev.txt");
    writeDistribution(dir_in_all, resultsTypeDir + "/dir_in_all.txt");
    writeDistribution(dir_out_all, resultsTypeDir + "/dir_out_all.txt");
    writeDistribution(dir_out_dir, resultsTypeDir + "/dir_out_dir.txt");
    writeDistribution(dir_out_cnt, resultsTypeDir + "/dir_out_cnt.txt");
    writeDistribution(dir_out_rev, resultsTypeDir + "/dir_out_rev.txt");
    writeDistribution(rev_in_dir, resultsTypeDir + "/rev_in_dir.txt");
    writeDistribution(rev_in_rel, resultsTypeDir + "/rev_in_rel.txt");
    writeDistribution(rev_in_rev, resultsTypeDir + "/rev_in_rev.txt");
    writeDistribution(rev_in_snp, resultsTypeDir + "/rev_in_snp.txt");
    writeDistribution(rev_in_all, resultsTypeDir + "/rev_in_all.txt");
    writeDistribution(rev_out_rev, resultsTypeDir + "/rev_out_rev.txt");
    writeDistribution(rel_in_snp, resultsTypeDir + "/rel_in_snp.txt");
    writeDistribution(snp_in_ori, resultsTypeDir + "/snp_in_ori.txt");
    writeDistribution(snp_out_all, resultsTypeDir + "/snp_out_all.txt");
    writeDistribution(snp_out_rel, resultsTypeDir + "/snp_out_rel.txt");
    writeDistribution(snp_out_rev, resultsTypeDir + "/snp_out_rev.txt");
    writeDistribution(ori_out_snp, resultsTypeDir + "/ori_out_snp.txt");
}
/**
 * Command-line entry point: parses arguments, loads the graph memory-mapped
 * and computes all in/out degree distributions into the results directory.
 */
static public void main(final String[] arg)
        throws IllegalArgumentException, SecurityException, IllegalAccessException, InvocationTargetException,
        NoSuchMethodException, JSAPException, IOException, ClassNotFoundException {
    final SimpleJSAP jsap = new SimpleJSAP(InOutDegree.class.getName(),
            "Computes in and out degrees of the given SWHGraph",
            new Parameter[]{
                    new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
                            JSAP.NOT_GREEDY, "The basename of the graph."),
                    new UnflaggedOption("resultsDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                            JSAP.NOT_GREEDY, "The directory of the resulting files."),});

    final JSAPResult jsapResult = jsap.parse(arg);
    if (jsap.messagePrinted())
        System.exit(1);

    final String basename = jsapResult.getString("basename");
    // Default the output directory to the graph basename when not specified.
    final String resultsDir = jsapResult.userSpecified("resultsDir")
            ? jsapResult.getString("resultsDir")
            : basename;

    // (Removed an unused ProgressLogger local; run() creates its own.)
    SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(basename);
    run(graph, resultsDir);
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java
index 3632d32..3f55826 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java
@@ -1,98 +1,105 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.topology;
import com.google.common.primitives.Longs;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.XoRoShiRo128PlusRandom;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.experiments.forks.ForkCC;
import java.io.*;
/**
 * Measures how the number of visited nodes/edges/contents grows as BFS
 * traversals are started from origins taken in a random uniform order.
 */
public class SubdatasetSizeFunction {
    /** Utility class: not meant to be instantiated. */
    private SubdatasetSizeFunction() {
    }

    /**
     * Runs a forward BFS from every origin, visiting each node at most once
     * across all traversals, and prints the cumulative "nodes edges contents"
     * counters every 10000 origins.
     *
     * @param graph the graph to traverse
     * @throws IOException if the on-disk BFS queue cannot be created or used
     */
    public static void run(final SwhBidirectionalGraph graph) throws IOException {
        final ProgressLogger pl = new ProgressLogger();
        pl.itemsName = "nodes";
        pl.expectedUpdates = graph.numNodes();

        long n = graph.numNodes();
        LongArrayBitVector visited = LongArrayBitVector.ofLength(n);

        int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
        final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue");
        // FIX: the temp file used to be leaked; remove it when the JVM exits.
        queueFile.deleteOnExit();
        final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
        final byte[] byteBuf = new byte[Long.BYTES];

        // Random permutation of node ids so origins are drawn uniformly.
        long[][] randomPerm = Util.identity(graph.numNodes());
        LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom());

        long visitedNodes = 0;
        long visitedEdges = 0;
        long visitedOrigins = 0;
        long visitedContents = 0;
        pl.start("Running traversal starting from origins...");
        for (long j = 0; j < n; ++j) {
            long i = BigArrays.get(randomPerm, j);
            if (visited.getBoolean(i) || graph.getNodeType(i) != Node.Type.ORI) {
                continue;
            }
            visitedOrigins++;
            queue.enqueue(Longs.toByteArray(i));
            visited.set(i);

            // BFS over the not-yet-visited part of the graph from this origin.
            while (!queue.isEmpty()) {
                queue.dequeue(byteBuf);
                final long currentNode = Longs.fromByteArray(byteBuf);
                visitedNodes++;
                if (graph.getNodeType(currentNode) == Node.Type.CNT)
                    visitedContents++;

                final LazyLongIterator iterator = graph.successors(currentNode);
                long succ;
                while ((succ = iterator.nextLong()) != -1) {
                    visitedEdges++;
                    if (visited.getBoolean(succ))
                        continue;
                    visited.set(succ);
                    queue.enqueue(Longs.toByteArray(succ));
                }

                pl.update();
            }

            if (visitedOrigins % 10000 == 0)
                System.out.println(visitedNodes + " " + visitedEdges + " " + visitedContents);
        }

        pl.done();
        // FIX: release the disk-backed queue (file handle leaked before).
        queue.close();
    }

    static public void main(final String[] arg)
            throws IllegalArgumentException, SecurityException, JSAPException, IOException {
        final SimpleJSAP jsap = new SimpleJSAP(SubdatasetSizeFunction.class.getName(),
                "Computes subdataset size functions using a random uniform order",
                new Parameter[]{new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
                        JSAP.NOT_GREEDY, "The basename of the graph."),});

        final JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted())
            System.exit(1);

        final String basename = jsapResult.getString("basename");
        SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(basename);
        run(graph);
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java
index 5e4a430..2b30ecf 100644
--- a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java
+++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java
@@ -1,147 +1,154 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.labels;
/**
 * Directory entries metadata are stored as edge labels on the graph. {@link DirEntry} can be
 * encoded in a single long type, to re-use Webgraph interface.
 *
 * The encoding packs the filename id in the high bits and a compact permission
 * code in the low {@code Permission.NB_BITS_PER_TYPE} bits.
 *
 * @author The Software Heritage developers
 */
public class DirEntry {
    // Id of the entry name in the label-name map.
    public long filenameId;
    // Unix-like permission value of the entry (e.g. 0100644 for a regular file).
    public int permission;

    public DirEntry(long filenameId, int permission) {
        this.filenameId = filenameId;
        this.permission = permission;
    }

    public DirEntry(long dirEntryEncoded) {
        this.filenameId = labelNameFromEncoded(dirEntryEncoded);
        this.permission = permissionFromEncoded(dirEntryEncoded);
    }

    /** Packs a filename id and a Unix permission into a single long label. */
    public static long toEncoded(long filenameId, int permission) {
        long idBits = filenameId << Permission.NB_BITS_PER_TYPE;
        return idBits + Permission.Type.toEncoded(permission);
    }

    /** Extracts the filename id from an encoded label (drops the permission bits). */
    public static long labelNameFromEncoded(long labelEncoded) {
        return labelEncoded >> Permission.NB_BITS_PER_TYPE;
    }

    /** Extracts the Unix permission value from an encoded label. */
    public static int permissionFromEncoded(long labelEncoded) {
        int permCode = (int) (labelEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1));
        return Permission.Type.fromEncoded(permCode);
    }

    /** Encodes this entry as a single long (inverse of the decoding constructor). */
    public long toEncoded() {
        return toEncoded(filenameId, permission);
    }

    /** Number of bits needed for an encoded label, given the number of distinct filenames. */
    public static int labelWidth(long numLabels) {
        int filenameIdWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2));
        if (filenameIdWidth > Long.SIZE - Permission.NB_BITS_PER_TYPE) {
            System.err.println("FIXME: Too many filenames, we can't handle more than 2^"
                    + (Long.SIZE - Permission.NB_BITS_PER_TYPE) + " for now.");
            System.exit(2);
        }
        return filenameIdWidth + Permission.NB_BITS_PER_TYPE;
    }

    /**
     * Permission types present in the Software Heritage graph.
     *
     * @author The Software Heritage developers
     */
    private static class Permission {
        // Bits reserved at the low end of an encoded label for the permission code.
        public static final int NB_BITS_PER_TYPE = (int) Math
                .ceil(Math.log(Permission.Type.values().length) / Math.log(2));

        public enum Type {
            NONE, CONTENT, EXECUTABLE_CONTENT, SYMLINK, DIRECTORY, REVISION;

            /** Decodes a compact integer code back into a permission type. */
            public static Permission.Type fromIntCode(int intCode) {
                switch (intCode) {
                    case 0 : return NONE;
                    case 1 : return CONTENT;
                    case 2 : return EXECUTABLE_CONTENT;
                    case 3 : return SYMLINK;
                    case 4 : return DIRECTORY;
                    case 5 : return REVISION;
                    default : throw new IllegalArgumentException("Unknown node permission code: " + intCode);
                }
            }

            /** Encodes a permission type as a compact integer code (inverse of fromIntCode). */
            public static int toIntCode(Permission.Type type) {
                switch (type) {
                    case NONE : return 0;
                    case CONTENT : return 1;
                    case EXECUTABLE_CONTENT : return 2;
                    case SYMLINK : return 3;
                    case DIRECTORY : return 4;
                    case REVISION : return 5;
                    default : throw new IllegalArgumentException("Unknown node permission type: " + type);
                }
            }

            /** Maps a Unix permission value (octal mode) to a type; unrecognized modes map to NONE. */
            public static Permission.Type fromIntPerm(int intPerm) {
                switch (intPerm) {
                    case 0100644 : return CONTENT;
                    case 0100755 : return EXECUTABLE_CONTENT;
                    case 0120000 : return SYMLINK;
                    case 0040000 : return DIRECTORY;
                    case 0160000 : return REVISION;
                    default :
                        // 0 and any unknown mode are both treated as NONE.
                        return NONE;
                }
            }

            /** Maps a permission type back to its canonical Unix permission value. */
            public static int toIntPerm(Permission.Type type) {
                switch (type) {
                    case NONE : return 0;
                    case CONTENT : return 0100644;
                    case EXECUTABLE_CONTENT : return 0100755;
                    case SYMLINK : return 0120000;
                    case DIRECTORY : return 0040000;
                    case REVISION : return 0160000;
                    default : throw new IllegalArgumentException("Unknown node permission type: " + type);
                }
            }

            /** Unix permission value from a compact code. */
            public static int fromEncoded(int encoded) {
                return toIntPerm(fromIntCode(encoded));
            }

            /** Compact code from a Unix permission value. */
            public static int toEncoded(int permission) {
                return toIntCode(fromIntPerm(permission));
            }
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
index c84cfec..f1a2c18 100644
--- a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
+++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java
@@ -1,110 +1,117 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.labels;
import it.unimi.dsi.big.webgraph.labelling.AbstractLabel;
import it.unimi.dsi.big.webgraph.labelling.FixedWidthLongListLabel;
import it.unimi.dsi.big.webgraph.labelling.Label;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import java.io.IOException;
import java.util.Arrays;
/**
* Software Heritage graph edge labels following Webgraph labels convention.
*
* @author The Software Heritage developers
*/
public class SwhLabel extends AbstractLabel {
private final String key;
private final int width;
// TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry
// to store branch names)
public DirEntry[] value;
// Use existing Webgraph class to represent a list of DirEntry as a list of encoded long
private final FixedWidthLongListLabel longList;
private static final DirEntry[] EMPTY_ARRAY = {};
public SwhLabel(String key, int width, DirEntry[] value) {
this.key = key;
this.width = width;
this.value = value;
long[] valueEncoded = new long[value.length];
for (int i = 0; i < value.length; i++)
valueEncoded[i] = value[i].toEncoded();
this.longList = new FixedWidthLongListLabel(key, width, valueEncoded);
}
public SwhLabel(String key, int width) {
this(key, width, EMPTY_ARRAY);
}
public SwhLabel(String... arg) {
this(arg[0], Integer.parseInt(arg[1]));
}
@Override
public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException {
int ret = longList.fromBitStream(inputBitStream, sourceUnused);
// Decode values from their internal long representation
value = new DirEntry[longList.value.length];
for (int i = 0; i < value.length; i++)
value[i] = new DirEntry(longList.value[i]);
return ret;
}
@Override
public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException {
// Values have already been encoded in the SwhLabel constructor
return longList.toBitStream(outputBitStream, sourceUnused);
}
@Override
public String wellKnownAttributeKey() {
return key;
}
@Override
public String[] attributeKeys() {
return new String[]{key};
}
@Override
public Class>[] attributeTypes() {
return new Class[]{DirEntry[].class};
}
@Override
public Object get(String s) {
if (this.key.equals(s))
return value;
throw new IllegalArgumentException();
}
@Override
public Object get() {
return value;
}
@Override
public Label copy() {
return new SwhLabel(key, width, value.clone());
}
@Override
public int fixedWidth() {
return -1;
}
@Override
public String toString() {
return key + ":" + Arrays.toString(value) + " (width:" + width + ")";
}
@Override
public String toSpec() {
return this.getClass().getName() + "(" + key + "," + width + ")";
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
index 7ca8c77..fb65937 100644
--- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java
@@ -1,189 +1,196 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.maps;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.bytes.ByteBigList;
import it.unimi.dsi.fastutil.bytes.ByteMappedBigList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.longs.LongMappedBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import org.softwareheritage.graph.SWHID;
import org.softwareheritage.graph.compress.NodeMapBuilder;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
/**
 * Mapping between internal long node id and external SWHID.
 *
 * The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using
 * an mmap()-ed .order file containing the graph permutation.
 *
 * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the
 * {@link NodeMapBuilder} class, then it is loaded here using mmap().
 *
 * @author The Software Heritage developers
 * @see NodeMapBuilder
 */
public class NodeIdMap implements Size64 {
    /** Fixed length of binary SWHID buffer */
    public static final int SWHID_BIN_SIZE = 22;

    /** File extension for the long node id to SWHID map */
    public static final String NODE_TO_SWHID = ".node2swhid.bin";

    /** Graph path and basename */
    String graphPath;

    /** mmap()-ed NODE_TO_SWHID file */
    ByteBigList nodeToSwhMap;

    /** Minimal perfect hash (MPH) function SWHID -> initial order */
    Object2LongFunction<byte[]> mph;
    /** mmap()-ed long list with the permutation initial order -> graph order */
    LongBigList orderMap;

    /**
     * Constructor.
     *
     * @param graphPath full graph path
     */
    public NodeIdMap(String graphPath) throws IOException {
        this.graphPath = graphPath;

        // node -> SWHID
        try (RandomAccessFile raf = new RandomAccessFile(graphPath + NODE_TO_SWHID, "r")) {
            this.nodeToSwhMap = ByteMappedBigList.map(raf.getChannel());
        }

        // SWHID -> node
        this.mph = loadMph(graphPath + ".mph");
        try (RandomAccessFile mapFile = new RandomAccessFile(new File(graphPath + ".order"), "r")) {
            this.orderMap = LongMappedBigList.map(mapFile.getChannel());
        }
    }

    /**
     * Loads the serialized MPH function from disk, wrapping legacy
     * String-parametrized functions so callers can always query with byte[].
     */
    @SuppressWarnings("unchecked")
    public static Object2LongFunction<byte[]> loadMph(String path) throws IOException {
        Object obj;
        try {
            obj = BinIO.loadObject(path);
        } catch (ClassNotFoundException e) {
            // Chain the cause instead of flattening it to a message string.
            throw new IOException(e.getMessage(), e);
        }
        Object2LongFunction<byte[]> res = (Object2LongFunction<byte[]>) obj;

        // Backward-compatibility for old maps parametrized with <String>.
        // New maps should be parametrized with <byte[]>, which is faster.
        try {
            // Try to call it with bytes, will fail if it's an Object2LongFunction<String>.
            res.getLong("42".getBytes(StandardCharsets.UTF_8));
        } catch (ClassCastException e) {
            // Adapter: accept byte[] keys and forward them as UTF-8 Strings.
            class StringCompatibleByteFunction implements Object2LongFunction<byte[]>, Size64 {
                private final Object2LongFunction<String> legacyFunction;

                public StringCompatibleByteFunction(Object2LongFunction<String> legacyFunction) {
                    this.legacyFunction = legacyFunction;
                }

                @Override
                public long getLong(Object o) {
                    byte[] bi = (byte[]) o;
                    return legacyFunction.getLong(new String(bi, StandardCharsets.UTF_8));
                }

                @SuppressWarnings("deprecation")
                @Override
                public int size() {
                    return legacyFunction.size();
                }

                @Override
                public long size64() {
                    return (legacyFunction instanceof Size64)
                            ? ((Size64) legacyFunction).size64()
                            : legacyFunction.size();
                }
            }

            Object2LongFunction<String> mphLegacy = (Object2LongFunction<String>) obj;
            return new StringCompatibleByteFunction(mphLegacy);
        }
        // End of backward-compatibility block

        return res;
    }

    /**
     * Converts byte-form SWHID to corresponding long node id. Low-level function, does not check if the
     * SWHID is valid.
     *
     * @param swhid node represented as bytes
     * @return corresponding node as a long id
     */
    public long getNodeId(byte[] swhid) {
        // 1. Hash the SWHID with the MPH to get its original ID
        long origNodeId = mph.getLong(swhid);

        // 2. Use the order permutation to get the position in the permuted graph
        return this.orderMap.getLong(origNodeId);
    }

    /**
     * Converts SWHID to corresponding long node id.
     *
     * @param swhid node represented as a {@link SWHID}
     * @param checkExists if true, error if the SWHID is not present in the graph, if false the check
     *            will be skipped and invalid data will be returned for non-existing SWHIDs.
     * @return corresponding node as a long id
     * @see SWHID
     */
    public long getNodeId(SWHID swhid, boolean checkExists) {
        // Convert the SWHID to bytes and call getNodeId()
        long nodeId = getNodeId(swhid.toString().getBytes(StandardCharsets.US_ASCII));

        // Check that the position effectively corresponds to a real node using the reverse map.
        // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid.
        if (!checkExists || getSWHID(nodeId).equals(swhid)) {
            return nodeId;
        } else {
            throw new IllegalArgumentException("Unknown SWHID: " + swhid);
        }
    }

    public long getNodeId(SWHID swhid) {
        return getNodeId(swhid, true);
    }

    /**
     * Converts a node long id to corresponding SWHID.
     *
     * @param nodeId node as a long id
     * @return corresponding node as a {@link SWHID}
     * @see SWHID
     */
    public SWHID getSWHID(long nodeId) {
        /*
         * Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's
         * swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid
         */
        if (nodeId < 0 || nodeId >= nodeToSwhMap.size64()) {
            throw new IllegalArgumentException(
                    "Node id " + nodeId + " should be between 0 and " + nodeToSwhMap.size64());
        }

        byte[] swhid = new byte[SWHID_BIN_SIZE];
        nodeToSwhMap.getElements(nodeId * SWHID_BIN_SIZE, swhid, 0, SWHID_BIN_SIZE);
        return SWHID.fromBytes(swhid);
    }

    /** Return the number of nodes in the map. */
    @Override
    public long size64() {
        return nodeToSwhMap.size64();
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java
index d3da61d..3332607 100644
--- a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java
+++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java
@@ -1,55 +1,62 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.maps;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import org.softwareheritage.graph.Node;
import java.io.IOException;
/**
 * Mapping between long node id and SWH node type as described in the
 * data model.
 *
 * The type mapping is pre-computed and dumped on disk in the
 * {@link org.softwareheritage.graph.compress.NodeMapBuilder} class, then it is loaded in-memory
 * here using fastutil LongBigList. To be
 * space-efficient, the mapping is stored as a bitmap using minimum number of bits per
 * {@link Node.Type}.
 *
 * @author The Software Heritage developers
 */
public class NodeTypesMap {
    /** File extension for the long node id to node type map */
    public static final String NODE_TO_TYPE = ".node2type.map";

    /**
     * Array storing for each node its type
     */
    public LongBigList nodeTypesMap;

    /**
     * Constructor.
     *
     * @param graphPath path and basename of the compressed graph
     */
    public NodeTypesMap(String graphPath) throws IOException {
        try {
            nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + NODE_TO_TYPE);
        } catch (ClassNotFoundException e) {
            // FIX: chain the cause so the original deserialization failure is not lost.
            throw new IllegalArgumentException("Unknown class object: " + e, e);
        }
    }

    /**
     * Returns node type from a node long id.
     *
     * @param nodeId node as a long id
     * @return corresponding {@link Node.Type} value
     * @see org.softwareheritage.graph.Node.Type
     */
    public Node.Type getType(long nodeId) {
        long type = nodeTypesMap.getLong(nodeId);
        return Node.Type.fromInt((int) type);
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java b/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java
index 64acfba..470f6da 100644
--- a/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java
+++ b/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java
@@ -1,293 +1,300 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.rpc;
import com.google.protobuf.FieldMask;
import com.martiansoftware.jsap.*;
import io.grpc.Server;
import io.grpc.Status;
import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
import io.grpc.netty.shaded.io.netty.channel.ChannelOption;
import io.grpc.stub.StreamObserver;
import io.grpc.protobuf.services.ProtoReflectionService;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.SWHID;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.compress.LabelMapBuilder;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
/**
* Server that manages startup/shutdown of a {@code Greeter} server.
*/
public class GraphServer {
private final static Logger logger = LoggerFactory.getLogger(GraphServer.class);

// The graph served by this process, with all its property files loaded.
private final SwhBidirectionalGraph graph;
// TCP port the GRPC server listens on.
private final int port;
// Size of the server's fixed thread pool.
private final int threads;
// The underlying GRPC server; null until start() is called.
private Server server;

/**
 * @param graphBasename the basename of the SWH graph to load
 * @param port the port on which the GRPC server will listen
 * @param threads the number of threads to use in the server threadpool
 */
public GraphServer(String graphBasename, int port, int threads) throws IOException {
    this.graph = loadGraph(graphBasename);
    this.port = port;
    this.threads = threads;
}
/**
 * Load a graph and all its properties.
 *
 * Memory-maps the labelled bidirectional graph, then loads every node/edge
 * property file used by the RPC endpoints (content metadata, person ids,
 * timestamps, messages, tag/label names).
 */
public static SwhBidirectionalGraph loadGraph(String basename) throws IOException {
    SwhBidirectionalGraph g = SwhBidirectionalGraph.loadLabelledMapped(basename, new ProgressLogger(logger));
    g.loadContentLength();
    g.loadContentIsSkipped();
    g.loadPersonIds();
    g.loadAuthorTimestamps();
    g.loadCommitterTimestamps();
    g.loadMessages();
    g.loadTagNames();
    g.loadLabelNames();
    return g;
}
/** Start the RPC server. */
private void start() throws IOException {
    // SO_REUSEADDR lets the server rebind the port quickly after a restart;
    // ProtoReflectionService enables tools like grpcurl to discover the API.
    server = NettyServerBuilder.forPort(port).withChildOption(ChannelOption.SO_REUSEADDR, true)
            .executor(Executors.newFixedThreadPool(threads)).addService(new TraversalService(graph))
            .addService(ProtoReflectionService.newInstance()).build().start();
    logger.info("Server started, listening on " + port);
    // Shut down gracefully when the JVM is terminated (e.g. Ctrl-C / SIGTERM).
    Runtime.getRuntime().addShutdownHook(new Thread(() -> {
        try {
            GraphServer.this.stop();
        } catch (InterruptedException e) {
            e.printStackTrace(System.err);
        }
    }));
}
/** Shut down the RPC server, waiting up to 30 seconds for termination. */
private void stop() throws InterruptedException {
    if (server != null) {
        server.shutdown().awaitTermination(30, TimeUnit.SECONDS);
    }
}
/**
 * Await termination on the main thread since the grpc library uses daemon threads.
 */
private void blockUntilShutdown() throws InterruptedException {
    if (server != null) {
        server.awaitTermination();
    }
}
/**
 * Parses command-line options: port, thread count and the graph basename.
 *
 * NOTE(review): on JSAPException this prints the stack trace and returns
 * {@code null} — callers must check for null before using the result.
 */
private static JSAPResult parseArgs(String[] args) {
    JSAPResult config = null;
    try {
        SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "",
                new Parameter[]{
                        new FlaggedOption("port", JSAP.INTEGER_PARSER, "50091", JSAP.NOT_REQUIRED, 'p', "port",
                                "The port on which the server should listen."),
                        new FlaggedOption("threads", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 't', "threads",
                                "The number of concurrent threads. 0 = number of cores."),
                        new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "Basename of the output graph")});
        config = jsap.parse(args);
        if (jsap.messagePrinted()) {
            System.exit(1);
        }
    } catch (JSAPException e) {
        e.printStackTrace();
    }
    return config;
}
/** Main launches the server from the command line. */
public static void main(String[] args) throws IOException, InterruptedException {
    JSAPResult config = parseArgs(args);
    if (config == null) {
        // FIX: parseArgs() returns null when parsing failed; exit cleanly
        // instead of crashing below with a NullPointerException.
        System.exit(1);
    }
    String graphBasename = config.getString("graphBasename");
    int port = config.getInt("port");
    int threads = config.getInt("threads");
    if (threads == 0) {
        // 0 means "use all available cores".
        threads = Runtime.getRuntime().availableProcessors();
    }

    final GraphServer server = new GraphServer(graphBasename, port, threads);
    server.start();
    server.blockUntilShutdown();
}
/** Implementation of the Traversal service, which contains all the graph querying endpoints. */
static class TraversalService extends TraversalServiceGrpc.TraversalServiceImplBase {
// Shared graph instance; each endpoint calls copy() to get its own iterators.
SwhBidirectionalGraph graph;

public TraversalService(SwhBidirectionalGraph graph) {
    this.graph = graph;
}
/**
 * Return various statistics on the overall graph.
 *
 * Node/edge counts come from the loaded graph; compression and degree
 * statistics are read from the .properties and .stats files produced at
 * compression time.
 */
@Override
public void stats(StatsRequest request, StreamObserver<StatsResponse> responseObserver) {
    StatsResponse.Builder response = StatsResponse.newBuilder();
    response.setNumNodes(graph.numNodes());
    response.setNumEdges(graph.numArcs());

    Properties properties = new Properties();
    // FIX: the two FileInputStreams were never closed (file-handle leak);
    // try-with-resources closes them on every path.
    try (FileInputStream propsStream = new FileInputStream(graph.getPath() + ".properties");
            FileInputStream statsStream = new FileInputStream(graph.getPath() + ".stats")) {
        properties.load(propsStream);
        properties.load(statsStream);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    response.setCompressionRatio(Double.parseDouble(properties.getProperty("compratio")));
    response.setBitsPerNode(Double.parseDouble(properties.getProperty("bitspernode")));
    response.setBitsPerEdge(Double.parseDouble(properties.getProperty("bitsperlink")));
    response.setAvgLocality(Double.parseDouble(properties.getProperty("avglocality")));
    response.setIndegreeMin(Long.parseLong(properties.getProperty("minindegree")));
    response.setIndegreeMax(Long.parseLong(properties.getProperty("maxindegree")));
    response.setIndegreeAvg(Double.parseDouble(properties.getProperty("avgindegree")));
    response.setOutdegreeMin(Long.parseLong(properties.getProperty("minoutdegree")));
    response.setOutdegreeMax(Long.parseLong(properties.getProperty("maxoutdegree")));
    response.setOutdegreeAvg(Double.parseDouble(properties.getProperty("avgoutdegree")));
    responseObserver.onNext(response.build());
    responseObserver.onCompleted();
}
/**
 * Return a single node and its properties.
 *
 * Resolves the requested SWHID to a node id and replies INVALID_ARGUMENT if
 * the SWHID is malformed or not present in the graph.
 */
@Override
public void getNode(GetNodeRequest request, StreamObserver responseObserver) {
    // copy() gives this request its own iterators over the shared graph data.
    SwhBidirectionalGraph g = graph.copy();
    long nodeId;
    try {
        nodeId = g.getNodeId(new SWHID(request.getSwhid()));
    } catch (IllegalArgumentException e) {
        responseObserver
                .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException());
        return;
    }
    Node.Builder builder = Node.newBuilder();
    // Only the fields selected by the request's FieldMask are filled, if one is set.
    NodePropertyBuilder.buildNodeProperties(g.getForwardGraph(), request.hasMask() ? request.getMask() : null,
            builder, nodeId);
    responseObserver.onNext(builder.build());
    responseObserver.onCompleted();
}
/** Perform a BFS traversal from a set of source nodes and stream the nodes encountered. */
@Override
public void traverse(TraversalRequest request, StreamObserver responseObserver) {
    // copy() gives this request its own iterators over the shared graph data.
    SwhBidirectionalGraph g = graph.copy();
    Traversal.SimpleTraversal t;
    try {
        // Nodes are streamed to the client as they are visited.
        t = new Traversal.SimpleTraversal(g, request, responseObserver::onNext);
    } catch (IllegalArgumentException e) {
        // Invalid sources/edge restrictions in the request map to INVALID_ARGUMENT.
        responseObserver
                .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException());
        return;
    }
    t.visit();
    responseObserver.onCompleted();
}
/**
 * Find the shortest path between a set of source nodes and a node that matches a given criteria
 * using a BFS.
 */
@Override
public void findPathTo(FindPathToRequest request, StreamObserver responseObserver) {
    SwhBidirectionalGraph localGraph = graph.copy();
    Traversal.FindPathTo search;
    try {
        search = new Traversal.FindPathTo(localGraph, request);
    } catch (IllegalArgumentException e) {
        responseObserver
                .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException());
        return;
    }
    search.visit();
    Path result = search.getPath();
    if (result == null) {
        // No node matching the target criteria was reachable from the sources.
        responseObserver.onError(Status.NOT_FOUND.asException());
        return;
    }
    responseObserver.onNext(result);
    responseObserver.onCompleted();
}
/**
 * Find the shortest path between a set of source nodes and a set of destination nodes using a
 * bidirectional BFS.
 */
@Override
public void findPathBetween(FindPathBetweenRequest request, StreamObserver responseObserver) {
    SwhBidirectionalGraph localGraph = graph.copy();
    Traversal.FindPathBetween search;
    try {
        search = new Traversal.FindPathBetween(localGraph, request);
    } catch (IllegalArgumentException e) {
        responseObserver
                .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException());
        return;
    }
    search.visit();
    Path result = search.getPath();
    if (result == null) {
        // The two BFS frontiers never met: no path under the given constraints.
        responseObserver.onError(Status.NOT_FOUND.asException());
        return;
    }
    responseObserver.onNext(result);
    responseObserver.onCompleted();
}
/** Return the number of nodes traversed by a BFS traversal. */
@Override
public void countNodes(TraversalRequest request, StreamObserver responseObserver) {
    AtomicLong nodeTotal = new AtomicLong(0);
    SwhBidirectionalGraph localGraph = graph.copy();
    // Strip the field mask: property extraction is useless when we only count nodes.
    TraversalRequest maskedRequest = TraversalRequest.newBuilder(request)
            .setMask(FieldMask.getDefaultInstance()).build();
    Traversal.SimpleTraversal traversal;
    try {
        traversal = new Traversal.SimpleTraversal(localGraph, maskedRequest, node -> nodeTotal.incrementAndGet());
    } catch (IllegalArgumentException e) {
        responseObserver
                .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException());
        return;
    }
    traversal.visit();
    responseObserver.onNext(CountResponse.newBuilder().setCount(nodeTotal.get()).build());
    responseObserver.onCompleted();
}
/** Return the number of edges traversed by a BFS traversal. */
@Override
public void countEdges(TraversalRequest request, StreamObserver responseObserver) {
    AtomicLong edgeTotal = new AtomicLong(0);
    SwhBidirectionalGraph localGraph = graph.copy();
    // Only request num_successors so each visited node reports its outgoing edge count.
    TraversalRequest maskedRequest = TraversalRequest.newBuilder(request)
            .setMask(FieldMask.newBuilder().addPaths("num_successors").build()).build();
    Traversal.SimpleTraversal traversal;
    try {
        traversal = new Traversal.SimpleTraversal(localGraph, maskedRequest,
                node -> edgeTotal.addAndGet(node.getNumSuccessors()));
    } catch (IllegalArgumentException e) {
        responseObserver
                .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException());
        return;
    }
    traversal.visit();
    responseObserver.onNext(CountResponse.newBuilder().setCount(edgeTotal.get()).build());
    responseObserver.onCompleted();
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java b/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java
index 5b5bf8e..bbdf4fa 100644
--- a/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java
+++ b/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java
@@ -1,526 +1,533 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.rpc;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
import it.unimi.dsi.big.webgraph.labelling.Label;
import org.softwareheritage.graph.*;
import java.util.*;
/** Traversal contains all the algorithms used for graph traversals */
public class Traversal {
    /**
     * Wrapper around g.successors(), only follows edges that are allowed by the given
     * {@link AllowedEdges} object.
     */
    private static ArcLabelledNodeIterator.LabelledArcIterator filterLabelledSuccessors(SwhUnidirectionalGraph g,
            long nodeId, AllowedEdges allowedEdges) {
        if (allowedEdges.restrictedTo == null) {
            // All edges are allowed, bypass edge check
            return g.labelledSuccessors(nodeId);
        } else {
            ArcLabelledNodeIterator.LabelledArcIterator allSuccessors = g.labelledSuccessors(nodeId);
            return new ArcLabelledNodeIterator.LabelledArcIterator() {
                @Override
                public Label label() {
                    return allSuccessors.label();
                }

                @Override
                public long nextLong() {
                    long neighbor;
                    // Skip successors reached through a disallowed edge type.
                    while ((neighbor = allSuccessors.nextLong()) != -1) {
                        if (allowedEdges.isAllowed(g.getNodeType(nodeId), g.getNodeType(neighbor))) {
                            return neighbor;
                        }
                    }
                    return -1;
                }

                @Override
                public long skip(final long n) {
                    long i = 0;
                    while (i < n && nextLong() != -1)
                        i++;
                    return i;
                }
            };
        }
    }

    /** Helper class to check that a given node is "valid" for some given {@link NodeFilter} */
    private static class NodeFilterChecker {
        private final SwhUnidirectionalGraph g;
        private final NodeFilter filter;
        private final AllowedNodes allowedNodes;

        private NodeFilterChecker(SwhUnidirectionalGraph graph, NodeFilter filter) {
            this.g = graph;
            this.filter = filter;
            this.allowedNodes = new AllowedNodes(filter.hasTypes() ? filter.getTypes() : "*");
        }

        /** Returns true if the given node passes the node-type filter. */
        public boolean allowed(long nodeId) {
            if (filter == null) {
                return true;
            }
            return this.allowedNodes.isAllowed(g.getNodeType(nodeId));
        }
    }

    /** Returns the unidirectional graph from a bidirectional graph and a {@link GraphDirection}. */
    public static SwhUnidirectionalGraph getDirectedGraph(SwhBidirectionalGraph g, GraphDirection direction) {
        switch (direction) {
            case FORWARD:
                return g.getForwardGraph();
            case BACKWARD:
                return g.getBackwardGraph();
            /*
             * TODO: add support for BOTH case BOTH: return new SwhUnidirectionalGraph(g.symmetrize(),
             * g.getProperties());
             */
            default:
                throw new IllegalArgumentException("Unknown direction: " + direction);
        }
    }

    /** Returns the opposite of a given {@link GraphDirection} (equivalent to a graph transposition). */
    public static GraphDirection reverseDirection(GraphDirection direction) {
        switch (direction) {
            case FORWARD:
                return GraphDirection.BACKWARD;
            case BACKWARD:
                return GraphDirection.FORWARD;
            /*
             * TODO: add support for BOTH case BOTH: return GraphDirection.BOTH;
             */
            default:
                throw new IllegalArgumentException("Unknown direction: " + direction);
        }
    }

    /** Dummy exception to short-circuit and interrupt a graph traversal. */
    static class StopTraversalException extends RuntimeException {
    }

    /** Generic BFS traversal algorithm. */
    static class BFSVisitor {
        /** The graph to traverse. */
        protected final SwhUnidirectionalGraph g;
        /** Depth of the node currently being visited */
        protected long depth = 0;
        /**
         * Number of traversal successors (i.e., successors that will be considered by the traversal) of the
         * node currently being visited
         */
        protected long traversalSuccessors = 0;
        /** Number of edges accessed since the beginning of the traversal */
        protected long edgesAccessed = 0;

        /**
         * Map from a node ID to its parent node ID. The key set can be used as the set of all visited
         * nodes.
         */
        protected HashMap<Long, Long> parents = new HashMap<>();
        /** Queue of nodes to visit (also called "frontier", "open set", "wavefront" etc.) */
        protected ArrayDeque<Long> queue = new ArrayDeque<>();
        /** If > 0, the maximum depth of the traversal. */
        private long maxDepth = -1;
        /** If > 0, the maximum number of edges to traverse. */
        private long maxEdges = -1;

        BFSVisitor(SwhUnidirectionalGraph g) {
            this.g = g;
        }

        /** Add a new source node to the initial queue. */
        public void addSource(long nodeId) {
            queue.add(nodeId);
            parents.put(nodeId, -1L); // -1 marks a BFS root (no parent)
        }

        /** Set the maximum depth of the traversal. */
        public void setMaxDepth(long depth) {
            maxDepth = depth;
        }

        /** Set the maximum number of edges to traverse. */
        public void setMaxEdges(long edges) {
            maxEdges = edges;
        }

        /** Setup the visit counters and depth sentinel. */
        public void visitSetup() {
            edgesAccessed = 0;
            depth = 0;
            queue.add(-1L); // depth sentinel
        }

        /** Perform the visit */
        public void visit() {
            visitSetup();
            while (!queue.isEmpty()) {
                visitStep();
            }
        }

        /** Single "step" of a visit. Advance the frontier of exactly one node. */
        public void visitStep() {
            try {
                assert !queue.isEmpty();
                long curr = queue.poll();
                if (curr == -1L) {
                    // Depth sentinel dequeued: a whole BFS level is done, move to the next one.
                    ++depth;
                    if (!queue.isEmpty()) {
                        queue.add(-1L);
                        visitStep();
                    }
                    return;
                }
                if (maxDepth >= 0 && depth > maxDepth) {
                    throw new StopTraversalException();
                }
                edgesAccessed += g.outdegree(curr);
                if (maxEdges >= 0 && edgesAccessed > maxEdges) {
                    throw new StopTraversalException();
                }
                visitNode(curr);
            } catch (StopTraversalException e) {
                // Traversal is over, clear the to-do queue.
                queue.clear();
            }
        }

        /**
         * Get the successors of a node. Override this function if you want to filter which successors are
         * considered during the traversal.
         */
        protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) {
            return g.labelledSuccessors(nodeId);
        }

        /** Visit a node. Override to do additional processing on the node. */
        protected void visitNode(long node) {
            ArcLabelledNodeIterator.LabelledArcIterator it = getSuccessors(node);
            traversalSuccessors = 0;
            for (long succ; (succ = it.nextLong()) != -1;) {
                traversalSuccessors++;
                visitEdge(node, succ, it.label());
            }
        }

        /** Visit an edge. Override to do additional processing on the edge. */
        protected void visitEdge(long src, long dst, Label label) {
            if (!parents.containsKey(dst)) {
                queue.add(dst);
                parents.put(dst, src);
            }
        }
    }

    /**
     * SimpleTraversal is used by the Traverse endpoint. It extends BFSVisitor with additional
     * processing, notably related to graph properties and filters.
     */
    static class SimpleTraversal extends BFSVisitor {
        private final NodeFilterChecker nodeReturnChecker;
        private final AllowedEdges allowedEdges;
        private final TraversalRequest request;
        private final NodePropertyBuilder.NodeDataMask nodeDataMask;
        private final NodeObserver nodeObserver;

        // Builder of the node currently being visited; null when the node is filtered out.
        private Node.Builder nodeBuilder;

        SimpleTraversal(SwhBidirectionalGraph bidirectionalGraph, TraversalRequest request, NodeObserver nodeObserver) {
            super(getDirectedGraph(bidirectionalGraph, request.getDirection()));
            this.request = request;
            this.nodeObserver = nodeObserver;
            this.nodeReturnChecker = new NodeFilterChecker(g, request.getReturnNodes());
            this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null);
            this.allowedEdges = new AllowedEdges(request.hasEdges() ? request.getEdges() : "*");
            request.getSrcList().forEach(srcSwhid -> {
                long srcNodeId = g.getNodeId(new SWHID(srcSwhid));
                addSource(srcNodeId);
            });
            if (request.hasMaxDepth()) {
                setMaxDepth(request.getMaxDepth());
            }
            if (request.hasMaxEdges()) {
                setMaxEdges(request.getMaxEdges());
            }
        }

        @Override
        protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) {
            return filterLabelledSuccessors(g, nodeId, allowedEdges);
        }

        @Override
        public void visitNode(long node) {
            nodeBuilder = null;
            if (nodeReturnChecker.allowed(node) && (!request.hasMinDepth() || depth >= request.getMinDepth())) {
                nodeBuilder = Node.newBuilder();
                NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, node);
            }
            super.visitNode(node);
            // traversalSuccessors is only known after super.visitNode() enumerated the edges,
            // so the min/max successor-count filter must be applied afterwards.
            if (request.getReturnNodes().hasMinTraversalSuccessors()
                    && traversalSuccessors < request.getReturnNodes().getMinTraversalSuccessors()
                    || request.getReturnNodes().hasMaxTraversalSuccessors()
                            && traversalSuccessors > request.getReturnNodes().getMaxTraversalSuccessors()) {
                nodeBuilder = null;
            }
            if (nodeBuilder != null) {
                nodeObserver.onNext(nodeBuilder.build());
            }
        }

        @Override
        protected void visitEdge(long src, long dst, Label label) {
            super.visitEdge(src, dst, label);
            NodePropertyBuilder.buildSuccessorProperties(g, nodeDataMask, nodeBuilder, src, dst, label);
        }
    }

    /**
     * FindPathTo searches for a path from a source node to a node matching a given criteria It extends
     * BFSVisitor with additional processing, and makes the traversal stop as soon as a node matching
     * the given criteria is found.
     */
    static class FindPathTo extends BFSVisitor {
        private final AllowedEdges allowedEdges;
        private final FindPathToRequest request;
        private final NodePropertyBuilder.NodeDataMask nodeDataMask;
        private final NodeFilterChecker targetChecker;

        // First node found that matches the target criteria; null until then.
        private Long targetNode = null;

        FindPathTo(SwhBidirectionalGraph bidirectionalGraph, FindPathToRequest request) {
            super(getDirectedGraph(bidirectionalGraph, request.getDirection()));
            this.request = request;
            this.targetChecker = new NodeFilterChecker(g, request.getTarget());
            this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null);
            this.allowedEdges = new AllowedEdges(request.hasEdges() ? request.getEdges() : "*");
            if (request.hasMaxDepth()) {
                setMaxDepth(request.getMaxDepth());
            }
            if (request.hasMaxEdges()) {
                setMaxEdges(request.getMaxEdges());
            }
            request.getSrcList().forEach(srcSwhid -> {
                long srcNodeId = g.getNodeId(new SWHID(srcSwhid));
                addSource(srcNodeId);
            });
        }

        @Override
        protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) {
            return filterLabelledSuccessors(g, nodeId, allowedEdges);
        }

        @Override
        public void visitNode(long node) {
            if (targetChecker.allowed(node)) {
                targetNode = node;
                throw new StopTraversalException();
            }
            super.visitNode(node);
        }

        /**
         * Once the visit has been performed and a matching node has been found, return the shortest path
         * from the source set to that node. To do so, we need to backtrack the parents of the node until we
         * find one of the source nodes (whose parent is -1).
         */
        public Path getPath() {
            if (targetNode == null) {
                return null; // No path found.
            }

            /* Backtrack from targetNode to a source node */
            long curNode = targetNode;
            ArrayList<Long> path = new ArrayList<>();
            while (curNode != -1) {
                path.add(curNode);
                curNode = parents.get(curNode);
            }
            Collections.reverse(path);

            /* Enrich path with node properties */
            Path.Builder pathBuilder = Path.newBuilder();
            for (long nodeId : path) {
                Node.Builder nodeBuilder = Node.newBuilder();
                NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, nodeId);
                pathBuilder.addNode(nodeBuilder.build());
            }
            return pathBuilder.build();
        }
    }

    /**
     * FindPathBetween searches for a shortest path between a set of source nodes and a set of
     * destination nodes.
     *
     * It does so by performing a *bidirectional breadth-first search*, i.e., two parallel breadth-first
     * searches, one from the source set ("src-BFS") and one from the destination set ("dst-BFS"), until
     * both searches find a common node that joins their visited sets. This node is called the "midpoint
     * node". The path returned is the path src -> ... -> midpoint -> ... -> dst, which is always a
     * shortest path between src and dst.
     *
     * The graph direction of both BFS can be configured separately. By default, the dst-BFS will use
     * the graph in the opposite direction than the src-BFS (if direction = FORWARD, by default
     * direction_reverse = BACKWARD, and vice-versa). The default behavior is thus to search for a
     * shortest path between two nodes in a given direction. However, one can also specify FORWARD or
     * BACKWARD for *both* the src-BFS and the dst-BFS. This will search for a common descendant or a
     * common ancestor between the two sets, respectively. These will be the midpoints of the returned
     * path.
     */
    static class FindPathBetween extends BFSVisitor {
        private final FindPathBetweenRequest request;
        private final NodePropertyBuilder.NodeDataMask nodeDataMask;
        private final AllowedEdges allowedEdgesSrc;
        private final AllowedEdges allowedEdgesDst;

        private final BFSVisitor srcVisitor;
        private final BFSVisitor dstVisitor;

        // Node where the two BFS frontiers met; null if they never met.
        private Long middleNode = null;

        FindPathBetween(SwhBidirectionalGraph bidirectionalGraph, FindPathBetweenRequest request) {
            super(getDirectedGraph(bidirectionalGraph, request.getDirection()));
            this.request = request;
            this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null);

            GraphDirection direction = request.getDirection();
            // if direction_reverse is not specified, use the opposite direction of direction
            GraphDirection directionReverse = request.hasDirectionReverse()
                    ? request.getDirectionReverse()
                    : reverseDirection(request.getDirection());
            SwhUnidirectionalGraph srcGraph = getDirectedGraph(bidirectionalGraph, direction);
            SwhUnidirectionalGraph dstGraph = getDirectedGraph(bidirectionalGraph, directionReverse);

            this.allowedEdgesSrc = new AllowedEdges(request.hasEdges() ? request.getEdges() : "*");
            /*
             * If edges_reverse is not specified: - If `edges` is not specified either, defaults to "*" - If
             * direction == direction_reverse, defaults to `edges` - If direction != direction_reverse, defaults
             * to the reverse of `edges` (e.g. "rev:dir" becomes "dir:rev").
             */
            this.allowedEdgesDst = request.hasEdgesReverse()
                    ? new AllowedEdges(request.getEdgesReverse())
                    : (request.hasEdges()
                            ? (direction == directionReverse
                                    ? new AllowedEdges(request.getEdges())
                                    : new AllowedEdges(request.getEdges()).reverse())
                            : new AllowedEdges("*"));

            /*
             * Source sub-visitor. Aborts as soon as it finds a node already visited by the destination
             * sub-visitor.
             */
            this.srcVisitor = new BFSVisitor(srcGraph) {
                @Override
                protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) {
                    return filterLabelledSuccessors(g, nodeId, allowedEdgesSrc);
                }

                @Override
                public void visitNode(long node) {
                    if (dstVisitor.parents.containsKey(node)) {
                        middleNode = node;
                        throw new StopTraversalException();
                    }
                    super.visitNode(node);
                }
            };

            /*
             * Destination sub-visitor. Aborts as soon as it finds a node already visited by the source
             * sub-visitor.
             */
            this.dstVisitor = new BFSVisitor(dstGraph) {
                @Override
                protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) {
                    return filterLabelledSuccessors(g, nodeId, allowedEdgesDst);
                }

                @Override
                public void visitNode(long node) {
                    if (srcVisitor.parents.containsKey(node)) {
                        middleNode = node;
                        throw new StopTraversalException();
                    }
                    super.visitNode(node);
                }
            };
            if (request.hasMaxDepth()) {
                this.srcVisitor.setMaxDepth(request.getMaxDepth());
                this.dstVisitor.setMaxDepth(request.getMaxDepth());
            }
            if (request.hasMaxEdges()) {
                this.srcVisitor.setMaxEdges(request.getMaxEdges());
                this.dstVisitor.setMaxEdges(request.getMaxEdges());
            }
            request.getSrcList().forEach(srcSwhid -> {
                long srcNodeId = g.getNodeId(new SWHID(srcSwhid));
                srcVisitor.addSource(srcNodeId);
            });
            request.getDstList().forEach(srcSwhid -> {
                long srcNodeId = g.getNodeId(new SWHID(srcSwhid));
                dstVisitor.addSource(srcNodeId);
            });
        }

        @Override
        public void visit() {
            /*
             * Bidirectional BFS: maintain two sub-visitors, and alternately run a visit step in each of them.
             */
            srcVisitor.visitSetup();
            dstVisitor.visitSetup();
            while (!srcVisitor.queue.isEmpty() || !dstVisitor.queue.isEmpty()) {
                if (!srcVisitor.queue.isEmpty()) {
                    srcVisitor.visitStep();
                }
                if (!dstVisitor.queue.isEmpty()) {
                    dstVisitor.visitStep();
                }
            }
        }

        /**
         * Reconstruct the shortest path src -> ... -> midpoint -> ... -> dst by backtracking the
         * parent maps of both sub-visitors from the midpoint node.
         */
        public Path getPath() {
            if (middleNode == null) {
                return null; // No path found.
            }
            Path.Builder pathBuilder = Path.newBuilder();
            ArrayList<Long> path = new ArrayList<>();

            /* First section of the path: src -> midpoint */
            long curNode = middleNode;
            while (curNode != -1) {
                path.add(curNode);
                curNode = srcVisitor.parents.get(curNode);
            }
            pathBuilder.setMidpointIndex(path.size() - 1);
            Collections.reverse(path);

            /* Second section of the path: midpoint -> dst */
            curNode = dstVisitor.parents.get(middleNode);
            while (curNode != -1) {
                path.add(curNode);
                curNode = dstVisitor.parents.get(curNode);
            }

            /* Enrich path with node properties */
            for (long nodeId : path) {
                Node.Builder nodeBuilder = Node.newBuilder();
                NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, nodeId);
                pathBuilder.addNode(nodeBuilder.build());
            }
            return pathBuilder.build();
        }
    }

    /** Callback invoked for each node produced by a traversal. */
    public interface NodeObserver {
        void onNext(Node nodeId);
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
index 7b02d76..4f1eda7 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
@@ -1,91 +1,98 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.SwhUnidirectionalGraph;
import org.softwareheritage.graph.labels.DirEntry;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
/**
 * Debugging tool: dump to stdout every node of a labelled graph together with its labelled
 * successors and all loaded node properties (content length, author/committer info, messages,
 * tag names, origin URLs).
 */
public class DumpProperties {
    final static Logger logger = LoggerFactory.getLogger(DumpProperties.class);

    /**
     * Entry point.
     *
     * @param args args[0] = graph base path; optional args[1] = "--mapped"/"-m" to memory-map the
     *            graph instead of loading it fully in RAM
     * @throws IOException if the graph or one of its property files cannot be loaded
     */
    public static void main(String[] args) throws IOException {
        String graphPath = args[0];

        ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
        SwhUnidirectionalGraph graph;
        if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) {
            graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl);
        } else {
            graph = SwhUnidirectionalGraph.loadLabelled(graphPath, pl);
        }
        // Load every property map we intend to print below.
        graph.loadContentLength();
        graph.loadContentIsSkipped();
        graph.loadPersonIds();
        graph.loadAuthorTimestamps();
        graph.loadCommitterTimestamps();
        graph.loadMessages();
        graph.loadTagNames();
        graph.loadLabelNames();

        ArcLabelledNodeIterator it = graph.labelledNodeIterator();
        while (it.hasNext()) {
            long node = it.nextLong();
            System.out.format("%s: %s\n", node, graph.getSWHID(node));

            var s = it.successors();
            System.out.println(" successors:");
            for (long succ; (succ = s.nextLong()) >= 0;) {
                DirEntry[] labels = (DirEntry[]) s.label().get();
                if (labels.length > 0) {
                    // dir->cnt / dir->dir arcs carry one DirEntry per file name under which
                    // the target appears in the directory.
                    for (DirEntry label : labels) {
                        System.out.format(" %s %s [perms: %s]\n", graph.getSWHID(succ),
                                new String(graph.getLabelName(label.filenameId)), label.permission);
                    }
                } else {
                    System.out.format(" %s\n", graph.getSWHID(succ));
                }
            }

            switch (graph.getNodeType(node)) {
                case CNT:
                    System.out.format(" length: %s\n", graph.getContentLength(node));
                    System.out.format(" is_skipped: %s\n", graph.isContentSkipped(node));
                    break;
                case REV:
                    System.out.format(" author: %s\n", graph.getAuthorId(node));
                    System.out.format(" committer: %s\n", graph.getCommitterId(node));
                    System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
                            graph.getAuthorTimestampOffset(node));
                    System.out.format(" committer_date: %s (offset: %s)\n", graph.getCommitterTimestamp(node),
                            graph.getCommitterTimestampOffset(node));
                    byte[] msg = graph.getMessage(node);
                    if (msg != null) {
                        System.out.format(" message: %s\n", (new String(msg)).replace("\n", "\\n"));
                    }
                    break;
                case REL:
                    System.out.format(" author: %s\n", graph.getAuthorId(node));
                    // BUGFIX: the offset placeholder previously received getAuthorTimestamp(node)
                    // a second time instead of the timestamp *offset* (cf. the REV case above).
                    System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
                            graph.getAuthorTimestampOffset(node));
                    byte[] tagMsg = graph.getMessage(node);
                    if (tagMsg != null) {
                        System.out.format(" message: %s\n", (new String(tagMsg)).replace("\n", "\\n"));
                    }
                    byte[] tagName = graph.getTagName(node);
                    if (tagName != null) {
                        // BUGFIX: label previously read " message: " (copy-paste); this prints the tag name.
                        System.out.format(" tag name: %s\n", (new String(tagName)));
                    }
                    break;
                case ORI:
                    System.out.format(" url: %s\n", graph.getUrl(node));
                    break;
            }

            System.out.println();
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
index 0f09ccd..a4e017b 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java
@@ -1,76 +1,83 @@
+/*
+ * Copyright (c) 2021 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import com.google.common.primitives.Longs;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import org.softwareheritage.graph.SwhBidirectionalGraph;
import org.softwareheritage.graph.SWHID;
import org.softwareheritage.graph.experiments.topology.ConnectedComponents;
import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
/**
 * Export the edge list of a graph sub-dataset: for each SWHID read on stdin, run a BFS from that
 * node and print every traversed edge as "src-SWHID dst-SWHID" on stdout. The visited set is
 * shared across all input SWHIDs, so a node explored for one seed is not re-explored for another.
 */
public class ExportSubdataset {
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        System.err.print("Loading everything...");
        // args[0]: graph base path
        String graphPath = args[0];
        SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(graphPath);
        // NOTE(review): mphMap is loaded but never used (the only use is commented out below);
        // presumably kept for the alternative SWHID->id resolution path — confirm before removing.
        Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph");
        System.err.println(" done.");

        final long n = graph.numNodes();

        // Allow enough memory to behave like in-memory queue
        int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);

        // Use a disk based queue to store BFS frontier
        final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue");
        final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
        // Scratch buffer: node ids are serialized as 8 bytes on the disk queue.
        final byte[] byteBuf = new byte[Long.BYTES];
        // WARNING: no 64-bit version of this data-structure, but it can support
        // indices up to 2^37
        LongArrayBitVector visited = LongArrayBitVector.ofLength(n);
        FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
        LineIterator lineIterator = new LineIterator(buffer);

        while (lineIterator.hasNext()) {
            String line = lineIterator.next().toString();
            long i;
            try {
                // i = mphMap.getLong(line.getBytes(StandardCharsets.UTF_8));
                i = graph.getNodeId(new SWHID(line));
            } catch (IllegalArgumentException e) {
                // Invalid or unknown SWHID: skip this input line silently (best-effort export).
                continue;
            }

            queue.enqueue(Longs.toByteArray(i));
            visited.set(i);

            // Standard BFS over the forward graph, frontier kept on the disk queue.
            while (!queue.isEmpty()) {
                queue.dequeue(byteBuf);
                final long currentNode = Longs.fromByteArray(byteBuf);

                SWHID currentNodeSWHID = graph.getSWHID(currentNode);
                final LazyLongIterator iterator = graph.successors(currentNode);
                long succ;
                while ((succ = iterator.nextLong()) != -1) {
                    // Every traversed edge is printed, even if the target was already visited;
                    // only the *expansion* of already-visited targets is skipped.
                    System.out.format("%s %s\n", currentNodeSWHID, graph.getSWHID(succ));
                    if (visited.getBoolean(succ))
                        continue;
                    visited.set(succ);
                    queue.enqueue(Longs.toByteArray(succ));
                }
            }
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java b/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java
index 3623bb0..bf59b6f 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java
@@ -1,113 +1,120 @@
+/*
+ * Copyright (c) 2021 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.*;
import java.io.IOException;
import java.time.Duration;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Stack;
/* sample invocation on granet.internal.softwareheritage.org for benchmarking
* purposes, with the main swh-graph service already running:
*
* $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-0.3.0.jar -Xmx300G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.FindEarliestRevision --timing /dev/shm/swh-graph/default/graph
*
*/
/**
 * For each SWHID read on stdin (typically a content or directory), find the revision with the
 * earliest committer timestamp that contains it, by running a DFS on the transposed graph
 * restricted to cnt->dir, dir->dir and dir->rev edges (i.e., "containment" edges reversed).
 *
 * Output: one line per resolved SWHID, "input-SWHID <TAB> earliest-revision-SWHID".
 */
public class FindEarliestRevision {
    /**
     * Entry point.
     *
     * @param args [-t|--timing] graphPath — with the timing flag first, per-SWHID and cumulative
     *            processing times are reported on stderr
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        String graphPath = args[0];
        boolean timing = false;
        long ts, elapsedNanos;
        Duration elapsed;

        if (args.length >= 2 && (args[0].equals("-t") || args[0].equals("--timing"))) {
            timing = true;
            graphPath = args[1];
            System.err.println("started with timing option, will keep track of elapsed time");
        }

        System.err.println("loading transposed graph...");
        ts = System.nanoTime();
        SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(graphPath).transpose();
        elapsed = Duration.ofNanos(System.nanoTime() - ts);
        System.err.println(String.format("transposed graph loaded (duration: %s).", elapsed));

        System.err.println("loading revision timestamps...");
        ts = System.nanoTime();
        graph.loadCommitterTimestamps();
        elapsed = Duration.ofNanos(System.nanoTime() - ts);
        System.err.println(String.format("revision timestamps loaded (duration: %s).", elapsed));

        Scanner stdin = new Scanner(System.in);
        // Graph is transposed, so these edge types walk *up* the containment hierarchy.
        AllowedEdges edges = new AllowedEdges("cnt:dir,dir:dir,dir:rev");
        String rawSWHID = null;
        SWHID srcSWHID = null;
        long lineCount = 0;
        long srcNodeId = -1;
        if (timing) {
            System.err.println("starting SWHID processing...");
            elapsed = Duration.ZERO;
        }
        while (stdin.hasNextLine()) {
            if (timing)
                ts = System.nanoTime();
            rawSWHID = stdin.nextLine().strip();
            lineCount++;
            try {
                srcSWHID = new SWHID(rawSWHID);
                srcNodeId = graph.getNodeId(srcSWHID);
            } catch (IllegalArgumentException e) {
                System.err
                        .println(String.format("skipping invalid or unknown SWHID %s on line %d", rawSWHID, lineCount));
                continue;
            }

            if (timing)
                System.err.println("starting traversal for: " + srcSWHID.toString());

            // Iterative DFS; FIX: generic type parameters restored (raw Stack/HashSet cannot
            // unbox their elements into long).
            Stack<Long> stack = new Stack<>();
            HashSet<Long> visited = new HashSet<>();
            stack.push(srcNodeId);
            visited.add(srcNodeId);

            long minRevId = -1;
            long minTimestamp = Long.MAX_VALUE;
            while (!stack.isEmpty()) {
                long currentNodeId = stack.pop();
                if (graph.getNodeType(currentNodeId) == Node.Type.REV) {
                    long committerTs = graph.getCommitterTimestamp(currentNodeId);
                    if (committerTs < minTimestamp) {
                        minRevId = currentNodeId;
                        minTimestamp = committerTs;
                    }
                }

                LazyLongIterator it = graph.successors(currentNodeId);
                for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
                    if (!edges.isAllowed(graph.getNodeType(currentNodeId), graph.getNodeType(neighborNodeId))) {
                        continue;
                    }
                    if (!visited.contains(neighborNodeId)) {
                        stack.push(neighborNodeId);
                        visited.add(neighborNodeId);
                    }
                }
            }

            if (minRevId == -1) {
                System.err.println("no revision found containing: " + srcSWHID.toString());
            } else {
                System.out.println(srcSWHID.toString() + "\t" + graph.getSWHID(minRevId).toString());
            }
            if (timing) {
                elapsedNanos = System.nanoTime() - ts; // processing time for current SWHID
                elapsed = elapsed.plus(Duration.ofNanos(elapsedNanos)); // cumulative processing time for all SWHIDs
                System.err.printf("visit time (s):\t%.6f\n", (double) elapsedNanos / 1_000_000_000);
            }
        }
        // FIX: guard against ArithmeticException from dividedBy(0) when stdin was empty.
        if (timing && lineCount > 0)
            System.err.printf("processed %d SWHIDs in %s (%s avg)\n", lineCount, elapsed, elapsed.dividedBy(lineCount));
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java
index d316047..dadaa51 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java
@@ -1,197 +1,204 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import it.unimi.dsi.fastutil.BigArrays;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveAction;
/**
 * Parallel quicksort that sorts two {@code long} big arrays (fastutil segmented
 * {@code long[][]} arrays) simultaneously, using the lexicographic order of their
 * (x, y) element pairs. Adapted from fastutil's fork/join quicksort.
 */
public class ForkJoinBigQuickSort2 extends RecursiveAction {
    private static final long serialVersionUID = 1L;
    /** Start (inclusive) and end (exclusive) of the range sorted by this task. */
    private final long from;
    private final long to;
    /** The two big arrays sorted together: x is the primary key, y breaks ties. */
    private final long[][] x, y;

    /** Ranges shorter than this are sorted with selection sort. */
    private static final int QUICKSORT_NO_REC = 16;
    /** Ranges shorter than this are sorted sequentially instead of forking subtasks. */
    private static final int PARALLEL_QUICKSORT_NO_FORK = 8192;
    /** Ranges longer than this pick the pivot as a pseudo-median of 9 samples. */
    private static final int QUICKSORT_MEDIAN_OF_9 = 128;

    /**
     * Creates a task that sorts x[from..to) and applies the same permutation to y.
     *
     * @param x primary-key big array
     * @param y secondary-key big array, permuted alongside x
     * @param from first element (inclusive)
     * @param to last element (exclusive)
     */
    public ForkJoinBigQuickSort2(final long[][] x, final long[][] y, final long from, final long to) {
        this.from = from;
        this.to = to;
        this.x = x;
        this.y = y;
    }

    @Override
    protected void compute() {
        final long[][] x = this.x;
        final long[][] y = this.y;
        final long len = to - from;
        if (len < PARALLEL_QUICKSORT_NO_FORK) {
            // Too small to be worth forking: sort this range sequentially.
            quickSort(x, y, from, to);
            return;
        }
        // Choose a partition element, v: pseudo-median of 9 sampled elements.
        long m = from + len / 2;
        long l = from;
        long n = to - 1;
        long s = len / 8;
        l = med3(x, y, l, l + s, l + 2 * s);
        m = med3(x, y, m - s, m, m + s);
        n = med3(x, y, n - 2 * s, n - s, n);
        m = med3(x, y, l, m, n);
        final long xm = BigArrays.get(x, m), ym = BigArrays.get(y, m);
        // Establish invariant (Bentley-McIlroy three-way partition): v* (v)* v*
        long a = from, b = a, c = to - 1, d = c;
        while (true) {
            int comparison;
            while (b <= c && (comparison = compare(x, y, b, xm, ym)) <= 0) {
                if (comparison == 0)
                    swap(x, y, a++, b);
                b++;
            }
            while (c >= b && (comparison = compare(x, y, c, xm, ym)) >= 0) {
                if (comparison == 0)
                    swap(x, y, c, d--);
                c--;
            }
            if (b > c)
                break;
            swap(x, y, b++, c--);
        }
        // Swap the pivot-equal runs back to the middle.
        long t;
        s = Math.min(a - from, b - a);
        swap(x, y, from, b - s, s);
        s = Math.min(d - c, to - d - 1);
        swap(x, y, b, to - s, s);
        s = b - a;
        t = d - c;
        // Recursively sort non-partition elements, in parallel subtasks.
        if (s > 1 && t > 1)
            invokeAll(new ForkJoinBigQuickSort2(x, y, from, from + s), new ForkJoinBigQuickSort2(x, y, to - t, to));
        else if (s > 1)
            invokeAll(new ForkJoinBigQuickSort2(x, y, from, from + s));
        else
            invokeAll(new ForkJoinBigQuickSort2(x, y, to - t, to));
    }

    /**
     * Sequentially sorts x[from..to) lexicographically on (x, y), applying the same
     * permutation to y.
     */
    public static void quickSort(final long[][] x, final long[][] y, final long from, final long to) {
        final long len = to - from;
        if (len < QUICKSORT_NO_REC) {
            selectionSort(x, y, from, to);
            return;
        }
        // Choose a partition element, v
        long m = from + len / 2;
        long l = from;
        long n = to - 1;
        if (len > QUICKSORT_MEDIAN_OF_9) { // Big arrays, pseudomedian of 9
            long s = len / 8;
            l = med3(x, y, l, l + s, l + 2 * s);
            m = med3(x, y, m - s, m, m + s);
            n = med3(x, y, n - 2 * s, n - s, n);
        }
        m = med3(x, y, l, m, n); // Mid-size, med of 3
        // Establish Invariant: v* (v)* v*
        long a = from, b = a, c = to - 1, d = c;
        final long xm = BigArrays.get(x, m), ym = BigArrays.get(y, m);
        while (true) {
            long comparison;
            while (b <= c && (comparison = compare(x, y, b, xm, ym)) <= 0) {
                if (comparison == 0)
                    swap(x, y, a++, b);
                b++;
            }
            while (c >= b && (comparison = compare(x, y, c, xm, ym)) >= 0) {
                if (comparison == 0)
                    swap(x, y, c, d--);
                c--;
            }
            if (b > c)
                break;
            swap(x, y, b++, c--);
        }
        // Swap partition elements back to middle
        long s;
        s = Math.min(a - from, b - a);
        swap(x, y, from, b - s, s);
        s = Math.min(d - c, to - d - 1);
        swap(x, y, b, to - s, s);
        // Recursively sort non-partition-elements
        if ((s = b - a) > 1)
            quickSort(x, y, from, from + s);
        if ((s = d - c) > 1)
            quickSort(x, y, to - s, to);
    }

    /** Sequentially sorts the whole of the two big arrays. */
    public static void quickSort(final long[][] x, final long[][] y) {
        // BUGFIX: x.length is the number of *segments* of a fastutil big array, not
        // its element count; BigArrays.length(x) is the actual number of elements.
        quickSort(x, y, 0, BigArrays.length(x));
    }

    /** Compares elements u and v of the big arrays lexicographically on (x, y). */
    private static int compare(final long[][] x, final long[][] y, final long u, final long v) {
        int tx;
        return (tx = Long.compare(BigArrays.get(x, u), BigArrays.get(x, v))) != 0
                ? tx
                : Long.compare(BigArrays.get(y, u), BigArrays.get(y, v));
    }

    /** Compares element i against the cached pivot values (xm, ym). */
    private static int compare(final long[][] x, final long[][] y, final long i, final long xm, final long ym) {
        int tx;
        return (tx = Long.compare(BigArrays.get(x, i), xm)) != 0 ? tx : Long.compare(BigArrays.get(y, i), ym);
    }

    /** Swaps elements a and b in both big arrays. */
    private static void swap(final long[][] x, final long[][] y, final long a, final long b) {
        BigArrays.swap(x, a, b);
        BigArrays.swap(y, a, b);
    }

    /** Swaps the n-element ranges starting at a and at b in both big arrays. */
    private static void swap(final long[][] x, final long[][] y, long a, long b, final long n) {
        for (long i = 0; i < n; i++, a++, b++)
            swap(x, y, a, b);
    }

    /** Returns the index of the median of the three elements a, b and c. */
    private static long med3(final long[][] x, final long[][] y, final long a, final long b, final long c) {
        final int ab = compare(x, y, a, b);
        final int ac = compare(x, y, a, c);
        final int bc = compare(x, y, b, c);
        return (ab < 0 ? (bc < 0 ? b : ac < 0 ? c : a) : (bc > 0 ? b : ac > 0 ? c : a));
    }

    /** Selection sort for tiny ranges; permutes b alongside a. */
    public static void selectionSort(final long[][] a, final long[][] b, final long from, final long to) {
        for (long i = from; i < to - 1; i++) {
            long m = i;
            for (long j = i + 1; j < to; j++)
                if (compare(a, b, j, m) < 0)
                    m = j;
            if (m != i) {
                BigArrays.swap(a, i, m);
                BigArrays.swap(b, i, m);
            }
        }
    }

    /** Selection sort over the whole of the two big arrays. */
    public static void selectionSort(final long[][] x, final long[][] y) {
        // BUGFIX: use the element count, not the segment count (see quickSort(x, y)).
        selectionSort(x, y, 0, BigArrays.length(x));
    }

    /** Returns the enclosing fork/join pool, or the common pool when not in one. */
    public static ForkJoinPool getPool() {
        ForkJoinPool current = ForkJoinTask.getPool();
        return current == null ? ForkJoinPool.commonPool() : current;
    }

    /**
     * Sorts both big arrays in parallel, lexicographically on (x, y).
     *
     * @throws IllegalArgumentException if the two big arrays have different lengths
     */
    public static void parallelQuickSort(final long[][] x, final long[][] y) {
        BigArrays.ensureSameLength(x, y);
        // BUGFIX: use the element count, not the segment count (see quickSort(x, y)).
        parallelQuickSort(x, y, 0, BigArrays.length(x));
    }

    /** Sorts the range [from..to) of both big arrays, forking when large enough. */
    public static void parallelQuickSort(final long[][] x, final long[][] y, final long from, final long to) {
        ForkJoinPool pool = getPool();
        if (to - from < PARALLEL_QUICKSORT_NO_FORK || pool.getParallelism() == 1)
            quickSort(x, y, from, to);
        else {
            pool.invoke(new ForkJoinBigQuickSort2(x, y, from, to));
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java
index f423369..57ae71d 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java
@@ -1,217 +1,224 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveAction;
import static it.unimi.dsi.fastutil.longs.LongArrays.ensureSameLength;
public class ForkJoinQuickSort3 extends RecursiveAction {
private static final long serialVersionUID = 1L;
private final int from;
private final int to;
private final long[] x, y, z;
private static final int QUICKSORT_NO_REC = 16;
private static final int PARALLEL_QUICKSORT_NO_FORK = 8192;
private static final int QUICKSORT_MEDIAN_OF_9 = 128;
public ForkJoinQuickSort3(final long[] x, final long[] y, final long z[], final int from, final int to) {
this.from = from;
this.to = to;
this.x = x;
this.y = y;
this.z = z;
}
@Override
protected void compute() {
final long[] x = this.x;
final long[] y = this.y;
final long[] z = this.z;
final int len = to - from;
if (len < PARALLEL_QUICKSORT_NO_FORK) {
quickSort(x, y, z, from, to);
return;
}
// Choose a partition element, v
int m = from + len / 2;
int l = from;
int n = to - 1;
int s = len / 8;
l = med3(x, y, z, l, l + s, l + 2 * s);
m = med3(x, y, z, m - s, m, m + s);
n = med3(x, y, z, n - 2 * s, n - s, n);
m = med3(x, y, z, l, m, n);
final long xm = x[m], ym = y[m], zm = z[m];
// Establish Invariant: v* (v)* v*
int a = from, b = a, c = to - 1, d = c;
while (true) {
int comparison, t;
while (b <= c && (comparison = compare(x, y, z, b, xm, ym, zm)) <= 0) {
if (comparison == 0)
swap(x, y, z, a++, b);
b++;
}
while (c >= b && (comparison = compare(x, y, z, c, xm, ym, zm)) >= 0) {
if (comparison == 0)
swap(x, y, z, c, d--);
c--;
}
if (b > c)
break;
swap(x, y, z, b++, c--);
}
// Swap partition elements back to middle
int t;
s = Math.min(a - from, b - a);
swap(x, y, z, from, b - s, s);
s = Math.min(d - c, to - d - 1);
swap(x, y, z, b, to - s, s);
s = b - a;
t = d - c;
// Recursively sort non-partition-elements
if (s > 1 && t > 1)
invokeAll(new ForkJoinQuickSort3(x, y, z, from, from + s), new ForkJoinQuickSort3(x, y, z, to - t, to));
else if (s > 1)
invokeAll(new ForkJoinQuickSort3(x, y, z, from, from + s));
else
invokeAll(new ForkJoinQuickSort3(x, y, z, to - t, to));
}
public static void quickSort(final long[] x, final long[] y, final long[] z, final int from, final int to) {
final int len = to - from;
if (len < QUICKSORT_NO_REC) {
selectionSort(x, y, z, from, to);
return;
}
// Choose a partition element, v
int m = from + len / 2;
int l = from;
int n = to - 1;
if (len > QUICKSORT_MEDIAN_OF_9) { // Big arrays, pseudomedian of 9
int s = len / 8;
l = med3(x, y, z, l, l + s, l + 2 * s);
m = med3(x, y, z, m - s, m, m + s);
n = med3(x, y, z, n - 2 * s, n - s, n);
}
m = med3(x, y, z, l, m, n); // Mid-size, med of 3
// Establish Invariant: v* (v)* v*
int a = from, b = a, c = to - 1, d = c;
final long xm = x[m], ym = y[m], zm = z[m];
while (true) {
int comparison;
while (b <= c && (comparison = compare(x, y, z, b, xm, ym, zm)) <= 0) {
if (comparison == 0)
swap(x, y, z, a++, b);
b++;
}
while (c >= b && (comparison = compare(x, y, z, c, xm, ym, zm)) >= 0) {
if (comparison == 0)
swap(x, y, z, c, d--);
c--;
}
if (b > c)
break;
swap(x, y, z, b++, c--);
}
// Swap partition elements back to middle
int s;
s = Math.min(a - from, b - a);
swap(x, y, z, from, b - s, s);
s = Math.min(d - c, to - d - 1);
swap(x, y, z, b, to - s, s);
// Recursively sort non-partition-elements
if ((s = b - a) > 1)
quickSort(x, y, z, from, from + s);
if ((s = d - c) > 1)
quickSort(x, y, z, to - s, to);
}
public static void quickSort(final long[] x, final long[] y, final long[] z) {
quickSort(x, y, z, 0, x.length);
}
private static int compare(final long[] x, final long[] y, final long[] z, final int u, final int v) {
int tx, ty;
return (tx = Long.compare(x[u], x[v])) != 0
? tx
: ((ty = Long.compare(y[u], y[v])) != 0 ? ty : Long.compare(z[u], z[v]));
}
private static int compare(final long[] x, final long[] y, final long[] z, final int i, final long xm,
final long ym, final long zm) {
int tx, ty;
return (tx = Long.compare(x[i], xm)) != 0
? tx
: ((ty = Long.compare(y[i], ym)) != 0 ? ty : Long.compare(z[i], zm));
}
private static void swap(final long[] x, final long[] y, final long[] z, final int a, final int b) {
final long t = x[a];
final long u = y[a];
final long v = z[a];
x[a] = x[b];
y[a] = y[b];
z[a] = z[b];
x[b] = t;
y[b] = u;
z[b] = v;
}
private static void swap(final long[] x, final long[] y, final long[] z, int a, int b, final int n) {
for (int i = 0; i < n; i++, a++, b++)
swap(x, y, z, a, b);
}
private static int med3(final long[] x, final long[] y, final long[] z, final int a, final int b, final int c) {
final int ab = compare(x, y, z, a, b);
final int ac = compare(x, y, z, a, c);
final int bc = compare(x, y, z, b, c);
return (ab < 0 ? (bc < 0 ? b : ac < 0 ? c : a) : (bc > 0 ? b : ac > 0 ? c : a));
}
public static void selectionSort(final long[] a, final long[] b, long[] c, final int from, final int to) {
for (int i = from; i < to - 1; i++) {
int m = i;
for (int j = i + 1; j < to; j++)
if (compare(a, b, c, j, m) < 0)
m = j;
if (m != i) {
long t = a[i];
a[i] = a[m];
a[m] = t;
t = b[i];
b[i] = b[m];
b[m] = t;
t = c[i];
c[i] = c[m];
c[m] = t;
}
}
}
public static void selectionSort(final long[] x, final long[] y, final long[] z) {
selectionSort(x, y, z, 0, x.length);
}
public static ForkJoinPool getPool() {
ForkJoinPool current = ForkJoinTask.getPool();
return current == null ? ForkJoinPool.commonPool() : current;
}
public static void parallelQuickSort(final long[] x, final long[] y, final long[] z) {
ensureSameLength(x, y);
ensureSameLength(x, z);
parallelQuickSort(x, y, z, 0, x.length);
}
public static void parallelQuickSort(final long[] x, final long[] y, final long[] z, final int from, final int to) {
ForkJoinPool pool = getPool();
if (to - from < PARALLEL_QUICKSORT_NO_FORK || pool.getParallelism() == 1)
quickSort(x, y, z, from, to);
else {
pool.invoke(new ForkJoinQuickSort3(x, y, z, from, to));
}
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
index 0d672e2..71d6dab 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java
@@ -1,46 +1,53 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import org.softwareheritage.graph.maps.NodeIdMap;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
/**
 * CLI tool that reads strings on standard input and prints, one per line, the long
 * value that the given serialized minimal perfect hash (MPH) function maps them to.
 */
public class MPHTranslate {
    /**
     * Parses command-line arguments.
     *
     * @param args command-line arguments; one required positional argument: the
     *            filename of the serialized MPH
     * @return the parsed JSAP configuration (never null: exits on failure)
     */
    private static JSAPResult parseArgs(String[] args) {
        JSAPResult config = null;
        try {
            SimpleJSAP jsap = new SimpleJSAP(MPHTranslate.class.getName(), "",
                    new Parameter[]{new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "Filename of the serialized MPH"),});
            config = jsap.parse(args);
            if (jsap.messagePrinted()) {
                System.exit(1);
            }
        } catch (JSAPException e) {
            e.printStackTrace();
            // Previously execution continued with a null config, causing an NPE in
            // main(); exit explicitly instead.
            System.exit(1);
        }
        return config;
    }

    /**
     * Entry point: loads the MPH and translates stdin lines to their hash values.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        JSAPResult config = parseArgs(args);
        String mphPath = config.getString("function");
        // Typed instead of raw: the MPH maps byte sequences to longs.
        Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(mphPath);
        // TODO: wasteful to convert to/from bytes
        FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
        LineIterator lineIterator = new LineIterator(buffer);
        while (lineIterator.hasNext()) {
            String line = lineIterator.next().toString();
            System.out.println(mphMap.getLong(line.getBytes(StandardCharsets.US_ASCII)));
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java
index c760032..7daec23 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java
@@ -1,40 +1,47 @@
+/*
+ * Copyright (c) 2020-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import it.unimi.dsi.big.webgraph.NodeIterator;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.SwhUnidirectionalGraph;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
/**
 * CLI tool that prints every arc of a compressed graph as a "src dst" pair of SWHIDs
 * on standard output.
 *
 * Usage: ReadGraph &lt;graphPath&gt; [--mapped|-m]
 */
public class ReadGraph {
    // Fixed copy-paste bug: the logger was created with ReadLabelledGraph.class,
    // attributing this tool's log output to the wrong class.
    final static Logger logger = LoggerFactory.getLogger(ReadGraph.class);

    public static void main(String[] args) throws IOException {
        String graphPath = args[0];
        SwhUnidirectionalGraph graph;
        ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
        // --mapped/-m memory-maps the graph instead of loading it fully in RAM.
        if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) {
            graph = SwhUnidirectionalGraph.loadMapped(graphPath, pl);
        } else {
            graph = SwhUnidirectionalGraph.load(graphPath, pl);
        }
        pl.expectedUpdates = graph.numArcs();
        pl.start("Reading graph...");
        NodeIterator it = graph.nodeIterator();
        while (it.hasNext()) {
            long srcNode = it.nextLong();
            var s = it.successors();
            long dstNode;
            while ((dstNode = s.nextLong()) >= 0) {
                System.out.format("%s %s\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode));
                pl.lightUpdate();
            }
        }
        // Report final progress statistics (the logger was started but never stopped).
        pl.done();
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
index 3c64bbd..c8e0a9f 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java
@@ -1,48 +1,55 @@
+/*
+ * Copyright (c) 2020-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.SwhUnidirectionalGraph;
import org.softwareheritage.graph.labels.DirEntry;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
/**
 * CLI tool that prints every arc of a labelled compressed graph on standard output:
 * "src dst label permission" for each directory-entry label, or just "src dst" when an
 * arc carries no label.
 *
 * Usage: ReadLabelledGraph &lt;graphPath&gt; [--mapped|-m]
 */
public class ReadLabelledGraph {
    final static Logger logger = LoggerFactory.getLogger(ReadLabelledGraph.class);

    public static void main(String[] args) throws IOException, ClassNotFoundException {
        String graphPath = args[0];
        ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS);

        // --mapped/-m memory-maps the graph instead of loading it fully in RAM.
        boolean mapped = args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"));
        SwhUnidirectionalGraph graph = mapped
                ? SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl)
                : SwhUnidirectionalGraph.loadLabelled(graphPath, pl);
        graph.properties.loadLabelNames();

        ArcLabelledNodeIterator nodeIterator = graph.labelledNodeIterator();
        while (nodeIterator.hasNext()) {
            long src = nodeIterator.nextLong();
            ArcLabelledNodeIterator.LabelledArcIterator arcs = nodeIterator.successors();
            for (long dst; (dst = arcs.nextLong()) >= 0;) {
                DirEntry[] labels = (DirEntry[]) arcs.label().get();
                if (labels.length == 0) {
                    // Unlabelled arc: print the endpoints only.
                    System.out.format("%s %s\n", graph.getSWHID(src), graph.getSWHID(dst));
                } else {
                    // One output line per directory entry carried by this arc.
                    for (DirEntry label : labels) {
                        System.out.format("%s %s %s %d\n", graph.getSWHID(src), graph.getSWHID(dst),
                                new String(graph.properties.getLabelName(label.filenameId)), label.permission);
                    }
                }
            }
        }
    }
}
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/Sort.java b/java/src/main/java/org/softwareheritage/graph/utils/Sort.java
index 2181a53..9a69b94 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/Sort.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/Sort.java
@@ -1,32 +1,39 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.utils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Helper to spawn an external UNIX {@code sort -u} process configured for a
 * deterministic, byte-wise (C locale) collation order.
 */
public class Sort {
    /**
     * Spawns a sort process with no extra options.
     *
     * @param sortBufferSize value passed to sort's --buffer-size option
     * @param sortTmpDir directory for sort's temporary files, or null for its default
     * @return the started process; feed its stdin and read its stdout
     * @throws IOException if the process cannot be started
     */
    public static Process spawnSort(String sortBufferSize, String sortTmpDir) throws IOException {
        return spawnSort(sortBufferSize, sortTmpDir, null);
    }

    /**
     * Spawns a sort process.
     *
     * @param sortBufferSize value passed to sort's --buffer-size option
     * @param sortTmpDir directory for sort's temporary files, or null for its default
     * @param options extra command-line options appended to the sort command, or null
     * @return the started process; feed its stdin and read its stdout
     * @throws IOException if the process cannot be started
     */
    public static Process spawnSort(String sortBufferSize, String sortTmpDir, List<String> options)
            throws IOException {
        ProcessBuilder sortProcessBuilder = new ProcessBuilder();
        // Pass sort's own error messages through to our stderr.
        sortProcessBuilder.redirectError(ProcessBuilder.Redirect.INHERIT);
        ArrayList<String> command = new ArrayList<>(List.of("sort", "-u", "--buffer-size", sortBufferSize));
        if (sortTmpDir != null) {
            command.add("--temporary-directory");
            command.add(sortTmpDir);
        }
        if (options != null) {
            command.addAll(options);
        }
        sortProcessBuilder.command(command);
        // Force the C locale so the byte-wise output order is stable across machines.
        Map<String, String> env = sortProcessBuilder.environment();
        env.put("LC_ALL", "C");
        env.put("LC_COLLATE", "C");
        env.put("LANG", "C");
        return sortProcessBuilder.start();
    }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java b/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java
index f91f6ed..022f2b6 100644
--- a/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java
@@ -1,113 +1,120 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
/**
 * Unit tests for {@link AllowedEdges} edge-restriction parsing.
 */
public class AllowedEdgesTest extends GraphTest {
    /** A (source, destination) node-type pair describing one expected allowed edge. */
    static class EdgeType {
        Node.Type src;
        Node.Type dst;

        public EdgeType(Node.Type src, Node.Type dst) {
            this.src = src;
            this.dst = dst;
        }

        @Override
        public boolean equals(Object otherObj) {
            if (otherObj == this)
                return true;
            if (!(otherObj instanceof EdgeType))
                return false;
            EdgeType other = (EdgeType) otherObj;
            return src == other.src && dst == other.dst;
        }

        @Override
        public int hashCode() {
            // equals() was overridden without hashCode(), breaking the contract.
            return java.util.Objects.hash(src, dst);
        }
    }

    /**
     * Asserts that {@code edges} allows exactly the src->dst pairs in
     * {@code expectedAllowed}, over all node-type combinations.
     */
    void assertEdgeRestriction(AllowedEdges edges, ArrayList<EdgeType> expectedAllowed) {
        Node.Type[] nodeTypes = Node.Type.values();
        for (Node.Type src : nodeTypes) {
            for (Node.Type dst : nodeTypes) {
                EdgeType edge = new EdgeType(src, dst);
                boolean isAllowed = edges.isAllowed(src, dst);
                // contains() relies on EdgeType.equals(); replaces the manual loop.
                boolean isExpected = expectedAllowed.contains(edge);
                Assertions.assertEquals(isAllowed, isExpected, "Edge type: " + src + " -> " + dst);
            }
        }
    }

    @Test
    public void dirToDirDirToCntEdges() {
        AllowedEdges edges = new AllowedEdges("dir:dir,dir:cnt");
        ArrayList<EdgeType> expected = new ArrayList<>();
        expected.add(new EdgeType(Node.Type.DIR, Node.Type.DIR));
        expected.add(new EdgeType(Node.Type.DIR, Node.Type.CNT));
        assertEdgeRestriction(edges, expected);
    }

    @Test
    public void relToRevRevToRevRevToDirEdges() {
        AllowedEdges edges = new AllowedEdges("rel:rev,rev:rev,rev:dir");
        ArrayList<EdgeType> expected = new ArrayList<>();
        expected.add(new EdgeType(Node.Type.REL, Node.Type.REV));
        expected.add(new EdgeType(Node.Type.REV, Node.Type.REV));
        expected.add(new EdgeType(Node.Type.REV, Node.Type.DIR));
        assertEdgeRestriction(edges, expected);
    }

    @Test
    public void revToAllDirToDirEdges() {
        // "rev:*" is a wildcard destination: every edge out of REV is allowed.
        AllowedEdges edges = new AllowedEdges("rev:*,dir:dir");
        ArrayList<EdgeType> expected = new ArrayList<>();
        for (Node.Type dst : Node.Type.values()) {
            expected.add(new EdgeType(Node.Type.REV, dst));
        }
        expected.add(new EdgeType(Node.Type.DIR, Node.Type.DIR));
        assertEdgeRestriction(edges, expected);
    }

    @Test
    public void allToCntEdges() {
        // "*:cnt" is a wildcard source: every edge into CNT is allowed.
        AllowedEdges edges = new AllowedEdges("*:cnt");
        ArrayList<EdgeType> expected = new ArrayList<>();
        for (Node.Type src : Node.Type.values()) {
            expected.add(new EdgeType(src, Node.Type.CNT));
        }
        assertEdgeRestriction(edges, expected);
    }

    @Test
    public void allEdges() {
        AllowedEdges edges = new AllowedEdges("*:*");
        ArrayList<EdgeType> expected = new ArrayList<>();
        for (Node.Type src : Node.Type.values()) {
            for (Node.Type dst : Node.Type.values()) {
                expected.add(new EdgeType(src, dst));
            }
        }
        assertEdgeRestriction(edges, expected);
        // Special null value used to quickly bypass edge check when no restriction
        AllowedEdges edges2 = new AllowedEdges("*");
        Assertions.assertNull(edges2.restrictedTo);
    }

    @Test
    public void noEdges() {
        // Both the empty string and null mean "no edge is allowed".
        AllowedEdges edges = new AllowedEdges("");
        AllowedEdges edges2 = new AllowedEdges(null);
        ArrayList<EdgeType> expected = new ArrayList<>();
        assertEdgeRestriction(edges, expected);
        assertEdgeRestriction(edges2, expected);
    }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java b/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java
index ca6479f..7d66391 100644
--- a/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java
@@ -1,53 +1,60 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Set;
/**
 * Unit tests for {@link AllowedNodes} node-restriction parsing.
 */
public class AllowedNodesTest extends GraphTest {
    /** Asserts that {@code nodes} allows exactly the types in {@code expectedAllowed}. */
    void assertNodeRestriction(AllowedNodes nodes, Set<Node.Type> expectedAllowed) {
        Node.Type[] nodeTypes = Node.Type.values();
        for (Node.Type t : nodeTypes) {
            boolean isAllowed = nodes.isAllowed(t);
            boolean isExpected = expectedAllowed.contains(t);
            Assertions.assertEquals(isAllowed, isExpected, "Node type: " + t);
        }
    }

    @Test
    public void dirCntNodes() {
        AllowedNodes nodes = new AllowedNodes("dir,cnt");
        Set<Node.Type> expected = Set.of(Node.Type.DIR, Node.Type.CNT);
        assertNodeRestriction(nodes, expected);
    }

    @Test
    public void revDirNodes() {
        AllowedNodes nodes = new AllowedNodes("rev,dir");
        Set<Node.Type> expected = Set.of(Node.Type.DIR, Node.Type.REV);
        assertNodeRestriction(nodes, expected);
    }

    @Test
    public void relSnpCntNodes() {
        AllowedNodes nodes = new AllowedNodes("rel,snp,cnt");
        Set<Node.Type> expected = Set.of(Node.Type.REL, Node.Type.SNP, Node.Type.CNT);
        assertNodeRestriction(nodes, expected);
    }

    @Test
    public void allNodes() {
        // "*" allows every node type.
        AllowedNodes nodes = new AllowedNodes("*");
        Set<Node.Type> expected = Set.of(Node.Type.REL, Node.Type.SNP, Node.Type.CNT, Node.Type.DIR, Node.Type.REV,
                Node.Type.ORI);
        assertNodeRestriction(nodes, expected);
    }

    @Test
    public void noNodes() {
        // The empty string allows no node type at all.
        AllowedNodes nodes = new AllowedNodes("");
        Set<Node.Type> expected = Set.of();
        assertNodeRestriction(nodes, expected);
    }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/GraphTest.java b/java/src/test/java/org/softwareheritage/graph/GraphTest.java
index 94df365..872784f 100644
--- a/java/src/test/java/org/softwareheritage/graph/GraphTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/GraphTest.java
@@ -1,60 +1,67 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import com.github.luben.zstd.ZstdInputStream;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.LazyLongIterators;
import org.junit.jupiter.api.BeforeAll;
import static org.junit.Assert.assertEquals;
/**
 * Base class for graph unit tests: loads the example compressed graph once per test
 * class and provides shared assertion and conversion helpers.
 */
public class GraphTest {
    // Shared test fixture, loaded once by setUp().
    static SwhBidirectionalGraph graph;
    final protected String TEST_ORIGIN_ID = "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054";

    @BeforeAll
    public static void setUp() throws IOException {
        graph = SwhBidirectionalGraph.loadLabelled(getGraphPath().toString());
    }

    /** Returns the path of the example compressed graph dataset. */
    public static Path getGraphPath() {
        return Paths.get("..", "swh", "graph", "tests", "dataset", "compressed", "example");
    }

    public static SwhBidirectionalGraph getGraph() {
        return graph;
    }

    /** Builds a SWHID of the given type whose hash is {@code num}, zero-padded. */
    public static SWHID fakeSWHID(String type, int num) {
        return new SWHID(String.format("swh:1:%s:%040d", type, num));
    }

    /**
     * Asserts that the two collections hold the same elements regardless of order
     * (elements are compared after sorting by their string representation).
     */
    public static void assertEqualsAnyOrder(Collection<?> expected, Collection<?> actual) {
        ArrayList<Object> expectedList = new ArrayList<>(expected);
        ArrayList<Object> actualList = new ArrayList<>(actual);
        expectedList.sort(Comparator.comparing(Object::toString));
        actualList.sort(Comparator.comparing(Object::toString));
        assertEquals(expectedList, actualList);
    }

    /** Eagerly materializes a lazy long iterator into a list. */
    public static ArrayList<Long> lazyLongIteratorToList(LazyLongIterator input) {
        ArrayList<Long> inputList = new ArrayList<>();
        Iterator<Long> inputIt = LazyLongIterators.eager(input);
        inputIt.forEachRemaining(inputList::add);
        return inputList;
    }

    /** Reads a zstd-compressed text file and returns its lines. */
    public static String[] readZstFile(Path zstFile) throws IOException {
        // try-with-resources: the stream was previously never closed (fd leak).
        try (ZstdInputStream zis = new ZstdInputStream(new FileInputStream(zstFile.toFile()))) {
            return (new String(zis.readAllBytes())).split("\n");
        }
    }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java b/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java
index e471799..cce1a45 100644
--- a/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java
@@ -1,85 +1,92 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import java.util.*;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link Subgraph}: views of the example graph restricted to a subset
 * of node types.
 */
public class SubgraphTest extends GraphTest {
    @Test
    public void noFilter() {
        // A "*" subgraph must behave exactly like the underlying graph.
        SwhBidirectionalGraph g = getGraph();
        Subgraph sg = new Subgraph(g, new AllowedNodes("*"));
        for (long i = 0; i < g.numNodes(); ++i) {
            Assertions.assertEquals(g.outdegree(i), sg.outdegree(i));
        }
    }

    @Test
    public void missingNode() {
        // Accessing a node whose type was filtered out must fail.
        SwhBidirectionalGraph g = getGraph();
        Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori"));
        SWHID rev1 = fakeSWHID("rev", 18);
        Assertions.assertThrows(IllegalArgumentException.class, () -> {
            sg.outdegree(sg.getNodeId(rev1));
        });
        Assertions.assertThrows(IllegalArgumentException.class, () -> {
            sg.successors(sg.getNodeId(rev1));
        });
    }

    @Test
    public void outdegreeOnlyDirOri() {
        // Outdegrees in the subgraph count only successors of allowed types.
        SwhBidirectionalGraph g = getGraph();
        Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori"));
        SWHID dir1 = fakeSWHID("dir", 17);
        Assertions.assertEquals(2, g.outdegree(g.getNodeId(dir1)));
        Assertions.assertEquals(1, sg.outdegree(sg.getNodeId(dir1)));
        SWHID dir2 = fakeSWHID("dir", 6);
        Assertions.assertEquals(2, g.outdegree(g.getNodeId(dir2)));
        Assertions.assertEquals(0, sg.outdegree(sg.getNodeId(dir2)));
        SWHID ori1 = new SWHID(TEST_ORIGIN_ID);
        Assertions.assertEquals(1, g.outdegree(g.getNodeId(ori1)));
        Assertions.assertEquals(0, sg.outdegree(sg.getNodeId(ori1)));
    }

    @Test
    public void successorsOnlyDirOri() {
        // Successor lists in the subgraph contain only nodes of allowed types.
        SwhBidirectionalGraph g = getGraph();
        Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori"));
        SWHID dir1 = fakeSWHID("dir", 17);
        assertEqualsAnyOrder(Collections.singletonList(sg.getNodeId(fakeSWHID("dir", 16))),
                lazyLongIteratorToList(sg.successors(sg.getNodeId(dir1))));
        SWHID dir2 = fakeSWHID("dir", 6);
        assertEqualsAnyOrder(Collections.emptyList(), lazyLongIteratorToList(sg.successors(sg.getNodeId(dir2))));
        SWHID ori1 = new SWHID(TEST_ORIGIN_ID);
        assertEqualsAnyOrder(Collections.emptyList(), lazyLongIteratorToList(sg.successors(sg.getNodeId(ori1))));
    }

    @Test
    public void nodeIteratorOnlyOriDir() {
        // Node iteration yields exactly the allowed-type nodes of the subgraph.
        SwhBidirectionalGraph g = getGraph();
        Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori"));
        ArrayList<Long> nodeList = new ArrayList<>();
        Iterator<Long> nodeIt = sg.nodeIterator();
        nodeIt.forEachRemaining(nodeList::add);
        assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(new SWHID(TEST_ORIGIN_ID)), sg.getNodeId(fakeSWHID("dir", 2)),
                sg.getNodeId(fakeSWHID("dir", 6)), sg.getNodeId(fakeSWHID("dir", 8)),
                sg.getNodeId(fakeSWHID("dir", 12)), sg.getNodeId(fakeSWHID("dir", 16)),
                sg.getNodeId(fakeSWHID("dir", 17))), nodeList);
        sg = new Subgraph(g, new AllowedNodes("snp,rel"));
        nodeList = new ArrayList<>();
        nodeIt = sg.nodeIterator();
        nodeIt.forEachRemaining(nodeList::add);
        assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(fakeSWHID("snp", 20)), sg.getNodeId(fakeSWHID("rel", 10)),
                sg.getNodeId(fakeSWHID("rel", 19))), nodeList);
    }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java b/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java
index d9713f8..4576aae 100644
--- a/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java
@@ -1,106 +1,113 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import org.apache.commons.codec.digest.DigestUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.softwareheritage.graph.GraphTest;
import org.softwareheritage.graph.Node;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.TreeSet;
/**
 * Tests for {@code ExtractNodes}: node/edge/label counting, per-type stats files, and
 * the sorted, deduplicated node and label output files.
 */
public class ExtractNodesTest extends GraphTest {
    /**
     * Generate a fake SWHID for a given node type and numeric ID.
     *
     * @param type node type tag (e.g. "cnt", "dir", "rev")
     * @param id numeric discriminator, hashed together with the type
     * @return the SWHID rendered as bytes: {@code swh:1:<type>:<sha1hex(type + id)>}
     */
    private static byte[] f(String type, int id) {
        // sha1Hex() already returns a hex String; no need for a bytes round-trip.
        String hash = DigestUtils.sha1Hex(type + id);
        return String.format("swh:1:%s:%s", type, hash).getBytes();
    }

    /** Synthetic dataset: 4 declared nodes per type, plus a fixed edge list. */
    static class FakeDataset implements GraphDataset {
        @Override
        public void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException {
            // For each node type, write nodes {1..4} as present in the graph
            for (Node.Type type : Node.Type.values()) {
                for (int i = 1; i <= 4; i++) {
                    byte[] node = f(type.toString().toLowerCase(), i);
                    nodeCb.onNode(node);
                }
            }

            // snp 404 and rev 404 are referenced by edges below but never declared
            // above: ExtractNodes must still count them as nodes.
            edgeCb.onEdge(f("ori", 1), f("snp", 1), null, -1);
            edgeCb.onEdge(f("ori", 2), f("snp", 2), null, -1);
            edgeCb.onEdge(f("ori", 3), f("snp", 3), null, -1);
            edgeCb.onEdge(f("ori", 4), f("snp", 404), null, -1);

            // Two snp->rev edges with distinct labels on the same endpoints
            edgeCb.onEdge(f("snp", 1), f("rev", 1), "dup1".getBytes(), -1);
            edgeCb.onEdge(f("snp", 1), f("rev", 1), "dup2".getBytes(), -1);
            edgeCb.onEdge(f("snp", 3), f("cnt", 1), "c1".getBytes(), -1);
            edgeCb.onEdge(f("snp", 4), f("rel", 1), "r1".getBytes(), -1);

            edgeCb.onEdge(f("rel", 1), f("rel", 2), null, -1);
            edgeCb.onEdge(f("rel", 2), f("rev", 1), null, -1);
            edgeCb.onEdge(f("rel", 3), f("rev", 2), null, -1);
            edgeCb.onEdge(f("rel", 4), f("dir", 1), null, -1);

            // rev 1 -> rev 1 appears twice (self-loop duplicated on purpose)
            edgeCb.onEdge(f("rev", 1), f("rev", 1), null, -1);
            edgeCb.onEdge(f("rev", 1), f("rev", 1), null, -1);
            edgeCb.onEdge(f("rev", 1), f("rev", 2), null, -1);
            edgeCb.onEdge(f("rev", 2), f("rev", 404), null, -1);
            edgeCb.onEdge(f("rev", 3), f("rev", 2), null, -1);
            edgeCb.onEdge(f("rev", 4), f("dir", 1), null, -1);

            edgeCb.onEdge(f("dir", 1), f("cnt", 1), "c1".getBytes(), 42);
            edgeCb.onEdge(f("dir", 1), f("dir", 1), "d1".getBytes(), 1337);
            edgeCb.onEdge(f("dir", 1), f("rev", 1), "r1".getBytes(), 0);
        }
    }

    @Test
    public void testExtractNodes(@TempDir Path outputDir, @TempDir Path sortTmpDir)
            throws IOException, InterruptedException {
        FakeDataset dataset = new FakeDataset();
        ExtractNodes.extractNodes(dataset, outputDir.toString() + "/graph", "2M", sortTmpDir.toFile());

        // Check count files (primitive longs: parseLong already returns long, no boxing)
        long nodeCount = Long.parseLong(Files.readString(outputDir.resolve("graph.nodes.count.txt")).strip());
        long edgeCount = Long.parseLong(Files.readString(outputDir.resolve("graph.edges.count.txt")).strip());
        long labelCount = Long.parseLong(Files.readString(outputDir.resolve("graph.labels.count.txt")).strip());
        // 6 node types x 4 declared nodes + the 2 dangling targets (snp 404, rev 404)
        Assertions.assertEquals(26L, nodeCount);
        Assertions.assertEquals(21L, edgeCount);
        Assertions.assertEquals(5L, labelCount);

        // Check stat files (JUnit convention: expected value comes first)
        List<String> nodeStats = Files.readAllLines(outputDir.resolve("graph.nodes.stats.txt"));
        List<String> edgeStats = Files.readAllLines(outputDir.resolve("graph.edges.stats.txt"));
        Assertions.assertEquals(List.of("cnt 4", "dir 4", "ori 4", "rel 4", "rev 5", "snp 5"), nodeStats);
        Assertions.assertEquals(
                List.of("dir:cnt 1", "dir:dir 1", "dir:rev 1", "ori:snp 4", "rel:dir 1", "rel:rel 1",
                        "rel:rev 2", "rev:dir 1", "rev:rev 5", "snp:cnt 1", "snp:rel 1", "snp:rev 2"),
                edgeStats);

        // Build ordered set of expected node IDs (TreeSet: the .csv.zst output is sorted)
        TreeSet<String> expectedNodes = new TreeSet<>();
        for (Node.Type type : Node.Type.values()) {
            for (int i = 1; i <= 4; i++) {
                byte[] node = f(type.toString().toLowerCase(), i);
                expectedNodes.add(new String(node));
            }
        }
        expectedNodes.add(new String(f("snp", 404)));
        expectedNodes.add(new String(f("rev", 404)));
        String[] nodeLines = readZstFile(outputDir.resolve("graph.nodes.csv.zst"));
        Assertions.assertArrayEquals(expectedNodes.toArray(new String[0]), nodeLines);

        // Build ordered set of expected labels (deduplicated and sorted, like the output)
        TreeSet<String> expectedLabels = new TreeSet<>();
        expectedLabels.add("dup1");
        expectedLabels.add("dup2");
        expectedLabels.add("c1");
        expectedLabels.add("r1");
        expectedLabels.add("d1");
        String[] labelLines = readZstFile(outputDir.resolve("graph.labels.csv.zst"));
        Assertions.assertArrayEquals(expectedLabels.toArray(new String[0]), labelLines);
    }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java b/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java
index 9089d0d..142d849 100644
--- a/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java
@@ -1,76 +1,83 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.compress;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.softwareheritage.graph.GraphTest;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
/**
 * Tests for {@code ExtractPersons}: author/committer names read from revision and
 * release tables must be deduplicated, counted and written out sorted.
 */
public class ExtractPersonsTest extends GraphTest {
    /** ORC dataset stub serving hard-coded author/committer columns. */
    private static class FakeORCDataset extends ORCGraphDataset {
        private static class FakeSwhOrcTable extends ORCGraphDataset.SwhOrcTable {
            private final String tableName;

            public FakeSwhOrcTable(String tableName) {
                this.tableName = tableName;
            }

            @Override
            public void readBytes64Column(String longColumn, BytesCallback cb) throws IOException {
                if (tableName.equals("revision") && longColumn.equals("author")) {
                    cb.onBytes(fakeSWHID("rev", 1).toBytes(), "rev_author_1".getBytes());
                    cb.onBytes(fakeSWHID("rev", 2).toBytes(), "rev_author_1".getBytes());
                    cb.onBytes(fakeSWHID("rev", 3).toBytes(), "rev_author_2".getBytes());
                    cb.onBytes(fakeSWHID("rev", 4).toBytes(), "rev_author_1".getBytes());
                    cb.onBytes(fakeSWHID("rev", 5).toBytes(), "rev_author_3".getBytes());
                } else if (tableName.equals("revision") && longColumn.equals("committer")) {
                    // Some committers reuse author names: they must not be double-counted
                    cb.onBytes(fakeSWHID("rev", 1).toBytes(), "rev_committer_1".getBytes());
                    cb.onBytes(fakeSWHID("rev", 2).toBytes(), "rev_committer_1".getBytes());
                    cb.onBytes(fakeSWHID("rev", 3).toBytes(), "rev_committer_2".getBytes());
                    cb.onBytes(fakeSWHID("rev", 4).toBytes(), "rev_author_2".getBytes());
                    cb.onBytes(fakeSWHID("rev", 5).toBytes(), "rev_author_1".getBytes());
                    cb.onBytes(fakeSWHID("rev", 6).toBytes(), "rev_committer_1".getBytes());
                } else if (tableName.equals("release") && longColumn.equals("author")) {
                    cb.onBytes(fakeSWHID("rel", 1).toBytes(), "rel_committer_1".getBytes());
                    cb.onBytes(fakeSWHID("rel", 2).toBytes(), "rel_committer_1".getBytes());
                    cb.onBytes(fakeSWHID("rel", 3).toBytes(), "rel_committer_2".getBytes());
                    cb.onBytes(fakeSWHID("rel", 4).toBytes(), "rev_author_2".getBytes());
                    cb.onBytes(fakeSWHID("rel", 5).toBytes(), "rev_author_1".getBytes());
                    cb.onBytes(fakeSWHID("rel", 6).toBytes(), "rev_committer_1".getBytes());
                    cb.onBytes(fakeSWHID("rel", 7).toBytes(), "rel_committer_1".getBytes());
                } else {
                    throw new RuntimeException("Unknown table/column: " + tableName + "/" + longColumn);
                }
            }
        }

        // NOTE(review): assumed to override ORCGraphDataset.getTable(String) — confirm
        @Override
        public SwhOrcTable getTable(String tableName) {
            return new FakeSwhOrcTable(tableName);
        }
    }

    @Test
    public void testExtractPersons(@TempDir Path outputDir, @TempDir Path sortTmpDir)
            throws IOException, InterruptedException {
        FakeORCDataset fakeORCDataset = new FakeORCDataset();
        ExtractPersons.extractPersons(fakeORCDataset, outputDir.toString() + "/graph", "2M", sortTmpDir.toString());

        // Distinct person names across all columns above (shared names counted once)
        ArrayList<String> expectedPersons = new ArrayList<>(Arrays.asList("rev_author_1", "rev_author_2",
                "rev_author_3", "rev_committer_1", "rev_committer_2", "rel_committer_1", "rel_committer_2"));

        // Check count file (primitive long: parseLong already returns long, no boxing)
        long personsCount = Long.parseLong(Files.readString(outputDir.resolve("graph.persons.count.txt")).strip());
        Assertions.assertEquals(expectedPersons.size(), personsCount);

        // Check persons file: the .csv.zst output is sorted
        expectedPersons.sort(String::compareTo);
        String[] personLines = readZstFile(outputDir.resolve("graph.persons.csv.zst"));
        Assertions.assertArrayEquals(expectedPersons.toArray(new String[0]), personLines);
    }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java
index be76492..218a79c 100644
--- a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java
@@ -1,203 +1,210 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.rpc;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.softwareheritage.graph.SWHID;
import java.util.ArrayList;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
/** End-to-end tests for the FindPathBetween traversal RPC. */
public class FindPathBetweenTest extends TraversalServiceTest {
    /** Request builder pre-filled with a single source and a single destination SWHID. */
    private FindPathBetweenRequest.Builder getRequestBuilder(SWHID src, SWHID dst) {
        return FindPathBetweenRequest.newBuilder().addSrc(src.toString()).addDst(dst.toString());
    }

    /** Unknown or malformed SWHIDs (src or dst) must be rejected with INVALID_ARGUMENT. */
    @Test
    public void testSwhidErrors() {
        StatusRuntimeException thrown;
        // SWHID not present in the graph
        thrown = assertThrows(StatusRuntimeException.class, () -> client
                .findPathBetween(FindPathBetweenRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build()));
        assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
        // Invalid node type
        thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest
                .newBuilder().addSrc("swh:1:lol:0000000000000000000000000000000000000001").build()));
        assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
        // Invalid hash character in src
        thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest
                .newBuilder().addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build()));
        assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
        // Invalid hash character in dst
        thrown = assertThrows(StatusRuntimeException.class,
                () -> client.findPathBetween(FindPathBetweenRequest.newBuilder().addSrc(TEST_ORIGIN_ID)
                        .addDst("swh:1:cnt:000000000000000000000000000000000000000z").build()));
        assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
    }

    /** An unparsable edge restriction string must be rejected with INVALID_ARGUMENT. */
    @Test
    public void testEdgeErrors() {
        StatusRuntimeException thrown;
        thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest
                .newBuilder().addSrc(TEST_ORIGIN_ID).addDst(TEST_ORIGIN_ID).setEdges("batracien:reptile").build()));
        assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
    }

    // Test path between ori 1 and cnt 4 (forward graph)
    @Test
    public void forwardRootToLeaf() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.findPathBetween(getRequestBuilder(new SWHID(TEST_ORIGIN_ID), fakeSWHID("cnt", 4)).build()));
        List<SWHID> expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("snp", 20), fakeSWHID("rev", 9),
                fakeSWHID("dir", 8), fakeSWHID("dir", 6), fakeSWHID("cnt", 4));
        Assertions.assertEquals(expected, actual);
    }

    // Test path between rev 18 and rev 3 (forward graph)
    @Test
    public void forwardRevToRev() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 18), fakeSWHID("rev", 3)).build()));
        List<SWHID> expected = List.of(fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("rev", 9),
                fakeSWHID("rev", 3));
        Assertions.assertEquals(expected, actual);
    }

    // Test path between rev 3 and rev 18 (backward graph)
    @Test
    public void backwardRevToRev() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 3), fakeSWHID("rev", 18))
                        .setDirection(GraphDirection.BACKWARD).build()));
        List<SWHID> expected = List.of(fakeSWHID("rev", 3), fakeSWHID("rev", 9), fakeSWHID("rev", 13),
                fakeSWHID("rev", 18));
        Assertions.assertEquals(expected, actual);
    }

    // Test path between cnt 4 and itself (forward graph)
    @Test
    public void forwardCntToItself() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("cnt", 4)).build()));
        List<SWHID> expected = List.of(fakeSWHID("cnt", 4));
        Assertions.assertEquals(expected, actual);
    }

    // Start from ori and rel 19 and find cnt 14 or cnt 7 (forward graph)
    @Test
    public void forwardMultipleSourcesDest() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 14))
                        .addSrc(TEST_ORIGIN_ID).addDst(fakeSWHID("cnt", 7).toString()).build()));
        List<SWHID> expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17),
                fakeSWHID("cnt", 14));
        // Fix: the expected path was computed but never checked against the response.
        Assertions.assertEquals(expected, actual);
    }

    // Start from cnt 4 and cnt 11 and find rev 13 or rev 9 (backward graph)
    @Test
    public void backwardMultipleSourcesDest() {
        ArrayList<SWHID> actual = getSWHIDs(client.findPathBetween(
                getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("rev", 13)).setDirection(GraphDirection.BACKWARD)
                        .addSrc(fakeSWHID("cnt", 11).toString()).addDst(fakeSWHID("rev", 9).toString()).build()));
        List<SWHID> expected = List.of(fakeSWHID("cnt", 11), fakeSWHID("dir", 12), fakeSWHID("rev", 13));
        Assertions.assertEquals(expected, actual);
    }

    // Start from all directories and find the origin (backward graph)
    @Test
    public void backwardMultipleSourcesAllDirToOri() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.findPathBetween(getRequestBuilder(fakeSWHID("dir", 2), new SWHID(TEST_ORIGIN_ID))
                        .addSrc(fakeSWHID("dir", 6).toString()).addSrc(fakeSWHID("dir", 8).toString())
                        .addSrc(fakeSWHID("dir", 12).toString()).addSrc(fakeSWHID("dir", 16).toString())
                        .addSrc(fakeSWHID("dir", 17).toString()).setDirection(GraphDirection.BACKWARD).build()));
        List<SWHID> expected = List.of(fakeSWHID("dir", 8), fakeSWHID("rev", 9), fakeSWHID("snp", 20),
                new SWHID(TEST_ORIGIN_ID));
        Assertions.assertEquals(expected, actual);
    }

    // Start from cnt 4 and find any rev (backward graph)
    @Test
    public void backwardCntToAnyRev() {
        ArrayList<SWHID> actual = getSWHIDs(
                client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("rev", 3))
                        .addDst(fakeSWHID("rev", 9).toString()).addDst(fakeSWHID("rev", 13).toString())
                        .addDst(fakeSWHID("rev", 18).toString()).setDirection(GraphDirection.BACKWARD).build()));
        List<SWHID> expected = List.of(fakeSWHID("cnt", 4), fakeSWHID("dir", 6), fakeSWHID("dir", 8),
                fakeSWHID("rev", 9));
        Assertions.assertEquals(expected, actual);
    }

    // Impossible path between rev 9 and cnt 14
    @Test
    public void forwardImpossiblePath() {
        StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> {
            client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 9), fakeSWHID("cnt", 14)).build());
        });
        // JUnit convention: expected value first, actual second
        Assertions.assertEquals(Status.NOT_FOUND.getCode(), thrown.getStatus().getCode());

        // Reverse direction
        thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> {
            client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 14), fakeSWHID("rev", 9))
                    .setDirection(GraphDirection.BACKWARD).build());
        });
        Assertions.assertEquals(Status.NOT_FOUND.getCode(), thrown.getStatus().getCode());
    }

    // Common ancestor between cnt 4 and cnt 15 : rev 18
    @Test
    public void commonAncestorBackwardBackward() {
        Path p = client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("cnt", 15))
                .setDirection(GraphDirection.BACKWARD).setDirectionReverse(GraphDirection.BACKWARD).build());
        ArrayList<SWHID> actual = getSWHIDs(p);
        SWHID expected = fakeSWHID("rev", 18);
        // Only the meeting point of the two searches is pinned, not the full path
        Assertions.assertEquals(expected, actual.get(p.getMidpointIndex()));
    }

    // Common descendant between rev 13 and rev 3 : cnt 1 (with rev:dir,dir:dir,dir:cnt)
    @Test
    public void commonDescendantForwardForward() {
        Path p = client.findPathBetween(
                getRequestBuilder(fakeSWHID("rev", 13), fakeSWHID("rev", 3)).setDirection(GraphDirection.FORWARD)
                        .setDirectionReverse(GraphDirection.FORWARD).setEdges("rev:dir,dir:dir,dir:cnt").build());
        ArrayList<SWHID> actual = getSWHIDs(p);
        SWHID expected = fakeSWHID("cnt", 1);
        Assertions.assertEquals(expected, actual.get(p.getMidpointIndex()));
    }

    // Path between rel 19 and cnt 15 with various max depths
    @Test
    public void maxDepth() {
        // Works with max_depth = 2
        ArrayList<SWHID> actual = getSWHIDs(client
                .findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxDepth(2).build()));
        List<SWHID> expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17),
                fakeSWHID("dir", 16), fakeSWHID("cnt", 15));
        Assertions.assertEquals(expected, actual);

        // Check that it throws NOT_FOUND with max depth = 1
        StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> {
            client.findPathBetween(
                    getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxDepth(1).build());
        });
        Assertions.assertEquals(Status.NOT_FOUND.getCode(), thrown.getStatus().getCode());
    }

    // Path between rel 19 and cnt 15 with various max edges
    @Test
    public void maxEdges() {
        // Works with max_edges = 3
        ArrayList<SWHID> actual = getSWHIDs(client
                .findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxEdges(3).build()));
        List<SWHID> expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17),
                fakeSWHID("dir", 16), fakeSWHID("cnt", 15));
        Assertions.assertEquals(expected, actual);

        // Check that it throws NOT_FOUND with max_edges = 2
        StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> {
            client.findPathBetween(
                    getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxEdges(2).build());
        });
        Assertions.assertEquals(Status.NOT_FOUND.getCode(), thrown.getStatus().getCode());
    }
}
diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java
index ebec7fc..54d358f 100644
--- a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java
@@ -1,162 +1,169 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.rpc;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.softwareheritage.graph.SWHID;
import java.util.ArrayList;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
public class FindPathToTest extends TraversalServiceTest {
/** Request builder pre-filled with one source SWHID and a node-type target filter. */
private FindPathToRequest.Builder getRequestBuilder(SWHID src, String allowedNodes) {
    NodeFilter target = NodeFilter.newBuilder().setTypes(allowedNodes).build();
    return FindPathToRequest.newBuilder().addSrc(src.toString()).setTarget(target);
}
/** Unknown or malformed source SWHIDs must be rejected with INVALID_ARGUMENT. */
@Test
public void testSrcErrors() {
    // Source SWHID not present in the graph
    StatusRuntimeException unknownSrc = assertThrows(StatusRuntimeException.class, () -> client
            .findPathTo(FindPathToRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build()));
    assertEquals(Status.INVALID_ARGUMENT.getCode(), unknownSrc.getStatus().getCode());
    // Invalid node type in the SWHID
    StatusRuntimeException badType = assertThrows(StatusRuntimeException.class, () -> client.findPathTo(
            FindPathToRequest.newBuilder().addSrc("swh:1:lol:0000000000000000000000000000000000000001").build()));
    assertEquals(Status.INVALID_ARGUMENT.getCode(), badType.getStatus().getCode());
    // Invalid hash character in the SWHID
    StatusRuntimeException badHash = assertThrows(StatusRuntimeException.class, () -> client.findPathTo(
            FindPathToRequest.newBuilder().addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build()));
    assertEquals(Status.INVALID_ARGUMENT.getCode(), badHash.getStatus().getCode());
}
/** An unparsable edge restriction string must be rejected with INVALID_ARGUMENT. */
@Test
public void testEdgeErrors() {
    FindPathToRequest request = FindPathToRequest.newBuilder().addSrc(TEST_ORIGIN_ID)
            .setEdges("batracien:reptile").build();
    StatusRuntimeException thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo(request));
    assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
}
/** An invalid node-type list in the target filter must be rejected with INVALID_ARGUMENT. */
@Test
public void testTargetErrors() {
    FindPathToRequest request = FindPathToRequest.newBuilder().addSrc(TEST_ORIGIN_ID)
            .setTarget(NodeFilter.newBuilder().setTypes("argoumante,eglomatique").build()).build();
    StatusRuntimeException thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo(request));
    assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
}
// Test path between ori 1 and any dir (forward graph)
@Test
public void forwardOriToFirstDir() {
    // Typed collections instead of raw types: getSWHIDs yields SWHIDs.
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathTo(getRequestBuilder(new SWHID(TEST_ORIGIN_ID), "dir").build()));
    List<SWHID> expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("snp", 20), fakeSWHID("rev", 9),
            fakeSWHID("dir", 8));
    Assertions.assertEquals(expected, actual);
}
// Test path between rel 19 and any cnt (forward graph)
@Test
public void forwardRelToFirstCnt() {
    // Typed collections instead of raw types: getSWHIDs yields SWHIDs.
    ArrayList<SWHID> actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("rel", 19), "cnt").build()));
    List<SWHID> expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17),
            fakeSWHID("cnt", 14));
    Assertions.assertEquals(expected, actual);
}
// Test path between dir 16 and any rel (backward graph)
@Test
public void backwardDirToFirstRel() {
    // Typed collections instead of raw types: getSWHIDs yields SWHIDs.
    ArrayList<SWHID> actual = getSWHIDs(client.findPathTo(
            getRequestBuilder(fakeSWHID("dir", 16), "rel").setDirection(GraphDirection.BACKWARD).build()));
    List<SWHID> expected = List.of(fakeSWHID("dir", 16), fakeSWHID("dir", 17), fakeSWHID("rev", 18),
            fakeSWHID("rel", 19));
    Assertions.assertEquals(expected, actual);
}
// Test path between cnt 4 and itself (forward graph)
@Test
public void forwardCntToItself() {
    // Typed collections instead of raw types: getSWHIDs yields SWHIDs.
    ArrayList<SWHID> actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 4), "cnt").build()));
    List<SWHID> expected = List.of(fakeSWHID("cnt", 4));
    Assertions.assertEquals(expected, actual);
}
// Start from ori and rel 19 and find any cnt (forward graph)
@Test
public void forwardMultipleSources() {
    ArrayList<SWHID> actual = getSWHIDs(
            client.findPathTo(getRequestBuilder(fakeSWHID("rel", 19), "cnt").addSrc(TEST_ORIGIN_ID).build()));
    List<SWHID> expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17),
            fakeSWHID("cnt", 14));
    // Fix: the expected path was computed but never checked against the response.
    Assertions.assertEquals(expected, actual);
}
// Start from cnt 4 and cnt 11 and find any rev (backward graph)
@Test
public void backwardMultipleSources() {
ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 4), "rev")
.addSrc(fakeSWHID("cnt", 11).toString()).setDirection(GraphDirection.BACKWARD).build()));
List