diff --git a/java/pom.xml b/java/pom.xml index 3bdb6fa..5513168 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -1,361 +1,361 @@ 4.0.0 org.softwareheritage.graph swh-graph ${git.closest.tag.name} swh-graph https://forge.softwareheritage.org/source/swh-graph/ UTF-8 11 ch.qos.logback logback-classic 1.2.3 org.junit.jupiter junit-jupiter-api 5.7.0 test org.junit.vintage junit-vintage-engine 5.7.0 junit junit 4.12 org.junit.jupiter junit-jupiter-engine 5.7.0 test org.hamcrest hamcrest 2.2 test io.javalin javalin 3.0.0 org.slf4j slf4j-simple 1.7.26 com.fasterxml.jackson.core jackson-databind 2.13.0 it.unimi.dsi webgraph-big - 3.6.6 + 3.6.7 it.unimi.dsi fastutil - 8.5.6 + 8.5.8 it.unimi.dsi dsiutils - 2.6.17 + 2.7.0 it.unimi.dsi sux4j - 5.2.3 + 5.3.0 it.unimi.dsi law 2.7.2 org.apache.hadoop hadoop-common org.umlgraph umlgraph org.eclipse.jetty.aggregate jetty-all it.unimi.di mg4j it.unimi.di mg4j-big com.martiansoftware jsap 2.1 net.sf.py4j py4j 0.10.9.3 commons-codec commons-codec 1.15 com.github.luben zstd-jni 1.5.1-1 org.apache.orc orc-core 1.7.1 org.apache.hadoop hadoop-common 3.3.1 org.apache.hadoop hadoop-client-runtime 3.3.1 maven-clean-plugin 3.1.0 maven-resources-plugin 3.0.2 maven-compiler-plugin 3.8.0 11 11 -verbose -Xlint:all maven-surefire-plugin 2.22.2 maven-failsafe-plugin 2.22.2 maven-jar-plugin 3.0.2 maven-install-plugin 2.5.2 maven-deploy-plugin 2.8.2 maven-site-plugin 3.7.1 maven-project-info-reports-plugin 3.0.0 maven-assembly-plugin 3.3.0 org.softwareheritage.graph.server.App jar-with-dependencies false make-assembly package single com.diffplug.spotless spotless-maven-plugin 2.4.1 *.md .gitignore true 4 4.16.0 .coding-style.xml pl.project13.maven git-commit-id-plugin 3.0.1 get-the-git-infos revision initialize true true true true v* git.closest.tag.name ^v true maven-source-plugin 2.1.1 bundle-sources package jar-no-fork test-jar-no-fork org.apache.maven.plugins maven-javadoc-plugin 3.3.1 resource-bundles package resource-bundle test-resource-bundle false javadoc-jar package jar true it.unimi.dsi:webgraph-big:* https://webgraph.di.unimi.it/docs-big/ https://dsiutils.di.unimi.it/docs/ https://fastutil.di.unimi.it/docs/ https://law.di.unimi.it/software/law-docs/ implSpec a Implementation Requirements: implNote a Implementation Note: diff --git a/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java b/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java deleted file mode 100644 index 815b426..0000000 --- a/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java +++ /dev/null @@ -1,128 +0,0 @@ -package org.softwareheritage.graph; - -import it.unimi.dsi.big.webgraph.ImmutableGraph; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.big.webgraph.Transform; -import it.unimi.dsi.fastutil.longs.LongIterator; - -/** - * A directed immutable graph which can be iterated in both directions (forward and backward). It - * exposes the backward equivalents of the ImmutableGraph primitives (indegree() and - * predecessors()). This is implemented by passing two graphs, one in the forward and one in the - * backward direction. - */ -public class BidirectionalImmutableGraph extends ImmutableGraph { - private final ImmutableGraph forwardGraph; - private final ImmutableGraph backwardGraph; - - /** - * Creates a bidirectional immutable graph - * - * @param forwardGraph The graph in the forward direction - * @param backwardGraph The graph in the backward direction - */ - public BidirectionalImmutableGraph(ImmutableGraph forwardGraph, ImmutableGraph backwardGraph) { - this.forwardGraph = forwardGraph; - this.backwardGraph = backwardGraph; - } - - @Override - public long numNodes() { - assert forwardGraph.numNodes() == backwardGraph.numNodes(); - return this.forwardGraph.numNodes(); - } - - @Override - public long numArcs() { - assert forwardGraph.numArcs() == backwardGraph.numArcs(); - return this.forwardGraph.numArcs(); - } - - @Override - public boolean randomAccess() { - return this.forwardGraph.randomAccess() && this.backwardGraph.randomAccess(); - } - - @Override - public boolean hasCopiableIterators() { - return forwardGraph.hasCopiableIterators() && backwardGraph.hasCopiableIterators(); - } - - @Override - public BidirectionalImmutableGraph copy() { - return new BidirectionalImmutableGraph(this.forwardGraph.copy(), this.backwardGraph.copy()); - } - - /** - * Returns the transposed version of the bidirectional graph. Successors become predecessors, and - * vice-versa. - */ - public BidirectionalImmutableGraph transpose() { - return new BidirectionalImmutableGraph(backwardGraph, forwardGraph); - } - - /** - * Returns the symmetric version of the bidirectional graph. It returns the (lazy) union of the - * forward graph and the backward graph. This is equivalent to removing the directionality of the - * edges: the successors of a node are also its predecessors. - * - * @return a symmetric, undirected BidirectionalImmutableGraph. - */ - public BidirectionalImmutableGraph symmetrize() { - ImmutableGraph symmetric = Transform.union(forwardGraph, backwardGraph); - return new BidirectionalImmutableGraph(symmetric, symmetric); - } - - /** - * Returns the simplified version of the bidirectional graph. Works like symmetrize(), but also - * removes the loop edges. - * - * @return a simplified (loopless and symmetric) BidirectionalImmutableGraph - */ - public BidirectionalImmutableGraph simplify() { - ImmutableGraph simplified = Transform.simplify(forwardGraph, backwardGraph); - return new BidirectionalImmutableGraph(simplified, simplified); - } - - /** Returns the outdegree of a node */ - @Override - public long outdegree(long l) { - return forwardGraph.outdegree(l); - } - - /** Returns the indegree of a node */ - public long indegree(long l) { - return backwardGraph.outdegree(l); - } - - /** Returns a lazy iterator over the successors of a given node. */ - @Override - public LazyLongIterator successors(long nodeId) { - return forwardGraph.successors(nodeId); - } - - /** Returns a lazy iterator over the predecessors of a given node. */ - public LazyLongIterator predecessors(long nodeId) { - return backwardGraph.successors(nodeId); - } - - /** Returns a reference to an array containing the predecessors of a given node. */ - public long[][] predecessorBigArray(long x) { - return backwardGraph.successorBigArray(x); - } - - /** Returns an iterator enumerating the indegrees of the nodes of this graph. */ - public LongIterator indegrees() { - return backwardGraph.outdegrees(); - } - - /** Returns the underlying ImmutableGraph in the forward direction. */ - public ImmutableGraph getForwardGraph() { - return forwardGraph; - } - - /** Returns the underlying ImmutableGraph in the backward direction. */ - public ImmutableGraph getBackwardGraph() { - return backwardGraph; - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java index bfb59cc..824a389 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java @@ -1,190 +1,180 @@ package org.softwareheritage.graph; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; +import it.unimi.dsi.big.webgraph.BidirectionalImmutableGraph; import it.unimi.dsi.logging.ProgressLogger; import java.io.IOException; import java.io.InputStream; /** * Class representing the compressed Software Heritage graph in both directions (forward and * backward). * * This class uses the {@link BidirectionalImmutableGraph} class internally to implement the * backward equivalent of graph operations ({@link SwhBidirectionalGraph#indegree(long)}, * {@link SwhBidirectionalGraph#predecessors(long)}, etc.) by holding a reference to two * {@link SwhUnidirectionalGraph} (a forward graph and a backward graph). * * Both graphs share their graph properties in memory by storing references to the same * {@link SwhGraphProperties} object. * *
  *                 ┌──────────────┐
  *                 │ImmutableGraph◄────────┐
  *                 └────▲─────────┘        │extends
  *                      │                  │
  *                      │       ┌──────────┴────────────────┐
  *               extends│       │BidirectionalImmutableGraph│
  *                      │       └────────────▲──────────────┘
  *                      │                    │extends
  *       ┌──────────────┴───────┐     ┌──────┴──────────────┐
  *       │SwhUnidirectionalGraph│◄────┤SwhBidirectionalGraph│
  *       └──┬──────────────┬────┘     └────────┬───────────┬┘
  *          │              │    contains x2    │           │
  *          │              │                   │           │
  *          │    implements│                   │implements │
  *          │             ┌▼──────────┐        │           │
  *          │             │SwhGraph(I)◄────────┘           │
  * contains │             └───────────┘                    │contains
  *          │                                              │
  *          │            ┌──────────────────┐              │
  *          └────────────►SwhGraphProperties◄──────────────┘
  *                       └──────────────────┘
  * 
* * @author The Software Heritage developers * @see SwhUnidirectionalGraph */ public class SwhBidirectionalGraph extends BidirectionalImmutableGraph implements SwhGraph { /** Property data of the graph (id/type mappings etc.) */ - private final SwhGraphProperties properties; + public final SwhGraphProperties properties; private final SwhUnidirectionalGraph forwardGraph; private final SwhUnidirectionalGraph backwardGraph; public SwhBidirectionalGraph(SwhUnidirectionalGraph forwardGraph, SwhUnidirectionalGraph backwardGraph, SwhGraphProperties properties) { super(forwardGraph, backwardGraph); this.forwardGraph = forwardGraph; this.backwardGraph = backwardGraph; this.properties = properties; } private SwhBidirectionalGraph(BidirectionalImmutableGraph graph, SwhGraphProperties properties) { - super(graph.getForwardGraph(), graph.getBackwardGraph()); - this.forwardGraph = (SwhUnidirectionalGraph) graph.getForwardGraph(); - this.backwardGraph = (SwhUnidirectionalGraph) graph.getBackwardGraph(); + super(graph.forward, graph.backward); + this.forwardGraph = (SwhUnidirectionalGraph) graph.forward; + this.backwardGraph = (SwhUnidirectionalGraph) graph.backward; this.properties = properties; } public static SwhBidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadGraphOnly(method, path, is, pl); SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadGraphOnly(method, path + "-transposed", is, pl); SwhGraphProperties properties = SwhGraphProperties.load(path); forward.setProperties(properties); backward.setProperties(properties); return new SwhBidirectionalGraph(forward, backward, properties); } public static SwhBidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path, is, pl); SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path + "-transposed", is, pl); SwhGraphProperties properties = SwhGraphProperties.load(path); forward.setProperties(properties); backward.setProperties(properties); return new SwhBidirectionalGraph(forward, backward, properties); } // loadXXX methods from ImmutableGraph public static SwhBidirectionalGraph load(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.STANDARD, path, null, pl); } public static SwhBidirectionalGraph load(String path) throws IOException { return load(LoadMethod.STANDARD, path, null, null); } public static SwhBidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.MAPPED, path, null, pl); } public static SwhBidirectionalGraph loadMapped(String path) throws IOException { return load(LoadMethod.MAPPED, path, null, null); } public static SwhBidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.OFFLINE, path, null, pl); } public static SwhBidirectionalGraph loadOffline(String path) throws IOException { return load(LoadMethod.OFFLINE, path, null, null); } // Labelled versions of the loadXXX methods from ImmutableGraph public static SwhBidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.STANDARD, path, null, pl); } public static SwhBidirectionalGraph loadLabelled(String path) throws IOException { return loadLabelled(LoadMethod.STANDARD, path, null, null); } public static SwhBidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.MAPPED, path, null, pl); } public static SwhBidirectionalGraph loadLabelledMapped(String path) throws IOException { return loadLabelled(LoadMethod.MAPPED, path, null, null); } public static SwhBidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.OFFLINE, path, null, pl); } public static SwhBidirectionalGraph loadLabelledOffline(String path) throws IOException { return loadLabelled(LoadMethod.OFFLINE, path, null, null); } @Override public SwhBidirectionalGraph copy() { return new SwhBidirectionalGraph(forwardGraph, backwardGraph, this.properties); } @Override public SwhBidirectionalGraph transpose() { return new SwhBidirectionalGraph(super.transpose(), this.properties); } @Override public SwhBidirectionalGraph symmetrize() { return new SwhBidirectionalGraph(super.symmetrize(), this.properties); } public SwhUnidirectionalGraph getForwardGraph() { return this.forwardGraph; } public SwhUnidirectionalGraph getBackwardGraph() { return this.backwardGraph; } /** * Returns a *labelled* lazy iterator over the successors of a given node. The iteration terminates * when -1 is returned. */ public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) { return forwardGraph.labelledSuccessors(x); } /** * Returns a *labelled* lazy iterator over the predecessors of a given node. The iteration * terminates when -1 is returned. */ public ArcLabelledNodeIterator.LabelledArcIterator labelledPredecessors(long x) { return backwardGraph.labelledSuccessors(x); } public void close() throws IOException { this.properties.close(); } - public String getPath() { - return properties.getPath(); - } - - public long getNodeId(SWHID swhid) { - return properties.getNodeId(swhid); - } - - public SWHID getSWHID(long nodeId) { - return properties.getSWHID(nodeId); - } - - public Node.Type getNodeType(long nodeId) { - return properties.getNodeType(nodeId); + @Override + public SwhGraphProperties getProperties() { + return properties; } } diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java index b16e677..6d6848d 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java @@ -1,47 +1,144 @@ package org.softwareheritage.graph; import java.io.IOException; /** * Common interface for SWH graph classes. + * + * This interface forwards all property loading/access methods to the SwhGraphProperties object + * returned by the getProperties() method of the implementing class. This allows API users to write + * graph.getNodeType() instead of graph.getProperties().getNodeType(). */ public interface SwhGraph { /** * Cleans up graph resources after use. */ void close() throws IOException; /** - * Converts {@link SWHID} node to long. + * Returns the SWH graph properties object of this graph. * - * @param swhid node specified as a {@link SWHID} - * @return internal long node id - * @see SWHID + * @return graph properties */ - long getNodeId(SWHID swhid); + SwhGraphProperties getProperties(); - /** - * Converts long id node to {@link SWHID}. - * - * @param nodeId node specified as a long id - * @return external SWHID - * @see SWHID - */ - SWHID getSWHID(long nodeId); + /** @see SwhGraphProperties#getPath() */ + default String getPath() { + return getProperties().getPath(); + } - /** - * Returns node type. - * - * @param nodeId node specified as a long id - * @return corresponding node type - * @see Node.Type - */ - Node.Type getNodeType(long nodeId); + /** @see SwhGraphProperties#getNodeId(SWHID) */ + default long getNodeId(SWHID swhid) { + return getProperties().getNodeId(swhid); + } - /** - * Returns the graph full path. - * - * @return graph full path - */ - String getPath(); + /** @see SwhGraphProperties#getSWHID(long) */ + default SWHID getSWHID(long nodeId) { + return getProperties().getSWHID(nodeId); + } + + /** @see SwhGraphProperties#getNodeType(long) */ + default Node.Type getNodeType(long nodeId) { + return getProperties().getNodeType(nodeId); + } + + /** @see SwhGraphProperties#loadContentLength() */ + default void loadContentLength() throws IOException { + getProperties().loadContentLength(); + } + + /** @see SwhGraphProperties#getContentLength(long) */ + default long getContentLength(long nodeId) { + return getProperties().getContentLength(nodeId); + } + + /** @see SwhGraphProperties#loadPersonIds() */ + default void loadPersonIds() throws IOException { + getProperties().loadPersonIds(); + } + + /** @see SwhGraphProperties#getAuthorId(long) */ + default long getAuthorId(long nodeId) { + return getProperties().getAuthorId(nodeId); + } + + /** @see SwhGraphProperties#getCommitterId(long) */ + default long getCommitterId(long nodeId) { + return getProperties().getCommitterId(nodeId); + } + + /** @see SwhGraphProperties#loadContentIsSkipped() */ + default void loadContentIsSkipped() throws IOException { + getProperties().loadContentIsSkipped(); + } + + /** @see SwhGraphProperties#isContentSkipped(long) */ + default boolean isContentSkipped(long nodeId) { + return getProperties().isContentSkipped(nodeId); + } + + /** @see SwhGraphProperties#loadAuthorTimestamps() */ + default void loadAuthorTimestamps() throws IOException { + getProperties().loadAuthorTimestamps(); + } + + /** @see SwhGraphProperties#getAuthorTimestamp(long) */ + default long getAuthorTimestamp(long nodeId) { + return getProperties().getAuthorTimestamp(nodeId); + } + + /** @see SwhGraphProperties#getAuthorTimestampOffset(long) */ + default short getAuthorTimestampOffset(long nodeId) { + return getProperties().getAuthorTimestampOffset(nodeId); + } + + /** @see SwhGraphProperties#loadCommitterTimestamps() */ + default void loadCommitterTimestamps() throws IOException { + getProperties().loadCommitterTimestamps(); + } + + /** @see SwhGraphProperties#getCommitterTimestamp(long) */ + default long getCommitterTimestamp(long nodeId) { + return getProperties().getCommitterTimestamp(nodeId); + } + + /** @see SwhGraphProperties#getCommitterTimestampOffset(long) */ + default short getCommitterTimestampOffset(long nodeId) { + return getProperties().getCommitterTimestampOffset(nodeId); + } + + /** @see SwhGraphProperties#loadMessages() */ + default void loadMessages() throws IOException { + getProperties().loadMessages(); + } + + /** @see SwhGraphProperties#getMessage(long) */ + default byte[] getMessage(long nodeId) throws IOException { + return getProperties().getMessage(nodeId); + } + + /** @see SwhGraphProperties#getUrl(long) */ + default String getUrl(long nodeId) throws IOException { + return getProperties().getUrl(nodeId); + } + + /** @see SwhGraphProperties#loadTagNames() */ + default void loadTagNames() throws IOException { + getProperties().loadTagNames(); + } + + /** @see SwhGraphProperties#getTagName(long) */ + default byte[] getTagName(long nodeId) throws IOException { + return getProperties().getTagName(nodeId); + } + + /** @see SwhGraphProperties#loadLabelNames() */ + default void loadLabelNames() throws IOException { + getProperties().loadLabelNames(); + } + + /** @see SwhGraphProperties#getLabelName(long) */ + default byte[] getLabelName(long labelId) { + return getProperties().getLabelName(labelId); + } } diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java index ef5e304..be43b61 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java @@ -1,85 +1,313 @@ package org.softwareheritage.graph; +import it.unimi.dsi.big.util.MappedFrontCodedStringBigList; +import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.fastutil.bytes.ByteBigList; +import it.unimi.dsi.fastutil.bytes.ByteMappedBigList; +import it.unimi.dsi.fastutil.ints.IntBigList; +import it.unimi.dsi.fastutil.ints.IntMappedBigList; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongBigList; +import it.unimi.dsi.fastutil.longs.LongMappedBigList; +import it.unimi.dsi.fastutil.shorts.ShortBigList; +import it.unimi.dsi.fastutil.shorts.ShortMappedBigList; +import it.unimi.dsi.sux4j.util.EliasFanoLongBigList; +import org.apache.commons.configuration2.ex.ConfigurationException; import org.softwareheritage.graph.maps.NodeIdMap; import org.softwareheritage.graph.maps.NodeTypesMap; import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.Base64; /** * This objects contains SWH graph properties such as node labels. * * Some property mappings are necessary because Software Heritage uses string based persistent * identifiers (SWHID) while WebGraph uses integers internally. * * The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph * using SWHID) and the output (convert back to SWHID for users results). * * Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a * long id → node type map is stored as well to avoid a full SWHID lookup. * * @see NodeIdMap * @see NodeTypesMap */ public class SwhGraphProperties { - /** Path and basename of the compressed graph */ - String path; - /** Mapping long id ↔ SWHIDs */ - NodeIdMap nodeIdMap; - /** Mapping long id → node types */ - NodeTypesMap nodeTypesMap; + private final String path; + + private final NodeIdMap nodeIdMap; + private final NodeTypesMap nodeTypesMap; + private LongBigList authorTimestamp; + private ShortBigList authorTimestampOffset; + private LongBigList committerTimestamp; + private ShortBigList committerTimestampOffset; + private LongBigList contentLength; + private LongArrayBitVector contentIsSkipped; + private IntBigList authorId; + private IntBigList committerId; + private ByteBigList messageBuffer; + private LongBigList messageOffsets; + private ByteBigList tagNameBuffer; + private LongBigList tagNameOffsets; + private MappedFrontCodedStringBigList edgeLabelNames; protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) { this.path = path; this.nodeIdMap = nodeIdMap; this.nodeTypesMap = nodeTypesMap; } public static SwhGraphProperties load(String path) throws IOException { return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path)); } /** - * Cleans up graph resources after use. + * Cleans up resources after use. */ public void close() throws IOException { - this.nodeIdMap.close(); + nodeIdMap.close(); + edgeLabelNames.close(); } + /** Return the basename of the compressed graph */ public String getPath() { return path; } /** * Converts {@link SWHID} node to long. * * @param swhid node specified as a {@link SWHID} * @return internal long node id * @see SWHID */ public long getNodeId(SWHID swhid) { return nodeIdMap.getNodeId(swhid); } /** * Converts long id node to {@link SWHID}. * * @param nodeId node specified as a long id * @return external SWHID * @see SWHID */ public SWHID getSWHID(long nodeId) { return nodeIdMap.getSWHID(nodeId); } /** * Returns node type. * * @param nodeId node specified as a long id * @return corresponding node type * @see Node.Type */ public Node.Type getNodeType(long nodeId) { return nodeTypesMap.getType(nodeId); } + + private static LongBigList loadMappedLongs(String path) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { + return LongMappedBigList.map(raf.getChannel()); + } + } + + private static IntBigList loadMappedInts(String path) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { + return IntMappedBigList.map(raf.getChannel()); + } + } + + private static ShortBigList loadMappedShorts(String path) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { + return ShortMappedBigList.map(raf.getChannel()); + } + } + + private static ByteBigList loadMappedBytes(String path) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { + return ByteMappedBigList.map(raf.getChannel()); + } + } + + private static LongBigList loadEFLongs(String path) throws IOException { + try { + return (EliasFanoLongBigList) BinIO.loadObject(path); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + private static byte[] getLine(ByteBigList byteArray, long start) { + long end = start; + while (end < byteArray.size64() && byteArray.getByte(end) != '\n') { + end++; + } + int length = (int) (end - start); + byte[] buffer = new byte[length]; + byteArray.getElements(start, buffer, 0, length); + return buffer; + } + + /** Load the sizes of the content nodes */ + public void loadContentLength() throws IOException { + contentLength = loadMappedLongs(path + ".property.content.length.bin"); + } + + /** Get the size (in bytes) of the given content node */ + public long getContentLength(long nodeId) { + return contentLength.getLong(nodeId); + } + + /** Load the IDs of the persons (authors and committers) */ + public void loadPersonIds() throws IOException { + authorId = loadMappedInts(path + ".property.author_id.bin"); + committerId = loadMappedInts(path + ".property.committer_id.bin"); + } + + /** Get a unique integer ID representing the author of the given revision or release node */ + public long getAuthorId(long nodeId) { + if (authorId == null) { + throw new IllegalStateException("Author IDs not loaded"); + } + return authorId.getInt(nodeId); + } + + /** Get a unique integer ID representing the committer of the given revision node */ + public long getCommitterId(long nodeId) { + if (committerId == null) { + throw new IllegalStateException("Committer IDs not loaded"); + } + return committerId.getInt(nodeId); + } + + /** + * Loads a boolean array indicating whether the given content node was skipped during archive + * ingestion + */ + public void loadContentIsSkipped() throws IOException { + try { + contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin"); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + /** Returns whether the given content node was skipped during archive ingestion */ + public boolean isContentSkipped(long nodeId) { + if (contentIsSkipped == null) { + throw new IllegalStateException("Skipped content array not loaded"); + } + return contentIsSkipped.getBoolean(nodeId); + } + + /** Load the timestamps at which the releases and revisions were authored */ + public void loadAuthorTimestamps() throws IOException { + authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin"); + authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin"); + } + + /** Return the timestamp at which the given revision or release was authored */ + public long getAuthorTimestamp(long nodeId) { + if (authorTimestamp == null) { + throw new IllegalStateException("Author timestamps not loaded"); + } + return authorTimestamp.getLong(nodeId); + } + + /** Return the timestamp offset at which the given revision or release was authored */ + public short getAuthorTimestampOffset(long nodeId) { + if (authorTimestampOffset == null) { + throw new IllegalStateException("Author timestamp offsets not loaded"); + } + return authorTimestampOffset.getShort(nodeId); + } + + /** Load the timestamps at which the releases and revisions were committed */ + public void loadCommitterTimestamps() throws IOException { + committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin"); + committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin"); + } + + /** Return the timestamp at which the given revision was committed */ + public long getCommitterTimestamp(long nodeId) { + if (committerTimestamp == null) { + throw new IllegalStateException("Committer timestamps not loaded"); + } + return committerTimestamp.getLong(nodeId); + } + + /** Return the timestamp offset at which the given revision was committed */ + public short getCommitterTimestampOffset(long nodeId) { + if (committerTimestampOffset == null) { + throw new IllegalStateException("Committer timestamp offsets not loaded"); + } + return committerTimestampOffset.getShort(nodeId); + } + + /** Load the revision messages, the release messages and the origin URLs */ + public void loadMessages() throws IOException { + messageBuffer = loadMappedBytes(path + ".property.message.bin"); + messageOffsets = loadMappedLongs(path + ".property.message.offset.bin"); + } + + /** Get the message of the given revision or release node */ + public byte[] getMessage(long nodeId) throws IOException { + if (messageBuffer == null || messageOffsets == null) { + throw new IllegalStateException("Messages not loaded"); + } + long startOffset = messageOffsets.getLong(nodeId); + if (startOffset == -1) { + return null; + } + return Base64.getDecoder().decode(getLine(messageBuffer, startOffset)); + } + + /** Get the URL of the given origin node */ + public String getUrl(long nodeId) throws IOException { + return new String(getMessage(nodeId)); + } + + /** Load the release names */ + public void loadTagNames() throws IOException { + tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin"); + tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin"); + } + + /** Get the name of the given release node */ + public byte[] getTagName(long nodeId) throws IOException { + if (tagNameBuffer == null || tagNameOffsets == null) { + throw new IllegalStateException("Tag names not loaded"); + } + long startOffset = tagNameOffsets.getLong(nodeId); + if (startOffset == -1) { + return null; + } + return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset)); + } + + /** Load the arc label names (directory entry names and snapshot branch names) */ + public void loadLabelNames() throws IOException { + try { + edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl"); + } catch (ConfigurationException e) { + throw new IOException(e); + } + } + + /** + * Get the arc label name (either a directory entry name or snapshot branch name) associated with + * the given label ID + */ + public byte[] getLabelName(long labelId) { + if (edgeLabelNames == null) { + throw new IllegalStateException("Label names not loaded"); + } + return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId)); + } } diff --git a/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java index 35c35f7..8ed87b6 100644 --- a/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java @@ -1,221 +1,213 @@ package org.softwareheritage.graph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import java.io.IOException; import java.io.InputStream; /** * Class representing the compressed Software Heritage graph in a single direction. *

* The compressed graph is stored using the WebGraph * framework. This class contains an {@link ImmutableGraph} representing the graph itself, as well * as a reference to the object containing the graph properties (e.g. node labels). Optionally, * arc labels (properties stored on the graph edges) can also be loaded with the * loadLabelled...() function family. * * @author The Software Heritage developers * @see SwhGraphProperties * @see SwhUnidirectionalGraph */ public class SwhUnidirectionalGraph extends ImmutableGraph implements SwhGraph { /** Underlying ImmutableGraph */ private final ImmutableGraph graph; /** Labelled ImmutableGraph, null if labels are not loaded */ private ArcLabelledImmutableGraph labelledGraph; /** Property data of the graph (id/type mappings etc.) */ - private SwhGraphProperties properties; + public SwhGraphProperties properties; protected SwhUnidirectionalGraph(ImmutableGraph graph, SwhGraphProperties properties) { this.graph = graph; this.properties = properties; } protected SwhUnidirectionalGraph(ArcLabelledImmutableGraph graph, SwhGraphProperties properties) { this.graph = graph; this.labelledGraph = graph; this.properties = properties; } /** * Load the (unlabelled) graph only, without the SWH properties. */ public static SwhUnidirectionalGraph loadGraphOnly(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { return new SwhUnidirectionalGraph(ImmutableGraph.load(method, path, is, pl), null); } /** * Load the labelled graph only, without the SWH properties. */ public static SwhUnidirectionalGraph loadLabelledGraphOnly(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { BitStreamArcLabelledImmutableGraph g = (BitStreamArcLabelledImmutableGraph) BitStreamArcLabelledImmutableGraph - .load(method, path, is, pl); + .load(method, path + "-labelled", is, pl); return new SwhUnidirectionalGraph(g, null); } /** * Load the SWH properties of the graph from a given path. */ public void loadProperties(String path) throws IOException { properties = SwhGraphProperties.load(path); } /** * Setter for the SWH graph properties. * * @param properties The {@link SwhGraphProperties} object containing the graph properties */ public void setProperties(SwhGraphProperties properties) { this.properties = properties; } /** * Load the unlabelled graph and its SWH properties. */ public static SwhUnidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { SwhUnidirectionalGraph g = loadGraphOnly(method, path, is, pl); g.loadProperties(path); return g; } /** * Load the labelled graph and its SWH properties. */ public static SwhUnidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl) throws IOException { SwhUnidirectionalGraph g = loadLabelledGraphOnly(method, path, is, pl); g.loadProperties(path); return g; } // loadXXX methods of ImmutableGraph public static SwhUnidirectionalGraph load(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.STANDARD, path, null, pl); } public static SwhUnidirectionalGraph load(String path) throws IOException { return load(LoadMethod.STANDARD, path, null, null); } public static SwhUnidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.MAPPED, path, null, pl); } public static SwhUnidirectionalGraph loadMapped(String path) throws IOException { return load(LoadMethod.MAPPED, path, null, null); } public static SwhUnidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException { return load(LoadMethod.OFFLINE, path, null, pl); } public static SwhUnidirectionalGraph loadOffline(String path) throws IOException { return load(LoadMethod.OFFLINE, path, null, null); } // Labelled versions of the loadXXX methods from ImmutableGraph public static SwhUnidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.STANDARD, path, null, pl); } public static SwhUnidirectionalGraph loadLabelled(String path) throws IOException { return loadLabelled(LoadMethod.STANDARD, path, null, null); } public static SwhUnidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.MAPPED, path, null, pl); } public static SwhUnidirectionalGraph loadLabelledMapped(String path) throws IOException { return loadLabelled(LoadMethod.MAPPED, path, null, null); } public static SwhUnidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException { return loadLabelled(LoadMethod.OFFLINE, path, null, pl); } public static SwhUnidirectionalGraph loadLabelledOffline(String path) throws IOException { return loadLabelled(LoadMethod.OFFLINE, path, null, null); } @Override public SwhUnidirectionalGraph copy() { return new SwhUnidirectionalGraph(this.graph.copy(), this.properties); } @Override public boolean randomAccess() { return graph.randomAccess(); } public void close() throws IOException { this.properties.close(); } @Override public long numNodes() { return graph.numNodes(); } @Override public long numArcs() { return graph.numArcs(); } @Override public LazyLongIterator successors(long nodeId) { return graph.successors(nodeId); } /** * Returns a labelled node iterator for scanning the graph sequentially, starting from the * first node. */ public ArcLabelledNodeIterator labelledNodeIterator() { if (labelledGraph == null) { throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded."); } return labelledGraph.nodeIterator(); } /** * Returns a labelled node iterator for scanning the graph sequentially, starting from a * given node. */ public ArcLabelledNodeIterator labelledNodeIterator(long from) { if (labelledGraph == null) { throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded."); } return labelledGraph.nodeIterator(from); } /** * Returns a labelled lazy iterator over the successors of a given node. The iteration * terminates when -1 is returned. */ public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) { if (labelledGraph == null) { throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded."); } return labelledGraph.successors(x); } @Override public long outdegree(long nodeId) { return graph.outdegree(nodeId); } - public String getPath() { - return properties.getPath(); - } - public long getNodeId(SWHID swhid) { - return properties.getNodeId(swhid); - } - public SWHID getSWHID(long nodeId) { - return properties.getSWHID(nodeId); - } - public Node.Type getNodeType(long nodeId) { - return properties.getNodeType(nodeId); + @Override + public SwhGraphProperties getProperties() { + return properties; } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java index 707b44f..8d86f6d 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java @@ -1,292 +1,283 @@ package org.softwareheritage.graph.compress; import com.github.luben.zstd.ZstdOutputStream; import com.martiansoftware.jsap.*; import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.utils.Sort; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.*; /** * Read a graph dataset and extract all the unique node SWHIDs it contains, including the ones that * are not stored as actual objects in the graph, but only referred to by the edges. * Additionally, extract the set of all unique edge labels in the graph. * *

* *

* Rationale: Because the graph can contain holes, loose objects and dangling * objects, some nodes that are referred to as destinations in the edge relationships might not * actually be stored in the graph itself. However, to compress the graph using a graph compression * library, it is necessary to have a list of all the nodes in the graph, including the * ones that are simply referred to by the edges but not actually stored as concrete objects. *

* *

* This class reads the entire graph dataset, and uses sort -u to extract the set of * all the unique nodes and unique labels that will be needed as an input for the compression * process. *

*/ public class ExtractNodes { private static JSAPResult parseArgs(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the edges dataset"), new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output files"), new FlaggedOption("format", JSAP.STRING_PARSER, "orc", JSAP.NOT_REQUIRED, 'f', "format", "Format of the input dataset (orc, csv)"), new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, "30%", JSAP.NOT_REQUIRED, 'S', "sort-buffer-size", "Size of the memory buffer used by sort"), - new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', - "sort-temporary-directory", "Path to the temporary directory used by sort")}); + new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir", + "Path to the temporary directory used by sort")}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { System.err.println("Usage error: " + e.getMessage()); System.exit(1); } return config; } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult parsedArgs = parseArgs(args); String datasetPath = parsedArgs.getString("dataset"); String outputBasename = parsedArgs.getString("outputBasename"); String datasetFormat = parsedArgs.getString("format"); String sortBufferSize = parsedArgs.getString("sortBufferSize"); String sortTmpDir = parsedArgs.getString("sortTmpDir", null); + (new File(sortTmpDir)).mkdirs(); + // Open edge dataset GraphDataset dataset; if (datasetFormat.equals("orc")) { dataset = new ORCGraphDataset(datasetPath); } else if (datasetFormat.equals("csv")) { dataset = new CSVEdgeDataset(datasetPath); } else { throw new IllegalArgumentException("Unknown dataset format: " + datasetFormat); } + extractNodes(dataset, outputBasename, sortBufferSize, sortTmpDir); + } + + public static void extractNodes(GraphDataset dataset, String outputBasename, String sortBufferSize, + String sortTmpDir) throws IOException, InterruptedException { // Spawn node sorting process - Process nodeSort = spawnSort(sortBufferSize, sortTmpDir); + Process nodeSort = Sort.spawnSort(sortBufferSize, sortTmpDir); BufferedOutputStream nodeSortStdin = new BufferedOutputStream(nodeSort.getOutputStream()); BufferedInputStream nodeSortStdout = new BufferedInputStream(nodeSort.getInputStream()); OutputStream nodesFileOutputStream = new ZstdOutputStream( new BufferedOutputStream(new FileOutputStream(outputBasename + ".nodes.csv.zst"))); NodesOutputThread nodesOutputThread = new NodesOutputThread(nodeSortStdout, nodesFileOutputStream); nodesOutputThread.start(); // Spawn label sorting process - Process labelSort = spawnSort(sortBufferSize, sortTmpDir); + Process labelSort = Sort.spawnSort(sortBufferSize, sortTmpDir); BufferedOutputStream labelSortStdin = new BufferedOutputStream(labelSort.getOutputStream()); BufferedInputStream labelSortStdout = new BufferedInputStream(labelSort.getInputStream()); OutputStream labelsFileOutputStream = new ZstdOutputStream( new BufferedOutputStream(new FileOutputStream(outputBasename + ".labels.csv.zst"))); LabelsOutputThread labelsOutputThread = new LabelsOutputThread(labelSortStdout, labelsFileOutputStream); labelsOutputThread.start(); // Read the dataset and write the nodes and labels to the sorting processes long[] edgeCount = {0}; long[][] edgeCountByType = new long[Node.Type.values().length][Node.Type.values().length]; dataset.readEdges((node) -> { nodeSortStdin.write(node); nodeSortStdin.write('\n'); }, (src, dst, label, perm) -> { nodeSortStdin.write(src); nodeSortStdin.write('\n'); nodeSortStdin.write(dst); nodeSortStdin.write('\n'); if (label != null) { labelSortStdin.write(label); labelSortStdin.write('\n'); } edgeCount[0]++; // Extract type of src and dst from their SWHID: swh:1:XXX byte[] srcTypeBytes = Arrays.copyOfRange(src, 6, 6 + 3); byte[] dstTypeBytes = Arrays.copyOfRange(dst, 6, 6 + 3); int srcType = Node.Type.byteNameToInt(srcTypeBytes); int dstType = Node.Type.byteNameToInt(dstTypeBytes); if (srcType != -1 && dstType != -1) { edgeCountByType[srcType][dstType]++; } else { System.err .println("Invalid edge type: " + new String(srcTypeBytes) + " -> " + new String(dstTypeBytes)); System.exit(1); } }); // Wait for sorting processes to finish nodeSortStdin.close(); nodeSort.waitFor(); labelSortStdin.close(); labelSort.waitFor(); nodesOutputThread.join(); labelsOutputThread.join(); // Write node, edge and label counts/statistics printEdgeCounts(outputBasename, edgeCount[0], edgeCountByType); printNodeCounts(outputBasename, nodesOutputThread.getNodeCount(), nodesOutputThread.getNodeTypeCounts()); printLabelCounts(outputBasename, labelsOutputThread.getLabelCount()); } - private static Process spawnSort(String sortBufferSize, String sortTmpDir) throws IOException { - ProcessBuilder sortProcessBuilder = new ProcessBuilder(); - sortProcessBuilder.redirectError(ProcessBuilder.Redirect.INHERIT); - ArrayList command = new ArrayList<>(List.of("sort", "-u", "--buffer-size", sortBufferSize)); - if (sortTmpDir != null) { - command.add("--temporary-directory"); - command.add(sortTmpDir); - } - sortProcessBuilder.command(command); - Map env = sortProcessBuilder.environment(); - env.put("LC_ALL", "C"); - env.put("LC_COLLATE", "C"); - env.put("LANG", "C"); - - return sortProcessBuilder.start(); - } - private static void printEdgeCounts(String basename, long edgeCount, long[][] edgeTypeCounts) throws IOException { PrintWriter nodeCountWriter = new PrintWriter(basename + ".edges.count.txt"); nodeCountWriter.println(edgeCount); nodeCountWriter.close(); PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".edges.stats.txt"); TreeMap edgeTypeCountsMap = new TreeMap<>(); for (Node.Type src : Node.Type.values()) { for (Node.Type dst : Node.Type.values()) { long cnt = edgeTypeCounts[Node.Type.toInt(src)][Node.Type.toInt(dst)]; if (cnt > 0) edgeTypeCountsMap.put(src.toString().toLowerCase() + ":" + dst.toString().toLowerCase(), cnt); } } for (Map.Entry entry : edgeTypeCountsMap.entrySet()) { nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue()); } nodeTypesCountWriter.close(); } private static void printNodeCounts(String basename, long nodeCount, long[] nodeTypeCounts) throws IOException { PrintWriter nodeCountWriter = new PrintWriter(basename + ".nodes.count.txt"); nodeCountWriter.println(nodeCount); nodeCountWriter.close(); PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".nodes.stats.txt"); TreeMap nodeTypeCountsMap = new TreeMap<>(); for (Node.Type v : Node.Type.values()) { nodeTypeCountsMap.put(v.toString().toLowerCase(), nodeTypeCounts[Node.Type.toInt(v)]); } for (Map.Entry entry : nodeTypeCountsMap.entrySet()) { nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue()); } nodeTypesCountWriter.close(); } private static void printLabelCounts(String basename, long labelCount) throws IOException { PrintWriter nodeCountWriter = new PrintWriter(basename + ".labels.count.txt"); nodeCountWriter.println(labelCount); nodeCountWriter.close(); } private static class NodesOutputThread extends Thread { private final InputStream sortedNodesStream; private final OutputStream nodesOutputStream; private long nodeCount = 0; private final long[] nodeTypeCounts = new long[Node.Type.values().length]; NodesOutputThread(InputStream sortedNodesStream, OutputStream nodesOutputStream) { this.sortedNodesStream = sortedNodesStream; this.nodesOutputStream = nodesOutputStream; } @Override public void run() { BufferedReader reader = new BufferedReader( new InputStreamReader(sortedNodesStream, StandardCharsets.UTF_8)); try { String line; while ((line = reader.readLine()) != null) { nodesOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); nodesOutputStream.write('\n'); nodeCount++; try { Node.Type nodeType = Node.Type.fromStr(line.split(":")[2]); nodeTypeCounts[Node.Type.toInt(nodeType)]++; } catch (ArrayIndexOutOfBoundsException e) { System.err.println("Error parsing SWHID: " + line); System.exit(1); } } nodesOutputStream.close(); } catch (IOException e) { throw new RuntimeException(e); } } public long getNodeCount() { return nodeCount; } public long[] getNodeTypeCounts() { return nodeTypeCounts; } } private static class LabelsOutputThread extends Thread { private final InputStream sortedLabelsStream; private final OutputStream labelsOutputStream; private long labelCount = 0; LabelsOutputThread(InputStream sortedLabelsStream, OutputStream labelsOutputStream) { this.labelsOutputStream = labelsOutputStream; this.sortedLabelsStream = sortedLabelsStream; } @Override public void run() { BufferedReader reader = new BufferedReader( new InputStreamReader(sortedLabelsStream, StandardCharsets.UTF_8)); try { String line; while ((line = reader.readLine()) != null) { labelsOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); labelsOutputStream.write('\n'); labelCount++; } labelsOutputStream.close(); } catch (IOException e) { throw new RuntimeException(e); } } public long getLabelCount() { return labelCount; } } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java new file mode 100644 index 0000000..6bf20e4 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java @@ -0,0 +1,129 @@ +package org.softwareheritage.graph.compress; + +import com.github.luben.zstd.ZstdOutputStream; +import com.martiansoftware.jsap.*; +import org.softwareheritage.graph.utils.Sort; + +import java.io.*; +import java.nio.charset.StandardCharsets; + +/** + * Read a graph dataset and extract all the unique authors it contains. + * + *

+ * This class reads the revision and release tables of the graph dataset, and uses + * sort -u to extract the set of all the unique persons (name + email, potentially + * pseudonymized) and store them in a file. + *

+ */ +public class ExtractPersons { + private static JSAPResult parseArgs(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ + new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC dataset"), + new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, + "Basename of the output files"), + + new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, "30%", JSAP.NOT_REQUIRED, 'S', + "sort-buffer-size", "Size of the memory buffer used by sort"), + new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir", + "Path to the temporary directory used by sort")}); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + System.err.println("Usage error: " + e.getMessage()); + System.exit(1); + } + return config; + } + + private static void processAuthorColumn(ORCGraphDataset.SwhOrcTable table, String columnName, OutputStream stream) + throws IOException { + table.readBytes64Column(columnName, (swhid, personBase64) -> { + stream.write(personBase64); + stream.write('\n'); + }); + } + + public static void main(String[] args) throws IOException, InterruptedException { + JSAPResult parsedArgs = parseArgs(args); + String datasetPath = parsedArgs.getString("dataset"); + String outputBasename = parsedArgs.getString("outputBasename"); + + String sortBufferSize = parsedArgs.getString("sortBufferSize"); + String sortTmpDir = parsedArgs.getString("sortTmpDir", null); + + ORCGraphDataset dataset = new ORCGraphDataset(datasetPath); + + extractPersons(dataset, outputBasename, sortBufferSize, sortTmpDir); + } + + public static void extractPersons(ORCGraphDataset dataset, String outputBasename, String sortBufferSize, + String sortTmpDir) throws IOException, InterruptedException { + (new File(sortTmpDir)).mkdirs(); + + // Spawn person sorting process + Process personSort = Sort.spawnSort(sortBufferSize, sortTmpDir); + BufferedOutputStream personSortStdin = new BufferedOutputStream(personSort.getOutputStream()); + BufferedInputStream personSortStdout = new BufferedInputStream(personSort.getInputStream()); + OutputStream personsFileOutputStream = new ZstdOutputStream( + new BufferedOutputStream(new FileOutputStream(outputBasename + ".persons.csv.zst"))); + PersonsOutputThread personsOutputThread = new PersonsOutputThread(personSortStdout, personsFileOutputStream); + personsOutputThread.start(); + + processAuthorColumn(dataset.getTable("release"), "author", personSortStdin); + processAuthorColumn(dataset.getTable("revision"), "author", personSortStdin); + processAuthorColumn(dataset.getTable("revision"), "committer", personSortStdin); + + // Wait for sorting processes to finish + personSortStdin.close(); + personSort.waitFor(); + personsOutputThread.join(); + + // Write person count statistics + printPersonsCounts(outputBasename, personsOutputThread.getPersonCount()); + } + + private static void printPersonsCounts(String basename, long labelCount) throws IOException { + PrintWriter nodeCountWriter = new PrintWriter(basename + ".persons.count.txt"); + nodeCountWriter.println(labelCount); + nodeCountWriter.close(); + } + + private static class PersonsOutputThread extends Thread { + private final InputStream sortedPersonsStream; + private final OutputStream personsOutputStream; + + private long personCount = 0; + + PersonsOutputThread(InputStream sortedNodesStream, OutputStream nodesOutputStream) { + this.sortedPersonsStream = sortedNodesStream; + this.personsOutputStream = nodesOutputStream; + } + + @Override + public void run() { + BufferedReader reader = new BufferedReader( + new InputStreamReader(sortedPersonsStream, StandardCharsets.UTF_8)); + try { + String line; + while ((line = reader.readLine()) != null) { + personsOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); + personsOutputStream.write('\n'); + personCount++; + } + personsOutputStream.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public long getPersonCount() { + return personCount; + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java index ada0319..ebd9adb 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java @@ -1,44 +1,60 @@ package org.softwareheritage.graph.compress; import java.io.IOException; /** * GraphDataset is a common interface to represent on-disk graph datasets in various formats, * usually extracted from the SWH archive with the swh-dataset tool. */ public interface GraphDataset { interface NodeCallback { void onNode(byte[] node) throws IOException; } interface EdgeCallback { - void onEdge(byte[] src, byte[] dst, byte[] label, long permission) throws IOException; + void onEdge(byte[] src, byte[] dst, byte[] label, int permission) throws IOException; } /** * Read the graph dataset and call the callback methods for each node and edge encountered. * *
    *
  • The node callback is called for each object stored in the graph.
  • *
  • The edge callback is called for each relationship (between two nodes) stored in the * graph.
  • *
* *

* Note that because the graph can contain holes, loose objects and dangling objects, the edge * callback may be called with parameters representing nodes that are not stored in the graph. This * is because some nodes that are referred to as destinations in the dataset might not be present in * the archive (e.g., a revision entry in a directory pointing to a revision that we have not * crawled yet). *

* *

* In order to generate a complete set of all the nodes that are referred to in the graph * dataset, see the {@link ExtractNodes} class. *

* * @param nodeCb callback for each node * @param edgeCb callback for each edge */ void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException; + + interface TimestampCallback { + void onTimestamp(byte[] swhid, long timestamp, short offset) throws IOException; + } + + interface LongCallback { + void onLong(byte[] swhid, long value) throws IOException; + } + + interface BytesCallback { + void onBytes(byte[] swhid, byte[] value) throws IOException; + } + + interface HashedEdgeCallback { + void onHashedEdge(long src, long dst, long label, int permission) throws IOException; + } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java index 8b824e1..a6b3186 100644 --- a/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java @@ -1,528 +1,446 @@ package org.softwareheritage.graph.compress; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.bytes.ByteArrays; import it.unimi.dsi.fastutil.io.FastBufferedInputStream; +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.OutputBitStream; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.big.webgraph.BVGraph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.NodeIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.softwareheritage.graph.labels.DirEntry; import org.softwareheritage.graph.labels.SwhLabel; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.*; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; +import java.nio.file.Paths; import java.util.*; import java.util.concurrent.TimeUnit; public class LabelMapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class); + String orcDatasetPath; String graphPath; String outputGraphPath; String debugPath; String tmpDir; ImmutableGraph graph; long numNodes; long numArcs; NodeIdMap nodeIdMap; Object2LongFunction filenameMph; long numFilenames; int totalLabelWidth; - public LabelMapBuilder(String graphPath, String debugPath, String outputGraphPath, String tmpDir) - throws IOException { + public LabelMapBuilder(String orcDatasetPath, String graphPath, String debugPath, String outputGraphPath, + String tmpDir) throws IOException { + this.orcDatasetPath = orcDatasetPath; this.graphPath = graphPath; if (outputGraphPath == null) { this.outputGraphPath = graphPath; } else { this.outputGraphPath = outputGraphPath; } this.debugPath = debugPath; this.tmpDir = tmpDir; - // Load the graph in offline mode to retrieve the number of nodes/edges, - // then immediately destroy it. XXX: not even needed? - // ImmutableGraph graphOffline = BVGraph.loadMapped(graphPath); - graph = BVGraph.loadMapped(graphPath); numArcs = graph.numArcs(); numNodes = graph.numNodes(); nodeIdMap = new NodeIdMap(graphPath); - filenameMph = NodeIdMap.loadMph(graphPath + "-labels.mph"); + filenameMph = NodeIdMap.loadMph(graphPath + ".labels.mph"); numFilenames = getMPHSize(filenameMph); totalLabelWidth = DirEntry.labelWidth(numFilenames); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{ - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", - "Basename of the compressed graph"), + new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"), + new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output graph"), new FlaggedOption("debugPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'd', "debug-path", "Store the intermediate representation here for debug"), new FlaggedOption("outputGraphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "output-graph", "Basename of the output graph, same as --graph if not specified"), - new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 't', "tmp", + new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 'T', "temp-dir", "Temporary directory path"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); + String orcDataset = config.getString("dataset"); String graphPath = config.getString("graphPath"); String outputGraphPath = config.getString("outputGraphPath"); String tmpDir = config.getString("tmpDir"); String debugPath = config.getString("debugPath"); - LabelMapBuilder builder = new LabelMapBuilder(graphPath, debugPath, outputGraphPath, tmpDir); + LabelMapBuilder builder = new LabelMapBuilder(orcDataset, graphPath, debugPath, outputGraphPath, tmpDir); logger.info("Loading graph and MPH functions..."); builder.computeLabelMap(); } static long getMPHSize(Object2LongFunction mph) { return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size(); } void computeLabelMap() throws IOException, InterruptedException { this.loadGraph(); - // this.computeLabelMapSort(); - this.computeLabelMapBsort(); + if (true) { + this.computeLabelMapSort(); + } else { + this.computeLabelMapBsort(); + } } void computeLabelMapSort() throws IOException { // Pass the intermediate representation to sort(1) so that we see the labels in the order they will // appear in the label file. ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "-k1,1n", "-k2,2n", // Numerical sort "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); - // BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); FastBufferedInputStream sort_stdout = new FastBufferedInputStream(sort.getInputStream()); - final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); - hashLabelStream(fbis, new EdgeLabelLineWriter() { - @Override - public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { - sort_stdin.write((src + "\t" + dst + "\t" + filenameId + "\t" + permission + "\n") - .getBytes(StandardCharsets.US_ASCII)); - } - }); + readHashedEdgeLabels((src, dst, label, perms) -> sort_stdin + .write((src + "\t" + dst + "\t" + label + "\t" + perms + "\n").getBytes(StandardCharsets.US_ASCII))); sort_stdin.close(); EdgeLabelLineIterator mapLines = new TextualEdgeLabelLineIterator(sort_stdout); writeLabels(mapLines); logger.info("Done"); } void computeLabelMapBsort() throws IOException, InterruptedException { // Pass the intermediate representation to bsort(1) so that we see the labels in the order they will // appear in the label file. String tmpFile = tmpDir + "/labelsToSort.bin"; - final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); final DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmpFile))); // Number of bytes to represent a node. final int nodeBytes = (Long.SIZE - Long.numberOfLeadingZeros(graph.numNodes())) / 8 + 1; ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); logger.info("Writing labels to a packed binary files (node bytes: {})", nodeBytes); - hashLabelStream(fbis, new EdgeLabelLineWriter() { - @Override - public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { - buffer.putLong(0, src); - out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); - buffer.putLong(0, dst); - out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); - out.writeLong(filenameId); - out.writeInt(permission); - } + readHashedEdgeLabels((src, dst, label, perms) -> { + buffer.putLong(0, src); + out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); + buffer.putLong(0, dst); + out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); + out.writeLong(label); + out.writeInt(perms); }); ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("/home/seirl/bsort/src/bsort", "-v", "-r", String.valueOf(nodeBytes * 2 + Long.BYTES + Integer.BYTES), "-k", String.valueOf(nodeBytes * 2), tmpFile); Process sort = processBuilder.start(); sort.waitFor(); final DataInputStream sortedLabels = new DataInputStream(new BufferedInputStream(new FileInputStream(tmpFile))); BinaryEdgeLabelLineIterator mapLines = new BinaryEdgeLabelLineIterator(sortedLabels, nodeBytes); writeLabels(mapLines); logger.info("Done"); } void loadGraph() throws IOException { } - void hashLabelStream(FastBufferedInputStream input, EdgeLabelLineWriter writer) throws IOException { - // Compute intermediate representation and write it on : - // "