diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
index 6d6848d..6d4db39 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
@@ -1,144 +1,144 @@
package org.softwareheritage.graph;
import java.io.IOException;
/**
* Common interface for SWH graph classes.
*
* This interface forwards all property loading/access methods to the SwhGraphProperties object
* returned by the getProperties() method of the implementing class. This allows API users to write
* graph.getNodeType() instead of graph.getProperties().getNodeType().
*/
public interface SwhGraph {
/**
* Cleans up graph resources after use.
*/
void close() throws IOException;
/**
* Returns the SWH graph properties object of this graph.
*
* @return graph properties
*/
SwhGraphProperties getProperties();
/** @see SwhGraphProperties#getPath() */
default String getPath() {
return getProperties().getPath();
}
/** @see SwhGraphProperties#getNodeId(SWHID) */
default long getNodeId(SWHID swhid) {
return getProperties().getNodeId(swhid);
}
/** @see SwhGraphProperties#getSWHID(long) */
default SWHID getSWHID(long nodeId) {
return getProperties().getSWHID(nodeId);
}
/** @see SwhGraphProperties#getNodeType(long) */
default Node.Type getNodeType(long nodeId) {
return getProperties().getNodeType(nodeId);
}
/** @see SwhGraphProperties#loadContentLength() */
default void loadContentLength() throws IOException {
getProperties().loadContentLength();
}
/** @see SwhGraphProperties#getContentLength(long) */
- default long getContentLength(long nodeId) {
+ default Long getContentLength(long nodeId) {
return getProperties().getContentLength(nodeId);
}
/** @see SwhGraphProperties#loadPersonIds() */
default void loadPersonIds() throws IOException {
getProperties().loadPersonIds();
}
/** @see SwhGraphProperties#getAuthorId(long) */
- default long getAuthorId(long nodeId) {
+ default Long getAuthorId(long nodeId) {
return getProperties().getAuthorId(nodeId);
}
/** @see SwhGraphProperties#getCommitterId(long) */
- default long getCommitterId(long nodeId) {
+ default Long getCommitterId(long nodeId) {
return getProperties().getCommitterId(nodeId);
}
/** @see SwhGraphProperties#loadContentIsSkipped() */
default void loadContentIsSkipped() throws IOException {
getProperties().loadContentIsSkipped();
}
/** @see SwhGraphProperties#isContentSkipped(long) */
default boolean isContentSkipped(long nodeId) {
return getProperties().isContentSkipped(nodeId);
}
/** @see SwhGraphProperties#loadAuthorTimestamps() */
default void loadAuthorTimestamps() throws IOException {
getProperties().loadAuthorTimestamps();
}
/** @see SwhGraphProperties#getAuthorTimestamp(long) */
- default long getAuthorTimestamp(long nodeId) {
+ default Long getAuthorTimestamp(long nodeId) {
return getProperties().getAuthorTimestamp(nodeId);
}
/** @see SwhGraphProperties#getAuthorTimestampOffset(long) */
- default short getAuthorTimestampOffset(long nodeId) {
+ default Short getAuthorTimestampOffset(long nodeId) {
return getProperties().getAuthorTimestampOffset(nodeId);
}
/** @see SwhGraphProperties#loadCommitterTimestamps() */
default void loadCommitterTimestamps() throws IOException {
getProperties().loadCommitterTimestamps();
}
/** @see SwhGraphProperties#getCommitterTimestamp(long) */
- default long getCommitterTimestamp(long nodeId) {
+ default Long getCommitterTimestamp(long nodeId) {
return getProperties().getCommitterTimestamp(nodeId);
}
/** @see SwhGraphProperties#getCommitterTimestampOffset(long) */
- default short getCommitterTimestampOffset(long nodeId) {
+ default Short getCommitterTimestampOffset(long nodeId) {
return getProperties().getCommitterTimestampOffset(nodeId);
}
/** @see SwhGraphProperties#loadMessages() */
default void loadMessages() throws IOException {
getProperties().loadMessages();
}
/** @see SwhGraphProperties#getMessage(long) */
default byte[] getMessage(long nodeId) throws IOException {
return getProperties().getMessage(nodeId);
}
/** @see SwhGraphProperties#getUrl(long) */
default String getUrl(long nodeId) throws IOException {
return getProperties().getUrl(nodeId);
}
/** @see SwhGraphProperties#loadTagNames() */
default void loadTagNames() throws IOException {
getProperties().loadTagNames();
}
/** @see SwhGraphProperties#getTagName(long) */
default byte[] getTagName(long nodeId) throws IOException {
return getProperties().getTagName(nodeId);
}
/** @see SwhGraphProperties#loadLabelNames() */
default void loadLabelNames() throws IOException {
getProperties().loadLabelNames();
}
/** @see SwhGraphProperties#getLabelName(long) */
default byte[] getLabelName(long labelId) {
return getProperties().getLabelName(labelId);
}
}
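
The visible effect of this change at the interface level is that optional properties now come back as boxed values that are null when absent, instead of primitive sentinels. A minimal caller-side sketch (the helper class and its name are illustrative, not part of this patch):

    import org.softwareheritage.graph.SWHID;
    import org.softwareheritage.graph.SwhGraph;

    import java.io.IOException;

    // Hypothetical helper, not part of this patch: prints a content node's
    // size, treating a null return from getContentLength() as "unknown".
    public class ContentSizeExample {
        public static void printContentSize(SwhGraph graph, SWHID swhid) throws IOException {
            graph.loadContentLength();                    // map the length property file
            long nodeId = graph.getNodeId(swhid);         // SWHID -> internal long id
            Long length = graph.getContentLength(nodeId); // boxed: null when absent
            if (length == null) {
                System.out.println(swhid + ": size unknown");
            } else {
                System.out.println(swhid + ": " + length + " bytes");
            }
        }
    }
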
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
index be43b61..b569bb5 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
@@ -1,313 +1,324 @@
package org.softwareheritage.graph;
import it.unimi.dsi.big.util.MappedFrontCodedStringBigList;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.bytes.ByteBigList;
import it.unimi.dsi.fastutil.bytes.ByteMappedBigList;
import it.unimi.dsi.fastutil.ints.IntBigList;
import it.unimi.dsi.fastutil.ints.IntMappedBigList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.longs.LongMappedBigList;
import it.unimi.dsi.fastutil.shorts.ShortBigList;
import it.unimi.dsi.fastutil.shorts.ShortMappedBigList;
import it.unimi.dsi.sux4j.util.EliasFanoLongBigList;
import org.apache.commons.configuration2.ex.ConfigurationException;
import org.softwareheritage.graph.maps.NodeIdMap;
import org.softwareheritage.graph.maps.NodeTypesMap;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Base64;
/**
- * This objects contains SWH graph properties such as node labels.
+ * This object contains SWH graph properties such as node labels.
*
* Some property mappings are necessary because Software Heritage uses string based persistent
* identifiers (SWHID) while WebGraph uses integers internally.
*
* The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph
- * using SWHID) and the output (convert back to SWHID for users results).
+ * using SWHID) and the output (converting back to SWHID for user-facing results).
*
* Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a
* long id → node type map is stored as well to avoid a full SWHID lookup.
*
* @see NodeIdMap
* @see NodeTypesMap
*/
public class SwhGraphProperties {
private final String path;
private final NodeIdMap nodeIdMap;
private final NodeTypesMap nodeTypesMap;
private LongBigList authorTimestamp;
private ShortBigList authorTimestampOffset;
private LongBigList committerTimestamp;
private ShortBigList committerTimestampOffset;
private LongBigList contentLength;
private LongArrayBitVector contentIsSkipped;
private IntBigList authorId;
private IntBigList committerId;
private ByteBigList messageBuffer;
private LongBigList messageOffsets;
private ByteBigList tagNameBuffer;
private LongBigList tagNameOffsets;
private MappedFrontCodedStringBigList edgeLabelNames;
protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
this.path = path;
this.nodeIdMap = nodeIdMap;
this.nodeTypesMap = nodeTypesMap;
}
public static SwhGraphProperties load(String path) throws IOException {
return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path));
}
/**
* Cleans up resources after use.
*/
public void close() throws IOException {
nodeIdMap.close();
edgeLabelNames.close();
}
/** Return the basename of the compressed graph */
public String getPath() {
return path;
}
/**
* Converts {@link SWHID} node to long.
*
* @param swhid node specified as a {@link SWHID}
* @return internal long node id
* @see SWHID
*/
public long getNodeId(SWHID swhid) {
return nodeIdMap.getNodeId(swhid);
}
/**
* Converts long id node to {@link SWHID}.
*
* @param nodeId node specified as a long id
* @return external SWHID
* @see SWHID
*/
public SWHID getSWHID(long nodeId) {
return nodeIdMap.getSWHID(nodeId);
}
/**
* Returns node type.
*
* @param nodeId node specified as a long id
* @return corresponding node type
* @see Node.Type
*/
public Node.Type getNodeType(long nodeId) {
return nodeTypesMap.getType(nodeId);
}
private static LongBigList loadMappedLongs(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return LongMappedBigList.map(raf.getChannel());
}
}
private static IntBigList loadMappedInts(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return IntMappedBigList.map(raf.getChannel());
}
}
private static ShortBigList loadMappedShorts(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return ShortMappedBigList.map(raf.getChannel());
}
}
private static ByteBigList loadMappedBytes(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return ByteMappedBigList.map(raf.getChannel());
}
}
private static LongBigList loadEFLongs(String path) throws IOException {
try {
return (EliasFanoLongBigList) BinIO.loadObject(path);
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
private static byte[] getLine(ByteBigList byteArray, long start) {
long end = start;
while (end < byteArray.size64() && byteArray.getByte(end) != '\n') {
end++;
}
int length = (int) (end - start);
byte[] buffer = new byte[length];
byteArray.getElements(start, buffer, 0, length);
return buffer;
}
/** Load the sizes of the content nodes */
public void loadContentLength() throws IOException {
contentLength = loadMappedLongs(path + ".property.content.length.bin");
}
- /** Get the size (in bytes) of the given content node */
+ /** Get the size (in bytes) of the given content node, or null if unknown */
- public long getContentLength(long nodeId) {
- return contentLength.getLong(nodeId);
+ public Long getContentLength(long nodeId) {
+ if (contentLength == null) {
+ throw new IllegalStateException("Content lengths not loaded");
+ }
+ long res = contentLength.getLong(nodeId);
+ return (res >= 0) ? res : null;
}
/** Load the IDs of the persons (authors and committers) */
public void loadPersonIds() throws IOException {
authorId = loadMappedInts(path + ".property.author_id.bin");
committerId = loadMappedInts(path + ".property.committer_id.bin");
}
- /** Get a unique integer ID representing the author of the given revision or release node */
+ /** Get a unique integer ID representing the author of the given revision or release node, or null if none */
- public long getAuthorId(long nodeId) {
+ public Long getAuthorId(long nodeId) {
if (authorId == null) {
throw new IllegalStateException("Author IDs not loaded");
}
- return authorId.getInt(nodeId);
+ long res = authorId.getInt(nodeId);
+ return (res >= 0) ? res : null;
}
- /** Get a unique integer ID representing the committer of the given revision node */
+ /** Get a unique integer ID representing the committer of the given revision node, or null if none */
- public long getCommitterId(long nodeId) {
+ public Long getCommitterId(long nodeId) {
if (committerId == null) {
throw new IllegalStateException("Committer IDs not loaded");
}
- return committerId.getInt(nodeId);
+ long res = committerId.getInt(nodeId);
+ return (res >= 0) ? res : null;
}
/**
* Loads a boolean array indicating whether the given content node was skipped during archive
* ingestion
*/
public void loadContentIsSkipped() throws IOException {
try {
contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin");
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
/** Returns whether the given content node was skipped during archive ingestion */
public boolean isContentSkipped(long nodeId) {
if (contentIsSkipped == null) {
throw new IllegalStateException("Skipped content array not loaded");
}
return contentIsSkipped.getBoolean(nodeId);
}
/** Load the timestamps at which the releases and revisions were authored */
public void loadAuthorTimestamps() throws IOException {
authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin");
authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin");
}
- /** Return the timestamp at which the given revision or release was authored */
+ /** Return the timestamp at which the given revision or release was authored, or null if not set */
- public long getAuthorTimestamp(long nodeId) {
+ public Long getAuthorTimestamp(long nodeId) {
if (authorTimestamp == null) {
throw new IllegalStateException("Author timestamps not loaded");
}
- return authorTimestamp.getLong(nodeId);
+ long res = authorTimestamp.getLong(nodeId);
+ return (res > Long.MIN_VALUE) ? res : null;
}
- /** Return the timestamp offset at which the given revision or release was authored */
+ /** Return the timestamp offset at which the given revision or release was authored, or null if not set */
- public short getAuthorTimestampOffset(long nodeId) {
+ public Short getAuthorTimestampOffset(long nodeId) {
if (authorTimestampOffset == null) {
throw new IllegalStateException("Author timestamp offsets not loaded");
}
- return authorTimestampOffset.getShort(nodeId);
+ short res = authorTimestampOffset.getShort(nodeId);
+ return (res > Short.MIN_VALUE) ? res : null;
}
/** Load the timestamps at which the releases and revisions were committed */
public void loadCommitterTimestamps() throws IOException {
committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin");
committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin");
}
- /** Return the timestamp at which the given revision was committed */
+ /** Return the timestamp at which the given revision was committed, or null if not set */
- public long getCommitterTimestamp(long nodeId) {
+ public Long getCommitterTimestamp(long nodeId) {
if (committerTimestamp == null) {
throw new IllegalStateException("Committer timestamps not loaded");
}
- return committerTimestamp.getLong(nodeId);
+ long res = committerTimestamp.getLong(nodeId);
+ return (res > Long.MIN_VALUE) ? res : null;
}
- /** Return the timestamp offset at which the given revision was committed */
+ /** Return the timestamp offset at which the given revision was committed, or null if not set */
- public short getCommitterTimestampOffset(long nodeId) {
+ public Short getCommitterTimestampOffset(long nodeId) {
if (committerTimestampOffset == null) {
throw new IllegalStateException("Committer timestamp offsets not loaded");
}
- return committerTimestampOffset.getShort(nodeId);
+ short res = committerTimestampOffset.getShort(nodeId);
+ return (res > Short.MIN_VALUE) ? res : null;
}
/** Load the revision messages, the release messages and the origin URLs */
public void loadMessages() throws IOException {
messageBuffer = loadMappedBytes(path + ".property.message.bin");
messageOffsets = loadMappedLongs(path + ".property.message.offset.bin");
}
/** Get the message of the given revision or release node */
public byte[] getMessage(long nodeId) throws IOException {
if (messageBuffer == null || messageOffsets == null) {
throw new IllegalStateException("Messages not loaded");
}
long startOffset = messageOffsets.getLong(nodeId);
if (startOffset == -1) {
return null;
}
return Base64.getDecoder().decode(getLine(messageBuffer, startOffset));
}
- /** Get the URL of the given origin node */
+ /** Get the URL of the given origin node, or null if absent */
public String getUrl(long nodeId) throws IOException {
- return new String(getMessage(nodeId));
+ byte[] url = getMessage(nodeId);
+ return (url != null) ? new String(url) : null;
}
/** Load the release names */
public void loadTagNames() throws IOException {
tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin");
tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin");
}
/** Get the name of the given release node */
public byte[] getTagName(long nodeId) throws IOException {
if (tagNameBuffer == null || tagNameOffsets == null) {
throw new IllegalStateException("Tag names not loaded");
}
long startOffset = tagNameOffsets.getLong(nodeId);
if (startOffset == -1) {
return null;
}
return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset));
}
/** Load the arc label names (directory entry names and snapshot branch names) */
public void loadLabelNames() throws IOException {
try {
edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl");
} catch (ConfigurationException e) {
throw new IOException(e);
}
}
/**
* Get the arc label name (either a directory entry name or snapshot branch name) associated with
* the given label ID
*/
public byte[] getLabelName(long labelId) {
if (edgeLabelNames == null) {
throw new IllegalStateException("Label names not loaded");
}
return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId));
}
}
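
The pattern shared by all the getters above: the memory-mapped big arrays cannot represent null, so absence is encoded as a sentinel on disk (-1 for lengths and person IDs, Long.MIN_VALUE / Short.MIN_VALUE for timestamps and their offsets) and decoded to null at the getter boundary. Lengths and IDs can use -1 because they are never negative, while timestamps may legitimately be negative (pre-1970), which is why their sentinel sits at the extreme of the value range. A standalone sketch of the two decoding rules (class and method names are illustrative, not part of this patch):

    // Illustrative only: the two sentinel-decoding rules used by the
    // getters in SwhGraphProperties.
    final class SentinelDecoding {
        /** Lengths and person IDs are never negative, so -1 means "absent". */
        static Long decodeNonNegative(long raw) {
            return (raw >= 0) ? raw : null;
        }

        /** Timestamps may legitimately be negative (pre-1970 dates), so the
         *  sentinel is pushed all the way down to Long.MIN_VALUE. */
        static Long decodeTimestamp(long raw) {
            return (raw > Long.MIN_VALUE) ? raw : null;
        }
    }
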
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
index dd20648..fed8608 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
@@ -1,268 +1,268 @@
package org.softwareheritage.graph.compress;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.ints.IntBigArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.shorts.ShortBigArrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.maps.NodeIdMap;
import org.softwareheritage.graph.compress.ORCGraphDataset.*;
import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
/**
* This class is used to extract the node properties from the graph dataset, and write them to a set
* of property files.
*
* Note: because the nodes are not sorted by type, we have an incentive to minimize the number of
* "holes" in offset arrays. This is why many unrelated properties are cobbled together in the same
* files (e.g. commit messages, tag messages and origin URLs are all in a "message" property file).
* Once we migrate to a TypedImmutableGraph as the underlying storage of the graph, we can split all
* the different properties in their own files.
*/
public class WriteNodeProperties {
final static Logger logger = LoggerFactory.getLogger(WriteNodeProperties.class);
private final ORCGraphDataset dataset;
private final String graphBasename;
private final NodeIdMap nodeIdMap;
private final long numNodes;
public WriteNodeProperties(String dataset, String graphBasename, NodeIdMap nodeIdMap) {
this.dataset = new ORCGraphDataset(dataset);
this.graphBasename = graphBasename;
this.nodeIdMap = nodeIdMap;
this.numNodes = nodeIdMap.size64();
}
public static String[] PROPERTY_WRITERS = new String[]{"timestamps", "content_length", "content_is_skipped",
"person_ids", "messages", "tag_names",};
private static JSAPResult parseArgs(String[] args) {
JSAPResult config = null;
try {
- SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
+ SimpleJSAP jsap = new SimpleJSAP(WriteNodeProperties.class.getName(), "", new Parameter[]{
new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"),
new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
"Basename of the output graph"),
new FlaggedOption("properties", JSAP.STRING_PARSER, "*", JSAP.NOT_REQUIRED, 'p', "properties",
"Properties to write, comma separated (default: all). Possible choices: "
+ String.join(",", PROPERTY_WRITERS)),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
System.err.println("Usage error: " + e.getMessage());
System.exit(1);
}
return config;
}
public static void main(String[] argv) throws IOException, ClassNotFoundException, NoSuchMethodException,
InvocationTargetException, IllegalAccessException {
JSAPResult args = parseArgs(argv);
String dataset = args.getString("dataset");
String graphBasename = args.getString("graphBasename");
NodeIdMap nodeIdMap = new NodeIdMap(graphBasename);
- Set properties;
+ Set<String> properties;
if (args.getString("properties").equals("*")) {
properties = Set.of(PROPERTY_WRITERS);
} else {
properties = new HashSet<>(Arrays.asList(args.getString("properties").split(",")));
}
WriteNodeProperties writer = new WriteNodeProperties(dataset, graphBasename, nodeIdMap);
if (properties.contains("timestamps")) {
writer.writeTimestamps();
}
if (properties.contains("content_length")) {
writer.writeContentLength();
}
if (properties.contains("content_is_skipped")) {
writer.writeContentIsSkipped();
}
if (properties.contains("person_ids")) {
writer.writePersonIds();
}
if (properties.contains("messages")) {
writer.writeMessages();
}
if (properties.contains("tag_names")) {
writer.writeTagNames();
}
}
public void writeContentLength() throws IOException {
logger.info("Writing content lengths");
long[][] valueArray = LongBigArrays.newBigArray(numNodes);
- BigArrays.fill(valueArray, Long.MIN_VALUE);
+ BigArrays.fill(valueArray, -1);
for (String tableName : new String[]{"content", "skipped_content"}) {
SwhOrcTable table = dataset.getTable(tableName);
table.readLongColumn("length", (swhid, value) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(valueArray, id, value);
});
}
BinIO.storeLongs(valueArray, graphBasename + ".property.content.length.bin");
}
public void writeContentIsSkipped() throws IOException {
LongArrayBitVector isSkippedBitVector = LongArrayBitVector.ofLength(numNodes);
SwhOrcTable table = dataset.getTable("skipped_content");
table.readIdColumn((swhid) -> {
long id = nodeIdMap.getNodeId(swhid);
isSkippedBitVector.set(id);
});
BinIO.storeObject(isSkippedBitVector, graphBasename + ".property.content.is_skipped.bin");
}
public void writeTimestamps() throws IOException {
logger.info("Writing author/committer timestamps for release + revision");
SwhOrcTable releaseTable = dataset.getTable("release");
SwhOrcTable revisionTable = dataset.getTable("revision");
long[][] timestampArray = LongBigArrays.newBigArray(numNodes);
short[][] timestampOffsetArray = ShortBigArrays.newBigArray(numNodes);
// Author timestamps
BigArrays.fill(timestampArray, Long.MIN_VALUE);
BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
releaseTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(timestampArray, id, date);
BigArrays.set(timestampOffsetArray, id, dateOffset);
});
revisionTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(timestampArray, id, date);
BigArrays.set(timestampOffsetArray, id, dateOffset);
});
BinIO.storeLongs(timestampArray, graphBasename + ".property.author_timestamp.bin");
BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.author_timestamp_offset.bin");
// Committer timestamps
BigArrays.fill(timestampArray, Long.MIN_VALUE);
BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
revisionTable.readTimestampColumn("committer_date", "committer_offset", (swhid, date, dateOffset) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(timestampArray, id, date);
BigArrays.set(timestampOffsetArray, id, dateOffset);
});
BinIO.storeLongs(timestampArray, graphBasename + ".property.committer_timestamp.bin");
BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.committer_timestamp_offset.bin");
}
public void writePersonIds() throws IOException {
logger.info("Writing author/committer IDs for release + revision");
Object2LongFunction personIdMap = NodeIdMap.loadMph(graphBasename + ".persons.mph");
SwhOrcTable releaseTable = dataset.getTable("release");
SwhOrcTable revisionTable = dataset.getTable("revision");
int[][] personArray = IntBigArrays.newBigArray(numNodes);
// Author IDs
BigArrays.fill(personArray, -1);
releaseTable.readBytes64Column("author", (swhid, personBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
});
revisionTable.readBytes64Column("author", (swhid, personBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
});
BinIO.storeInts(personArray, graphBasename + ".property.author_id.bin");
// Committer IDs
BigArrays.fill(personArray, -1);
revisionTable.readBytes64Column("committer", (swhid, personBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
});
BinIO.storeInts(personArray, graphBasename + ".property.committer_id.bin");
}
public void writeMessages() throws IOException {
logger.info("Writing messages for release + revision, and URLs for origins");
long[][] messageOffsetArray = LongBigArrays.newBigArray(numNodes);
BigArrays.fill(messageOffsetArray, -1);
FastBufferedOutputStream messageStream = new FastBufferedOutputStream(
new FileOutputStream(graphBasename + ".property.message.bin"));
AtomicLong offset = new AtomicLong(0L);
SwhOrcTable releaseTable = dataset.getTable("release");
releaseTable.readBytes64Column("message", (swhid, messageBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
messageStream.write(messageBase64);
messageStream.write('\n');
BigArrays.set(messageOffsetArray, id, offset.longValue());
offset.addAndGet(messageBase64.length + 1);
});
SwhOrcTable revisionTable = dataset.getTable("revision");
revisionTable.readBytes64Column("message", (swhid, messageBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
messageStream.write(messageBase64);
messageStream.write('\n');
BigArrays.set(messageOffsetArray, id, offset.longValue());
offset.addAndGet(messageBase64.length + 1);
});
OriginOrcTable originTable = (OriginOrcTable) dataset.getTable("origin");
originTable.readURLs((swhid, messageBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
messageStream.write(messageBase64);
messageStream.write('\n');
BigArrays.set(messageOffsetArray, id, offset.longValue());
offset.addAndGet(messageBase64.length + 1);
});
// TODO: check which one is optimal in terms of memory/disk usage, EF vs mapped file
BinIO.storeLongs(messageOffsetArray, graphBasename + ".property.message.offset.bin");
// EliasFanoLongBigList messageOffsetEF = new
// EliasFanoLongBigList(LongBigArrayBigList.wrap(messageOffsetArray));
// BinIO.storeObject(messageOffsetEF, graphBasename + ".property.message.offset.bin");
messageStream.close();
}
public void writeTagNames() throws IOException {
logger.info("Writing tag names for release");
long[][] tagNameOffsetArray = LongBigArrays.newBigArray(numNodes);
BigArrays.fill(tagNameOffsetArray, -1);
FastBufferedOutputStream tagNameStream = new FastBufferedOutputStream(
new FileOutputStream(graphBasename + ".property.tag_name.bin"));
AtomicLong offset = new AtomicLong(0L);
SwhOrcTable releaseTable = dataset.getTable("release");
releaseTable.readBytes64Column("name", (swhid, tagNameBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
tagNameStream.write(tagNameBase64);
tagNameStream.write('\n');
BigArrays.set(tagNameOffsetArray, id, offset.longValue());
offset.addAndGet(tagNameBase64.length + 1);
});
BinIO.storeLongs(tagNameOffsetArray, graphBasename + ".property.tag_name.offset.bin");
// EliasFanoLongBigList tagNameOffsetEF = new
// EliasFanoLongBigList(LongBigArrayBigList.wrap(tagNameOffsetArray));
// BinIO.storeObject(tagNameOffsetEF, graphBasename + ".property.tag_name.offset.bin");
tagNameStream.close();
}
}
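
On the writer side, the only sentinel that changes is the content-length fill: -1 instead of Long.MIN_VALUE, matching the reader's "res >= 0" test (a length is never negative, so any negative value can stand for "absent"). A tiny round-trip sketch of that pairing, using the same fastutil big-array calls as writeContentLength() (the class name is illustrative):

    import it.unimi.dsi.fastutil.BigArrays;
    import it.unimi.dsi.fastutil.longs.LongBigArrays;

    // Illustrative only: pairs the -1 fill from writeContentLength() with
    // the ">= 0" null-decoding from getContentLength().
    public class SentinelRoundTrip {
        public static void main(String[] args) {
            long[][] lengths = LongBigArrays.newBigArray(4);
            BigArrays.fill(lengths, -1);      // "absent" sentinel for every node
            BigArrays.set(lengths, 2, 1024L); // one node with a known length

            for (long i = 0; i < 4; i++) {
                long raw = BigArrays.get(lengths, i);
                Long decoded = (raw >= 0) ? raw : null;
                System.out.println("node " + i + ": " + decoded); // null, null, 1024, null
            }
        }
    }
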
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
index cfb47a4..7b02d76 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
@@ -1,79 +1,91 @@
package org.softwareheritage.graph.utils;
-import it.unimi.dsi.big.webgraph.NodeIterator;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
+import it.unimi.dsi.logging.ProgressLogger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.SwhUnidirectionalGraph;
import org.softwareheritage.graph.labels.DirEntry;
import java.io.IOException;
+import java.util.concurrent.TimeUnit;
public class DumpProperties {
+ final static Logger logger = LoggerFactory.getLogger(DumpProperties.class);
+
public static void main(String[] args) throws IOException {
String graphPath = args[0];
+ ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
SwhUnidirectionalGraph graph;
if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) {
- graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath);
+ graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl);
} else {
- graph = SwhUnidirectionalGraph.loadLabelled(graphPath);
+ graph = SwhUnidirectionalGraph.loadLabelled(graphPath, pl);
}
graph.loadContentLength();
graph.loadContentIsSkipped();
graph.loadPersonIds();
graph.loadAuthorTimestamps();
graph.loadCommitterTimestamps();
graph.loadMessages();
graph.loadTagNames();
graph.loadLabelNames();
- NodeIterator it = graph.nodeIterator();
+ ArcLabelledNodeIterator it = graph.labelledNodeIterator();
while (it.hasNext()) {
long node = it.nextLong();
System.out.format("%s: %s\n", node, graph.getSWHID(node));
- var s = graph.labelledSuccessors(node);
- long succ;
+ var s = it.successors();
System.out.println(" successors:");
- while ((succ = s.nextLong()) >= 0) {
+ for (long succ; (succ = s.nextLong()) >= 0;) {
DirEntry[] labels = (DirEntry[]) s.label().get();
if (labels.length > 0) {
for (DirEntry label : labels) {
System.out.format(" %s %s [perms: %s]\n", graph.getSWHID(succ),
new String(graph.getLabelName(label.filenameId)), label.permission);
}
} else {
System.out.format(" %s\n", graph.getSWHID(succ));
}
}
switch (graph.getNodeType(node)) {
case CNT:
System.out.format(" length: %s\n", graph.getContentLength(node));
System.out.format(" is_skipped: %s\n", graph.isContentSkipped(node));
break;
case REV:
System.out.format(" author: %s\n", graph.getAuthorId(node));
System.out.format(" committer: %s\n", graph.getCommitterId(node));
System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
graph.getAuthorTimestampOffset(node));
System.out.format(" committer_date: %s (offset: %s)\n", graph.getCommitterTimestamp(node),
graph.getCommitterTimestampOffset(node));
byte[] msg = graph.getMessage(node);
if (msg != null) {
System.out.format(" message: %s\n", (new String(msg)).replace("\n", "\\n"));
}
break;
case REL:
System.out.format(" author: %s\n", graph.getAuthorId(node));
System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
- graph.getAuthorTimestamp(node));
+ graph.getAuthorTimestampOffset(node));
- System.out.format(" message: %s\n", (new String(graph.getMessage(node))).replace("\n", "\\n"));
- System.out.format(" tag name: %s\n", new String(graph.getTagName(node)));
+ byte[] tagMsg = graph.getMessage(node);
+ if (tagMsg != null) {
+ System.out.format(" message: %s\n", (new String(tagMsg)).replace("\n", "\\n"));
+ }
+ byte[] tagName = graph.getTagName(node);
+ if (tagName != null) {
+ System.out.format(" message: %s\n", (new String(tagName)));
+ }
break;
case ORI:
System.out.format(" url: %s\n", graph.getUrl(node));
}
System.out.println();
}
}
}
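
With the null-aware getters, DumpProperties no longer crashes on releases that lack a message or a tag name; it simply omits those lines. Invocation is unchanged: the graph basename as the first argument, plus an optional --mapped/-m flag to memory-map the graph instead of loading it. A typical run (classpath setup omitted; the path is a placeholder):

    java org.softwareheritage.graph.utils.DumpProperties /srv/softwareheritage/graph/graph --mapped
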