diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
index 6d6848d..6d4db39 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
@@ -1,144 +1,144 @@
package org.softwareheritage.graph;
import java.io.IOException;
/**
* Common interface for SWH graph classes.
*
* This interface forwards all property loading/access methods to the SwhGraphProperties object
* returned by the getProperties() method of the implementing class. This allows API users to write
* graph.getNodeType() instead of graph.getProperties().getNodeType().
*/
public interface SwhGraph {
/**
* Cleans up graph resources after use.
*/
void close() throws IOException;
/**
* Returns the SWH graph properties object of this graph.
*
* @return graph properties
*/
SwhGraphProperties getProperties();
/** @see SwhGraphProperties#getPath() */
default String getPath() {
return getProperties().getPath();
}
/** @see SwhGraphProperties#getNodeId(SWHID) */
default long getNodeId(SWHID swhid) {
return getProperties().getNodeId(swhid);
}
/** @see SwhGraphProperties#getSWHID(long) */
default SWHID getSWHID(long nodeId) {
return getProperties().getSWHID(nodeId);
}
/** @see SwhGraphProperties#getNodeType(long) */
default Node.Type getNodeType(long nodeId) {
return getProperties().getNodeType(nodeId);
}
/** @see SwhGraphProperties#loadContentLength() */
default void loadContentLength() throws IOException {
getProperties().loadContentLength();
}
/** @see SwhGraphProperties#getContentLength(long) */
- default long getContentLength(long nodeId) {
+ default Long getContentLength(long nodeId) {
return getProperties().getContentLength(nodeId);
}
/** @see SwhGraphProperties#loadPersonIds() */
default void loadPersonIds() throws IOException {
getProperties().loadPersonIds();
}
/** @see SwhGraphProperties#getAuthorId(long) */
- default long getAuthorId(long nodeId) {
+ default Long getAuthorId(long nodeId) {
return getProperties().getAuthorId(nodeId);
}
/** @see SwhGraphProperties#getCommitterId(long) */
- default long getCommitterId(long nodeId) {
+ default Long getCommitterId(long nodeId) {
return getProperties().getCommitterId(nodeId);
}
/** @see SwhGraphProperties#loadContentIsSkipped() */
default void loadContentIsSkipped() throws IOException {
getProperties().loadContentIsSkipped();
}
/** @see SwhGraphProperties#isContentSkipped(long) */
default boolean isContentSkipped(long nodeId) {
return getProperties().isContentSkipped(nodeId);
}
/** @see SwhGraphProperties#loadAuthorTimestamps() */
default void loadAuthorTimestamps() throws IOException {
getProperties().loadAuthorTimestamps();
}
/** @see SwhGraphProperties#getAuthorTimestamp(long) */
- default long getAuthorTimestamp(long nodeId) {
+ default Long getAuthorTimestamp(long nodeId) {
return getProperties().getAuthorTimestamp(nodeId);
}
/** @see SwhGraphProperties#getAuthorTimestampOffset(long) */
- default short getAuthorTimestampOffset(long nodeId) {
+ default Short getAuthorTimestampOffset(long nodeId) {
return getProperties().getAuthorTimestampOffset(nodeId);
}
/** @see SwhGraphProperties#loadCommitterTimestamps() */
default void loadCommitterTimestamps() throws IOException {
getProperties().loadCommitterTimestamps();
}
/** @see SwhGraphProperties#getCommitterTimestamp(long) */
- default long getCommitterTimestamp(long nodeId) {
+ default Long getCommitterTimestamp(long nodeId) {
return getProperties().getCommitterTimestamp(nodeId);
}
/** @see SwhGraphProperties#getCommitterTimestampOffset(long) */
- default short getCommitterTimestampOffset(long nodeId) {
+ default Short getCommitterTimestampOffset(long nodeId) {
return getProperties().getCommitterTimestampOffset(nodeId);
}
/** @see SwhGraphProperties#loadMessages() */
default void loadMessages() throws IOException {
getProperties().loadMessages();
}
/** @see SwhGraphProperties#getMessage(long) */
default byte[] getMessage(long nodeId) throws IOException {
return getProperties().getMessage(nodeId);
}
/** @see SwhGraphProperties#getUrl(long) */
default String getUrl(long nodeId) throws IOException {
return getProperties().getUrl(nodeId);
}
/** @see SwhGraphProperties#loadTagNames() */
default void loadTagNames() throws IOException {
getProperties().loadTagNames();
}
/** @see SwhGraphProperties#getTagName(long) */
default byte[] getTagName(long nodeId) throws IOException {
return getProperties().getTagName(nodeId);
}
/** @see SwhGraphProperties#loadLabelNames() */
default void loadLabelNames() throws IOException {
getProperties().loadLabelNames();
}
/** @see SwhGraphProperties#getLabelName(long) */
default byte[] getLabelName(long labelId) {
return getProperties().getLabelName(labelId);
}
}
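Note on the hunk above: the accessors now return boxed types (Long, Short) so that a missing property can surface as null instead of a sentinel value. A minimal caller sketch, not part of the patch, assuming the SwhUnidirectionalGraph.loadMapped entry point and the SWHID(String) constructor from the rest of the codebase, with placeholder command-line arguments:

import java.io.IOException;
import org.softwareheritage.graph.SWHID;
import org.softwareheritage.graph.SwhUnidirectionalGraph;

public class ContentLengthExample {
    public static void main(String[] args) throws IOException {
        // args[0]: graph basename, args[1]: a content SWHID string
        SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.loadMapped(args[0]);
        graph.loadContentLength();
        long nodeId = graph.getNodeId(new SWHID(args[1]));
        // The default method forwards to getProperties().getContentLength(nodeId):
        Long length = graph.getContentLength(nodeId);
        System.out.println(length != null ? "length: " + length : "length unknown");
        graph.close();
    }
}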
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
index be43b61..b569bb5 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
@@ -1,313 +1,324 @@
package org.softwareheritage.graph;
import it.unimi.dsi.big.util.MappedFrontCodedStringBigList;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.bytes.ByteBigList;
import it.unimi.dsi.fastutil.bytes.ByteMappedBigList;
import it.unimi.dsi.fastutil.ints.IntBigList;
import it.unimi.dsi.fastutil.ints.IntMappedBigList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.longs.LongMappedBigList;
import it.unimi.dsi.fastutil.shorts.ShortBigList;
import it.unimi.dsi.fastutil.shorts.ShortMappedBigList;
import it.unimi.dsi.sux4j.util.EliasFanoLongBigList;
import org.apache.commons.configuration2.ex.ConfigurationException;
import org.softwareheritage.graph.maps.NodeIdMap;
import org.softwareheritage.graph.maps.NodeTypesMap;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Base64;
/**
* This object contains SWH graph properties such as node labels.
*
* Some property mappings are necessary because Software Heritage uses string-based <a href=
* "https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html">persistent
* identifiers</a> (SWHID) while WebGraph uses integers internally.
*
* The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph
* using SWHID) and the output (converting results back to SWHID for users).
*
* Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a
* long id → node type map is stored as well to avoid a full SWHID lookup.
*
* @see NodeIdMap
* @see NodeTypesMap
*/
public class SwhGraphProperties {
private final String path;
private final NodeIdMap nodeIdMap;
private final NodeTypesMap nodeTypesMap;
private LongBigList authorTimestamp;
private ShortBigList authorTimestampOffset;
private LongBigList committerTimestamp;
private ShortBigList committerTimestampOffset;
private LongBigList contentLength;
private LongArrayBitVector contentIsSkipped;
private IntBigList authorId;
private IntBigList committerId;
private ByteBigList messageBuffer;
private LongBigList messageOffsets;
private ByteBigList tagNameBuffer;
private LongBigList tagNameOffsets;
private MappedFrontCodedStringBigList edgeLabelNames;
protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
this.path = path;
this.nodeIdMap = nodeIdMap;
this.nodeTypesMap = nodeTypesMap;
}
public static SwhGraphProperties load(String path) throws IOException {
return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path));
}
/**
* Cleans up resources after use.
*/
public void close() throws IOException {
nodeIdMap.close();
edgeLabelNames.close();
}
/** Return the basename of the compressed graph */
public String getPath() {
return path;
}
/**
* Converts {@link SWHID} node to long.
*
* @param swhid node specified as a {@link SWHID}
* @return internal long node id
* @see SWHID
*/
public long getNodeId(SWHID swhid) {
return nodeIdMap.getNodeId(swhid);
}
/**
* Converts long id node to {@link SWHID}.
*
* @param nodeId node specified as a long id
* @return external SWHID
* @see SWHID
*/
public SWHID getSWHID(long nodeId) {
return nodeIdMap.getSWHID(nodeId);
}
/**
* Returns node type.
*
* @param nodeId node specified as a long id
* @return corresponding node type
* @see Node.Type
*/
public Node.Type getNodeType(long nodeId) {
return nodeTypesMap.getType(nodeId);
}
private static LongBigList loadMappedLongs(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return LongMappedBigList.map(raf.getChannel());
}
}
private static IntBigList loadMappedInts(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return IntMappedBigList.map(raf.getChannel());
}
}
private static ShortBigList loadMappedShorts(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return ShortMappedBigList.map(raf.getChannel());
}
}
private static ByteBigList loadMappedBytes(String path) throws IOException {
try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
return ByteMappedBigList.map(raf.getChannel());
}
}
private static LongBigList loadEFLongs(String path) throws IOException {
try {
return (EliasFanoLongBigList) BinIO.loadObject(path);
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
private static byte[] getLine(ByteBigList byteArray, long start) {
long end = start;
while (end < byteArray.size64() && byteArray.getByte(end) != '\n') {
end++;
}
int length = (int) (end - start);
byte[] buffer = new byte[length];
byteArray.getElements(start, buffer, 0, length);
return buffer;
}
/** Load the sizes of the content nodes */
public void loadContentLength() throws IOException {
contentLength = loadMappedLongs(path + ".property.content.length.bin");
}
/** Get the size (in bytes) of the given content node */
- public long getContentLength(long nodeId) {
- return contentLength.getLong(nodeId);
+ public Long getContentLength(long nodeId) {
+ if (contentLength == null) {
+ throw new IllegalStateException("Content lengths not loaded");
+ }
+ long res = contentLength.getLong(nodeId);
+ return (res >= 0) ? res : null;
}
/** Load the IDs of the persons (authors and committers) */
public void loadPersonIds() throws IOException {
authorId = loadMappedInts(path + ".property.author_id.bin");
committerId = loadMappedInts(path + ".property.committer_id.bin");
}
/** Get a unique integer ID representing the author of the given revision or release node */
- public long getAuthorId(long nodeId) {
+ public Long getAuthorId(long nodeId) {
if (authorId == null) {
throw new IllegalStateException("Author IDs not loaded");
}
- return authorId.getInt(nodeId);
+ long res = authorId.getInt(nodeId);
+ return (res >= 0) ? res : null;
}
/** Get a unique integer ID representing the committer of the given revision node */
- public long getCommitterId(long nodeId) {
+ public Long getCommitterId(long nodeId) {
if (committerId == null) {
throw new IllegalStateException("Committer IDs not loaded");
}
- return committerId.getInt(nodeId);
+ long res = committerId.getInt(nodeId);
+ return (res >= 0) ? res : null;
}
/**
* Loads a boolean array indicating whether the given content node was skipped during archive
* ingestion
*/
public void loadContentIsSkipped() throws IOException {
try {
contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin");
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
/** Returns whether the given content node was skipped during archive ingestion */
public boolean isContentSkipped(long nodeId) {
if (contentIsSkipped == null) {
throw new IllegalStateException("Skipped content array not loaded");
}
return contentIsSkipped.getBoolean(nodeId);
}
/** Load the timestamps at which the releases and revisions were authored */
public void loadAuthorTimestamps() throws IOException {
authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin");
authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin");
}
/** Return the timestamp at which the given revision or release was authored */
- public long getAuthorTimestamp(long nodeId) {
+ public Long getAuthorTimestamp(long nodeId) {
if (authorTimestamp == null) {
throw new IllegalStateException("Author timestamps not loaded");
}
- return authorTimestamp.getLong(nodeId);
+ long res = authorTimestamp.getLong(nodeId);
+ return (res > Long.MIN_VALUE) ? res : null;
}
/** Return the timestamp offset at which the given revision or release was authored */
- public short getAuthorTimestampOffset(long nodeId) {
+ public Short getAuthorTimestampOffset(long nodeId) {
if (authorTimestampOffset == null) {
throw new IllegalStateException("Author timestamp offsets not loaded");
}
- return authorTimestampOffset.getShort(nodeId);
+ short res = authorTimestampOffset.getShort(nodeId);
+ return (res > Short.MIN_VALUE) ? res : null;
}
/** Load the timestamps at which the releases and revisions were committed */
public void loadCommitterTimestamps() throws IOException {
committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin");
committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin");
}
/** Return the timestamp at which the given revision was committed */
- public long getCommitterTimestamp(long nodeId) {
+ public Long getCommitterTimestamp(long nodeId) {
if (committerTimestamp == null) {
throw new IllegalStateException("Committer timestamps not loaded");
}
- return committerTimestamp.getLong(nodeId);
+ long res = committerTimestamp.getLong(nodeId);
+ return (res > Long.MIN_VALUE) ? res : null;
}
/** Return the timestamp offset at which the given revision was committed */
- public short getCommitterTimestampOffset(long nodeId) {
+ public Short getCommitterTimestampOffset(long nodeId) {
if (committerTimestampOffset == null) {
throw new IllegalStateException("Committer timestamp offsets not loaded");
}
- return committerTimestampOffset.getShort(nodeId);
+ short res = committerTimestampOffset.getShort(nodeId);
+ return (res > Short.MIN_VALUE) ? res : null;
}
/** Load the revision messages, the release messages and the origin URLs */
public void loadMessages() throws IOException {
messageBuffer = loadMappedBytes(path + ".property.message.bin");
messageOffsets = loadMappedLongs(path + ".property.message.offset.bin");
}
/** Get the message of the given revision or release node */
public byte[] getMessage(long nodeId) throws IOException {
if (messageBuffer == null || messageOffsets == null) {
throw new IllegalStateException("Messages not loaded");
}
long startOffset = messageOffsets.getLong(nodeId);
if (startOffset == -1) {
return null;
}
return Base64.getDecoder().decode(getLine(messageBuffer, startOffset));
}
/** Get the URL of the given origin node */
public String getUrl(long nodeId) throws IOException {
- return new String(getMessage(nodeId));
+ byte[] url = getMessage(nodeId);
+ return (url != null) ? new String(url) : null;
}
/** Load the release names */
public void loadTagNames() throws IOException {
tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin");
tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin");
}
/** Get the name of the given release node */
public byte[] getTagName(long nodeId) throws IOException {
if (tagNameBuffer == null || tagNameOffsets == null) {
throw new IllegalStateException("Tag names not loaded");
}
long startOffset = tagNameOffsets.getLong(nodeId);
if (startOffset == -1) {
return null;
}
return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset));
}
/** Load the arc label names (directory entry names and snapshot branch names) */
public void loadLabelNames() throws IOException {
try {
edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl");
} catch (ConfigurationException e) {
throw new IOException(e);
}
}
/**
* Get the arc label name (either a directory entry name or snapshot branch name) associated with
* the given label ID
*/
public byte[] getLabelName(long labelId) {
if (edgeLabelNames == null) {
throw new IllegalStateException("Label names not loaded");
}
return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId));
}
}
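All getters changed in this hunk share one convention: a primitive sentinel written at compression time (-1 for content lengths and person IDs, Long.MIN_VALUE / Short.MIN_VALUE for timestamps) is mapped to null at read time. A self-contained sketch of that convention, detached from the mapped-file machinery; the names here are illustrative only:

import java.util.Arrays;

public class SentinelToNullSketch {
    // Mirrors getContentLength(): -1 in the on-disk array means "no value".
    static Long lengthOrNull(long[] lengths, int nodeId) {
        long res = lengths[nodeId];
        return (res >= 0) ? res : null;
    }

    // Mirrors getAuthorTimestamp(): Long.MIN_VALUE means "no value".
    static Long timestampOrNull(long[] timestamps, int nodeId) {
        long res = timestamps[nodeId];
        return (res > Long.MIN_VALUE) ? res : null;
    }

    public static void main(String[] args) {
        long[] lengths = new long[4];
        Arrays.fill(lengths, -1);      // writer-side default, as in WriteNodeProperties below
        lengths[2] = 1337;             // only node 2 has a known length
        System.out.println(lengthOrNull(lengths, 2)); // 1337
        System.out.println(lengthOrNull(lengths, 0)); // null
    }
}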
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
index dd20648..fed8608 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
@@ -1,268 +1,268 @@
package org.softwareheritage.graph.compress;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.ints.IntBigArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.shorts.ShortBigArrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.maps.NodeIdMap;
import org.softwareheritage.graph.compress.ORCGraphDataset.*;
import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
/**
* This class is used to extract the node properties from the graph dataset, and write them to a set
* of property files.
*
* Note: because the nodes are not sorted by type, we have an incentive to minimize the number of
* "holes" in offset arrays. This is why many unrelated properties are cobbled together in the same
* files (e.g. commit messages, tag messages and origin URLs are all in a "message" property file).
* Once we migrate to a TypedImmutableGraph as the underlying storage of the graph, we can split all
* the different properties in their own files.
*/
public class WriteNodeProperties {
final static Logger logger = LoggerFactory.getLogger(WriteNodeProperties.class);
private final ORCGraphDataset dataset;
private final String graphBasename;
private final NodeIdMap nodeIdMap;
private final long numNodes;
public WriteNodeProperties(String dataset, String graphBasename, NodeIdMap nodeIdMap) {
this.dataset = new ORCGraphDataset(dataset);
this.graphBasename = graphBasename;
this.nodeIdMap = nodeIdMap;
this.numNodes = nodeIdMap.size64();
}
public static String[] PROPERTY_WRITERS = new String[]{"timestamps", "content_length", "content_is_skipped",
"person_ids", "messages", "tag_names",};
private static JSAPResult parseArgs(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(WriteNodeProperties.class.getName(), "", new Parameter[]{
new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"),
new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
"Basename of the output graph"),
new FlaggedOption("properties", JSAP.STRING_PARSER, "*", JSAP.NOT_REQUIRED, 'p', "properties",
"Properties to write, comma separated (default: all). Possible choices: "
+ String.join(",", PROPERTY_WRITERS)),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
System.err.println("Usage error: " + e.getMessage());
System.exit(1);
}
return config;
}
public static void main(String[] argv) throws IOException, ClassNotFoundException, NoSuchMethodException,
InvocationTargetException, IllegalAccessException {
JSAPResult args = parseArgs(argv);
String dataset = args.getString("dataset");
String graphBasename = args.getString("graphBasename");
NodeIdMap nodeIdMap = new NodeIdMap(graphBasename);
Set<String> properties;
if (args.getString("properties").equals("*")) {
properties = Set.of(PROPERTY_WRITERS);
} else {
properties = new HashSet<>(Arrays.asList(args.getString("properties").split(",")));
}
WriteNodeProperties writer = new WriteNodeProperties(dataset, graphBasename, nodeIdMap);
if (properties.contains("timestamps")) {
writer.writeTimestamps();
}
if (properties.contains("content_length")) {
writer.writeContentLength();
}
if (properties.contains("content_is_skipped")) {
writer.writeContentIsSkipped();
}
if (properties.contains("person_ids")) {
writer.writePersonIds();
}
if (properties.contains("messages")) {
writer.writeMessages();
}
if (properties.contains("tag_names")) {
writer.writeTagNames();
}
}
public void writeContentLength() throws IOException {
logger.info("Writing content lengths");
long[][] valueArray = LongBigArrays.newBigArray(numNodes);
- BigArrays.fill(valueArray, Long.MIN_VALUE);
+ BigArrays.fill(valueArray, -1);
for (String tableName : new String[]{"content", "skipped_content"}) {
SwhOrcTable table = dataset.getTable(tableName);
table.readLongColumn("length", (swhid, value) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(valueArray, id, value);
});
}
BinIO.storeLongs(valueArray, graphBasename + ".property.content.length.bin");
}
public void writeContentIsSkipped() throws IOException {
LongArrayBitVector isSkippedBitVector = LongArrayBitVector.ofLength(numNodes);
SwhOrcTable table = dataset.getTable("skipped_content");
table.readIdColumn((swhid) -> {
long id = nodeIdMap.getNodeId(swhid);
isSkippedBitVector.set(id);
});
BinIO.storeObject(isSkippedBitVector, graphBasename + ".property.content.is_skipped.bin");
}
public void writeTimestamps() throws IOException {
logger.info("Writing author/committer timestamps for release + revision");
SwhOrcTable releaseTable = dataset.getTable("release");
SwhOrcTable revisionTable = dataset.getTable("revision");
long[][] timestampArray = LongBigArrays.newBigArray(numNodes);
short[][] timestampOffsetArray = ShortBigArrays.newBigArray(numNodes);
// Author timestamps
BigArrays.fill(timestampArray, Long.MIN_VALUE);
BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
releaseTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(timestampArray, id, date);
BigArrays.set(timestampOffsetArray, id, dateOffset);
});
revisionTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(timestampArray, id, date);
BigArrays.set(timestampOffsetArray, id, dateOffset);
});
BinIO.storeLongs(timestampArray, graphBasename + ".property.author_timestamp.bin");
BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.author_timestamp_offset.bin");
// Committer timestamps
BigArrays.fill(timestampArray, Long.MIN_VALUE);
BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
revisionTable.readTimestampColumn("committer_date", "committer_offset", (swhid, date, dateOffset) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(timestampArray, id, date);
BigArrays.set(timestampOffsetArray, id, dateOffset);
});
BinIO.storeLongs(timestampArray, graphBasename + ".property.committer_timestamp.bin");
BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.committer_timestamp_offset.bin");
}
public void writePersonIds() throws IOException {
logger.info("Writing author/committer IDs for release + revision");
Object2LongFunction<byte[]> personIdMap = NodeIdMap.loadMph(graphBasename + ".persons.mph");
SwhOrcTable releaseTable = dataset.getTable("release");
SwhOrcTable revisionTable = dataset.getTable("revision");
int[][] personArray = IntBigArrays.newBigArray(numNodes);
// Author IDs
BigArrays.fill(personArray, -1);
releaseTable.readBytes64Column("author", (swhid, personBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
});
revisionTable.readBytes64Column("author", (swhid, personBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
});
BinIO.storeInts(personArray, graphBasename + ".property.author_id.bin");
// Committer IDs
BigArrays.fill(personArray, -1);
revisionTable.readBytes64Column("committer", (swhid, personBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
});
BinIO.storeInts(personArray, graphBasename + ".property.committer_id.bin");
}
public void writeMessages() throws IOException {
logger.info("Writing messages for release + revision, and URLs for origins");
long[][] messageOffsetArray = LongBigArrays.newBigArray(numNodes);
BigArrays.fill(messageOffsetArray, -1);
FastBufferedOutputStream messageStream = new FastBufferedOutputStream(
new FileOutputStream(graphBasename + ".property.message.bin"));
AtomicLong offset = new AtomicLong(0L);
SwhOrcTable releaseTable = dataset.getTable("release");
releaseTable.readBytes64Column("message", (swhid, messageBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
messageStream.write(messageBase64);
messageStream.write('\n');
BigArrays.set(messageOffsetArray, id, offset.longValue());
offset.addAndGet(messageBase64.length + 1);
});
SwhOrcTable revisionTable = dataset.getTable("revision");
revisionTable.readBytes64Column("message", (swhid, messageBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
messageStream.write(messageBase64);
messageStream.write('\n');
BigArrays.set(messageOffsetArray, id, offset.longValue());
offset.addAndGet(messageBase64.length + 1);
});
OriginOrcTable originTable = (OriginOrcTable) dataset.getTable("origin");
originTable.readURLs((swhid, messageBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
messageStream.write(messageBase64);
messageStream.write('\n');
BigArrays.set(messageOffsetArray, id, offset.longValue());
offset.addAndGet(messageBase64.length + 1);
});
// TODO: check which one is optimal in terms of memory/disk usage, EF vs mapped file
BinIO.storeLongs(messageOffsetArray, graphBasename + ".property.message.offset.bin");
// EliasFanoLongBigList messageOffsetEF = new
// EliasFanoLongBigList(LongBigArrayBigList.wrap(messageOffsetArray));
// BinIO.storeObject(messageOffsetEF, graphBasename + ".property.message.offset.bin");
messageStream.close();
}
public void writeTagNames() throws IOException {
logger.info("Writing tag names for release");
long[][] tagNameOffsetArray = LongBigArrays.newBigArray(numNodes);
BigArrays.fill(tagNameOffsetArray, -1);
FastBufferedOutputStream tagNameStream = new FastBufferedOutputStream(
new FileOutputStream(graphBasename + ".property.tag_name.bin"));
AtomicLong offset = new AtomicLong(0L);
SwhOrcTable releaseTable = dataset.getTable("release");
releaseTable.readBytes64Column("name", (swhid, tagNameBase64) -> {
long id = nodeIdMap.getNodeId(swhid);
tagNameStream.write(tagNameBase64);
tagNameStream.write('\n');
BigArrays.set(tagNameOffsetArray, id, offset.longValue());
offset.addAndGet(tagNameBase64.length + 1);
});
BinIO.storeLongs(tagNameOffsetArray, graphBasename + ".property.tag_name.offset.bin");
// EliasFanoLongBigList tagNameOffsetEF = new
// EliasFanoLongBigList(LongBigArrayBigList.wrap(tagNameOffsetArray));
// BinIO.storeObject(tagNameOffsetEF, graphBasename + ".property.tag_name.offset.bin");
tagNameStream.close();
}
}
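WriteNodeProperties stores each variable-length property (messages, tag names, origin URLs) as newline-separated Base64 lines in a single buffer, plus a per-node offset array in which -1 marks a missing value; getLine() and getMessage() in SwhGraphProperties above seek to the stored offset and decode. A minimal in-memory round-trip of that encoding, with made-up data:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Base64;

public class MessageBufferSketch {
    public static void main(String[] args) throws IOException {
        String[] messages = {"first commit", null, "fix: typo"};
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        long[] offsets = new long[messages.length];
        long offset = 0;
        for (int node = 0; node < messages.length; node++) {
            if (messages[node] == null) {
                offsets[node] = -1;    // same sentinel the real writer uses
                continue;
            }
            byte[] b64 = Base64.getEncoder().encode(messages[node].getBytes(StandardCharsets.UTF_8));
            buffer.write(b64);
            buffer.write('\n');        // line separator, as in writeMessages()
            offsets[node] = offset;
            offset += b64.length + 1;
        }
        // Read node 2 back the way getMessage()/getLine() do on the mapped file:
        byte[] data = buffer.toByteArray();
        int start = (int) offsets[2];
        int end = start;
        while (end < data.length && data[end] != '\n') end++;
        byte[] line = Arrays.copyOfRange(data, start, end);
        System.out.println(new String(Base64.getDecoder().decode(line), StandardCharsets.UTF_8));
    }
}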
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
index cfb47a4..7b02d76 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
@@ -1,79 +1,91 @@
package org.softwareheritage.graph.utils;
-import it.unimi.dsi.big.webgraph.NodeIterator;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
+import it.unimi.dsi.logging.ProgressLogger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.softwareheritage.graph.SwhUnidirectionalGraph;
import org.softwareheritage.graph.labels.DirEntry;
import java.io.IOException;
+import java.util.concurrent.TimeUnit;
public class DumpProperties {
+ final static Logger logger = LoggerFactory.getLogger(DumpProperties.class);
+
public static void main(String[] args) throws IOException {
String graphPath = args[0];
+ ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
SwhUnidirectionalGraph graph;
if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) {
- graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath);
+ graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl);
} else {
- graph = SwhUnidirectionalGraph.loadLabelled(graphPath);
+ graph = SwhUnidirectionalGraph.loadLabelled(graphPath, pl);
}
graph.loadContentLength();
graph.loadContentIsSkipped();
graph.loadPersonIds();
graph.loadAuthorTimestamps();
graph.loadCommitterTimestamps();
graph.loadMessages();
graph.loadTagNames();
graph.loadLabelNames();
- NodeIterator it = graph.nodeIterator();
+ ArcLabelledNodeIterator it = graph.labelledNodeIterator();
while (it.hasNext()) {
long node = it.nextLong();
System.out.format("%s: %s\n", node, graph.getSWHID(node));
- var s = graph.labelledSuccessors(node);
- long succ;
+ var s = it.successors();
System.out.println(" successors:");
- while ((succ = s.nextLong()) >= 0) {
+ for (long succ; (succ = s.nextLong()) >= 0;) {
DirEntry[] labels = (DirEntry[]) s.label().get();
if (labels.length > 0) {
for (DirEntry label : labels) {
System.out.format(" %s %s [perms: %s]\n", graph.getSWHID(succ),
new String(graph.getLabelName(label.filenameId)), label.permission);
}
} else {
System.out.format(" %s\n", graph.getSWHID(succ));
}
}
switch (graph.getNodeType(node)) {
case CNT:
System.out.format(" length: %s\n", graph.getContentLength(node));
System.out.format(" is_skipped: %s\n", graph.isContentSkipped(node));
break;
case REV:
System.out.format(" author: %s\n", graph.getAuthorId(node));
System.out.format(" committer: %s\n", graph.getCommitterId(node));
System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
graph.getAuthorTimestampOffset(node));
System.out.format(" committer_date: %s (offset: %s)\n", graph.getCommitterTimestamp(node),
graph.getCommitterTimestampOffset(node));
byte[] msg = graph.getMessage(node);
if (msg != null) {
System.out.format(" message: %s\n", (new String(msg)).replace("\n", "\\n"));
}
break;
case REL:
System.out.format(" author: %s\n", graph.getAuthorId(node));
System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
graph.getAuthorTimestampOffset(node));
- System.out.format(" message: %s\n", (new String(graph.getMessage(node))).replace("\n", "\\n"));
- System.out.format(" tag name: %s\n", new String(graph.getTagName(node)));
+ byte[] tagMsg = graph.getMessage(node);
+ if (tagMsg != null) {
+ System.out.format(" message: %s\n", (new String(tagMsg)).replace("\n", "\\n"));
+ }
+ byte[] tagName = graph.getTagName(node);
+ if (tagName != null) {
+ System.out.format(" message: %s\n", (new String(tagName)));
+ }
break;
case ORI:
System.out.format(" url: %s\n", graph.getUrl(node));
}
System.out.println();
}
}
}
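For reference, DumpProperties reads the graph basename from its first argument and accepts an optional --mapped / -m flag to memory-map the graph rather than load it fully; a hypothetical invocation, with placeholder classpath and basename:

java -cp swh-graph.jar org.softwareheritage.graph.utils.DumpProperties /srv/graph/basename --mapped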