diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
index 6d6848d..6d4db39 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
@@ -1,144 +1,144 @@
 package org.softwareheritage.graph;
 
 import java.io.IOException;
 
 /**
  * Common interface for SWH graph classes.
  *
  * This interface forwards all property loading/access methods to the SwhGraphProperties object
  * returned by the getProperties() method of the implementing class. This allows API users to write
  * graph.getNodeType() instead of graph.getProperties().getNodeType().
  */
 public interface SwhGraph {
     /**
      * Cleans up graph resources after use.
      */
     void close() throws IOException;
 
     /**
      * Returns the SWH graph properties object of this graph.
      *
      * @return graph properties
      */
     SwhGraphProperties getProperties();
 
     /** @see SwhGraphProperties#getPath() */
     default String getPath() {
         return getProperties().getPath();
     }
 
     /** @see SwhGraphProperties#getNodeId(SWHID) */
     default long getNodeId(SWHID swhid) {
         return getProperties().getNodeId(swhid);
     }
 
     /** @see SwhGraphProperties#getSWHID(long) */
     default SWHID getSWHID(long nodeId) {
         return getProperties().getSWHID(nodeId);
     }
 
     /** @see SwhGraphProperties#getNodeType(long) */
     default Node.Type getNodeType(long nodeId) {
         return getProperties().getNodeType(nodeId);
     }
 
     /** @see SwhGraphProperties#loadContentLength() */
     default void loadContentLength() throws IOException {
         getProperties().loadContentLength();
     }
 
     /** @see SwhGraphProperties#getContentLength(long) */
-    default long getContentLength(long nodeId) {
+    default Long getContentLength(long nodeId) {
         return getProperties().getContentLength(nodeId);
     }
 
     /** @see SwhGraphProperties#loadPersonIds() */
     default void loadPersonIds() throws IOException {
         getProperties().loadPersonIds();
     }
 
     /** @see SwhGraphProperties#getAuthorId(long) */
-    default long getAuthorId(long nodeId) {
+    default Long getAuthorId(long nodeId) {
         return getProperties().getAuthorId(nodeId);
     }
 
     /** @see SwhGraphProperties#getCommitterId(long) */
-    default long getCommitterId(long nodeId) {
+    default Long getCommitterId(long nodeId) {
         return getProperties().getCommitterId(nodeId);
     }
 
     /** @see SwhGraphProperties#loadContentIsSkipped() */
     default void loadContentIsSkipped() throws IOException {
         getProperties().loadContentIsSkipped();
     }
 
     /** @see SwhGraphProperties#isContentSkipped(long) */
     default boolean isContentSkipped(long nodeId) {
         return getProperties().isContentSkipped(nodeId);
     }
 
     /** @see SwhGraphProperties#loadAuthorTimestamps() */
     default void loadAuthorTimestamps() throws IOException {
         getProperties().loadAuthorTimestamps();
     }
 
     /** @see SwhGraphProperties#getAuthorTimestamp(long) */
-    default long getAuthorTimestamp(long nodeId) {
+    default Long getAuthorTimestamp(long nodeId) {
         return getProperties().getAuthorTimestamp(nodeId);
     }
 
     /** @see SwhGraphProperties#getAuthorTimestampOffset(long) */
-    default short getAuthorTimestampOffset(long nodeId) {
+    default Short getAuthorTimestampOffset(long nodeId) {
         return getProperties().getAuthorTimestampOffset(nodeId);
     }
 
     /** @see SwhGraphProperties#loadCommitterTimestamps() */
     default void loadCommitterTimestamps() throws IOException {
         getProperties().loadCommitterTimestamps();
     }
 
     /** @see SwhGraphProperties#getCommitterTimestamp(long) */
-    default long getCommitterTimestamp(long nodeId) {
+    default Long getCommitterTimestamp(long nodeId) {
         return getProperties().getCommitterTimestamp(nodeId);
     }
 
     /** @see SwhGraphProperties#getCommitterTimestampOffset(long) */
-    default short getCommitterTimestampOffset(long nodeId) {
+    default Short getCommitterTimestampOffset(long nodeId) {
         return getProperties().getCommitterTimestampOffset(nodeId);
     }
 
     /** @see SwhGraphProperties#loadMessages() */
     default void loadMessages() throws IOException {
         getProperties().loadMessages();
     }
 
     /** @see SwhGraphProperties#getMessage(long) */
     default byte[] getMessage(long nodeId) throws IOException {
         return getProperties().getMessage(nodeId);
     }
 
     /** @see SwhGraphProperties#getUrl(long) */
     default String getUrl(long nodeId) throws IOException {
         return getProperties().getUrl(nodeId);
     }
 
     /** @see SwhGraphProperties#loadTagNames() */
     default void loadTagNames() throws IOException {
         getProperties().loadTagNames();
     }
 
     /** @see SwhGraphProperties#getTagName(long) */
     default byte[] getTagName(long nodeId) throws IOException {
         return getProperties().getTagName(nodeId);
     }
 
     /** @see SwhGraphProperties#loadLabelNames() */
     default void loadLabelNames() throws IOException {
         getProperties().loadLabelNames();
     }
 
     /** @see SwhGraphProperties#getLabelName(long) */
     default byte[] getLabelName(long labelId) {
         return getProperties().getLabelName(labelId);
     }
 }
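Reviewer note: with the switch above from primitive to boxed return types, a null result now signals "property absent for this node". A minimal caller-side sketch of the new contract — the class name and printing are illustrative, not part of the patch; it assumes loadContentLength() has already been called on the graph:

```java
import org.softwareheritage.graph.SwhGraph;

public class ContentLengthExample {
    // `graph` is any SwhGraph implementation; `nodeId` is assumed to be a
    // content node. Before this patch the getter returned a primitive long
    // and callers had to know the on-disk sentinel for "missing".
    static void printLength(SwhGraph graph, long nodeId) {
        Long length = graph.getContentLength(nodeId);
        if (length == null) {
            System.out.println("length unknown");   // property absent for this node
        } else {
            System.out.println("length: " + length);
        }
    }
}
```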
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
index be43b61..b569bb5 100644
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
@@ -1,313 +1,324 @@
 package org.softwareheritage.graph;
 
 import it.unimi.dsi.big.util.MappedFrontCodedStringBigList;
 import it.unimi.dsi.bits.LongArrayBitVector;
 import it.unimi.dsi.fastutil.bytes.ByteBigList;
 import it.unimi.dsi.fastutil.bytes.ByteMappedBigList;
 import it.unimi.dsi.fastutil.ints.IntBigList;
 import it.unimi.dsi.fastutil.ints.IntMappedBigList;
 import it.unimi.dsi.fastutil.io.BinIO;
 import it.unimi.dsi.fastutil.longs.LongBigList;
 import it.unimi.dsi.fastutil.longs.LongMappedBigList;
 import it.unimi.dsi.fastutil.shorts.ShortBigList;
 import it.unimi.dsi.fastutil.shorts.ShortMappedBigList;
 import it.unimi.dsi.sux4j.util.EliasFanoLongBigList;
 import org.apache.commons.configuration2.ex.ConfigurationException;
 import org.softwareheritage.graph.maps.NodeIdMap;
 import org.softwareheritage.graph.maps.NodeTypesMap;
 
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.util.Base64;
 
 /**
  * This objects contains SWH graph properties such as node labels.
  *
  * Some property mappings are necessary because Software Heritage uses string based persistent
  * identifiers (SWHID) while WebGraph uses integers internally.
  *
  * The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph
  * using SWHID) and the output (convert back to SWHID for users results).
  *
  * Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a
  * long id → node type map is stored as well to avoid a full SWHID lookup.
  *
  * @see NodeIdMap
  * @see NodeTypesMap
  */
 public class SwhGraphProperties {
     private final String path;
     private final NodeIdMap nodeIdMap;
     private final NodeTypesMap nodeTypesMap;
     private LongBigList authorTimestamp;
     private ShortBigList authorTimestampOffset;
     private LongBigList committerTimestamp;
     private ShortBigList committerTimestampOffset;
     private LongBigList contentLength;
     private LongArrayBitVector contentIsSkipped;
     private IntBigList authorId;
     private IntBigList committerId;
     private ByteBigList messageBuffer;
     private LongBigList messageOffsets;
     private ByteBigList tagNameBuffer;
     private LongBigList tagNameOffsets;
     private MappedFrontCodedStringBigList edgeLabelNames;
 
     protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
         this.path = path;
         this.nodeIdMap = nodeIdMap;
         this.nodeTypesMap = nodeTypesMap;
     }
 
     public static SwhGraphProperties load(String path) throws IOException {
         return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path));
     }
 
     /**
      * Cleans up resources after use.
      */
     public void close() throws IOException {
         nodeIdMap.close();
         edgeLabelNames.close();
     }
 
     /** Return the basename of the compressed graph */
     public String getPath() {
         return path;
     }
 
     /**
      * Converts {@link SWHID} node to long.
      *
      * @param swhid node specified as a {@link SWHID}
      * @return internal long node id
      * @see SWHID
      */
     public long getNodeId(SWHID swhid) {
         return nodeIdMap.getNodeId(swhid);
     }
 
     /**
      * Converts long id node to {@link SWHID}.
      *
      * @param nodeId node specified as a long id
      * @return external SWHID
      * @see SWHID
      */
     public SWHID getSWHID(long nodeId) {
         return nodeIdMap.getSWHID(nodeId);
     }
 
     /**
      * Returns node type.
      *
      * @param nodeId node specified as a long id
      * @return corresponding node type
      * @see Node.Type
      */
     public Node.Type getNodeType(long nodeId) {
         return nodeTypesMap.getType(nodeId);
     }
 
     private static LongBigList loadMappedLongs(String path) throws IOException {
         try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
             return LongMappedBigList.map(raf.getChannel());
         }
     }
 
     private static IntBigList loadMappedInts(String path) throws IOException {
         try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
             return IntMappedBigList.map(raf.getChannel());
         }
     }
 
     private static ShortBigList loadMappedShorts(String path) throws IOException {
         try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
             return ShortMappedBigList.map(raf.getChannel());
         }
     }
 
     private static ByteBigList loadMappedBytes(String path) throws IOException {
         try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
             return ByteMappedBigList.map(raf.getChannel());
         }
     }
 
     private static LongBigList loadEFLongs(String path) throws IOException {
         try {
             return (EliasFanoLongBigList) BinIO.loadObject(path);
         } catch (ClassNotFoundException e) {
             throw new IOException(e);
         }
     }
 
     private static byte[] getLine(ByteBigList byteArray, long start) {
         long end = start;
         while (end < byteArray.size64() && byteArray.getByte(end) != '\n') {
             end++;
         }
         int length = (int) (end - start);
         byte[] buffer = new byte[length];
         byteArray.getElements(start, buffer, 0, length);
         return buffer;
     }
 
     /** Load the sizes of the content nodes */
     public void loadContentLength() throws IOException {
         contentLength = loadMappedLongs(path + ".property.content.length.bin");
     }
 
     /** Get the size (in bytes) of the given content node */
-    public long getContentLength(long nodeId) {
-        return contentLength.getLong(nodeId);
+    public Long getContentLength(long nodeId) {
+        if (contentLength == null) {
+            throw new IllegalStateException("Content lengths not loaded");
+        }
+        long res = contentLength.getLong(nodeId);
+        return (res >= 0) ? res : null;
     }
 
     /** Load the IDs of the persons (authors and committers) */
     public void loadPersonIds() throws IOException {
         authorId = loadMappedInts(path + ".property.author_id.bin");
         committerId = loadMappedInts(path + ".property.committer_id.bin");
     }
 
     /** Get a unique integer ID representing the author of the given revision or release node */
-    public long getAuthorId(long nodeId) {
+    public Long getAuthorId(long nodeId) {
         if (authorId == null) {
             throw new IllegalStateException("Author IDs not loaded");
         }
-        return authorId.getInt(nodeId);
+        long res = authorId.getInt(nodeId);
+        return (res >= 0) ? res : null;
     }
 
     /** Get a unique integer ID representing the committer of the given revision node */
-    public long getCommitterId(long nodeId) {
+    public Long getCommitterId(long nodeId) {
         if (committerId == null) {
             throw new IllegalStateException("Committer IDs not loaded");
         }
-        return committerId.getInt(nodeId);
+        long res = committerId.getInt(nodeId);
+        return (res >= 0) ? res : null;
     }
 
     /**
      * Loads a boolean array indicating whether the given content node was skipped during archive
      * ingestion
      */
     public void loadContentIsSkipped() throws IOException {
         try {
             contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin");
         } catch (ClassNotFoundException e) {
             throw new IOException(e);
         }
     }
 
     /** Returns whether the given content node was skipped during archive ingestion */
     public boolean isContentSkipped(long nodeId) {
         if (contentIsSkipped == null) {
             throw new IllegalStateException("Skipped content array not loaded");
         }
         return contentIsSkipped.getBoolean(nodeId);
     }
 
     /** Load the timestamps at which the releases and revisions were authored */
     public void loadAuthorTimestamps() throws IOException {
         authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin");
         authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin");
     }
 
     /** Return the timestamp at which the given revision or release was authored */
-    public long getAuthorTimestamp(long nodeId) {
+    public Long getAuthorTimestamp(long nodeId) {
         if (authorTimestamp == null) {
             throw new IllegalStateException("Author timestamps not loaded");
         }
-        return authorTimestamp.getLong(nodeId);
+        long res = authorTimestamp.getLong(nodeId);
+        return (res > Long.MIN_VALUE) ? res : null;
     }
 
     /** Return the timestamp offset at which the given revision or release was authored */
-    public short getAuthorTimestampOffset(long nodeId) {
+    public Short getAuthorTimestampOffset(long nodeId) {
         if (authorTimestampOffset == null) {
             throw new IllegalStateException("Author timestamp offsets not loaded");
         }
-        return authorTimestampOffset.getShort(nodeId);
+        short res = authorTimestampOffset.getShort(nodeId);
+        return (res > Short.MIN_VALUE) ? res : null;
     }
 
     /** Load the timestamps at which the releases and revisions were committed */
     public void loadCommitterTimestamps() throws IOException {
         committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin");
         committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin");
     }
 
     /** Return the timestamp at which the given revision was committed */
-    public long getCommitterTimestamp(long nodeId) {
+    public Long getCommitterTimestamp(long nodeId) {
         if (committerTimestamp == null) {
             throw new IllegalStateException("Committer timestamps not loaded");
         }
-        return committerTimestamp.getLong(nodeId);
+        long res = committerTimestamp.getLong(nodeId);
+        return (res > Long.MIN_VALUE) ? res : null;
     }
 
     /** Return the timestamp offset at which the given revision was committed */
-    public short getCommitterTimestampOffset(long nodeId) {
+    public Short getCommitterTimestampOffset(long nodeId) {
         if (committerTimestampOffset == null) {
             throw new IllegalStateException("Committer timestamp offsets not loaded");
         }
-        return committerTimestampOffset.getShort(nodeId);
+        short res = committerTimestampOffset.getShort(nodeId);
+        return (res > Short.MIN_VALUE) ? res : null;
     }
 
     /** Load the revision messages, the release messages and the origin URLs */
     public void loadMessages() throws IOException {
         messageBuffer = loadMappedBytes(path + ".property.message.bin");
         messageOffsets = loadMappedLongs(path + ".property.message.offset.bin");
     }
 
     /** Get the message of the given revision or release node */
     public byte[] getMessage(long nodeId) throws IOException {
         if (messageBuffer == null || messageOffsets == null) {
             throw new IllegalStateException("Messages not loaded");
         }
         long startOffset = messageOffsets.getLong(nodeId);
         if (startOffset == -1) {
             return null;
         }
         return Base64.getDecoder().decode(getLine(messageBuffer, startOffset));
     }
 
     /** Get the URL of the given origin node */
     public String getUrl(long nodeId) throws IOException {
-        return new String(getMessage(nodeId));
+        byte[] url = getMessage(nodeId);
+        return (url != null) ? new String(url) : null;
     }
 
     /** Load the release names */
     public void loadTagNames() throws IOException {
         tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin");
         tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin");
     }
 
     /** Get the name of the given release node */
     public byte[] getTagName(long nodeId) throws IOException {
         if (tagNameBuffer == null || tagNameOffsets == null) {
             throw new IllegalStateException("Tag names not loaded");
         }
         long startOffset = tagNameOffsets.getLong(nodeId);
         if (startOffset == -1) {
             return null;
         }
         return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset));
     }
 
     /** Load the arc label names (directory entry names and snapshot branch names) */
     public void loadLabelNames() throws IOException {
         try {
             edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl");
         } catch (ConfigurationException e) {
             throw new IOException(e);
         }
     }
 
     /**
      * Get the arc label name (either a directory entry name or snapshot branch name) associated with
      * the given label ID
      */
     public byte[] getLabelName(long labelId) {
         if (edgeLabelNames == null) {
             throw new IllegalStateException("Label names not loaded");
         }
         return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId));
     }
 }
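Reviewer note: the getters above establish one decoding convention: missing values are stored on disk as sentinels (Long.MIN_VALUE/Short.MIN_VALUE for timestamps and offsets, negative values for ids and lengths) and surface to callers as null, while accessing a property that was never loaded fails fast with IllegalStateException. A minimal sketch of the consumer side — the class name, arguments, and node id are placeholders, not part of the patch:

```java
import org.softwareheritage.graph.SwhGraphProperties;

public class AuthorDateExample {
    public static void main(String[] args) throws java.io.IOException {
        // args[0]: basename of a compressed graph; args[1]: a revision or
        // release node id. Both are placeholders for this sketch.
        SwhGraphProperties props = SwhGraphProperties.load(args[0]);
        props.loadAuthorTimestamps(); // skipping this now throws IllegalStateException on access

        long nodeId = Long.parseLong(args[1]);
        Long ts = props.getAuthorTimestamp(nodeId);         // null: MIN_VALUE sentinel on disk
        Short off = props.getAuthorTimestampOffset(nodeId); // null: MIN_VALUE sentinel on disk
        if (ts != null && off != null) {
            System.out.printf("authored at %d (offset %d)%n", ts, off);
        } else {
            System.out.println("no author date recorded");
        }
    }
}
```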
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
index dd20648..fed8608 100644
--- a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
@@ -1,268 +1,268 @@
 package org.softwareheritage.graph.compress;
 
 import com.martiansoftware.jsap.*;
 import it.unimi.dsi.bits.LongArrayBitVector;
 import it.unimi.dsi.fastutil.BigArrays;
 import it.unimi.dsi.fastutil.ints.IntBigArrays;
 import it.unimi.dsi.fastutil.io.BinIO;
 import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
 import it.unimi.dsi.fastutil.longs.LongBigArrays;
 import it.unimi.dsi.fastutil.objects.Object2LongFunction;
 import it.unimi.dsi.fastutil.shorts.ShortBigArrays;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.softwareheritage.graph.maps.NodeIdMap;
 import org.softwareheritage.graph.compress.ORCGraphDataset.*;
 
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.lang.reflect.InvocationTargetException;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicLong;
 
 /**
  * This class is used to extract the node properties from the graph dataset, and write them to a set
  * of property files.
  *
  * Note: because the nodes are not sorted by type, we have an incentive to minimize the number of
  * "holes" in offset arrays. This is why many unrelated properties are cobbled together in the same
  * files (e.g. commit messages, tag messages and origin URLs are all in a "message" property file).
  * Once we migrate to a TypedImmutableGraph as the underlying storage of the graph, we can split all
  * the different properties in their own files.
  */
 public class WriteNodeProperties {
     final static Logger logger = LoggerFactory.getLogger(WriteNodeProperties.class);
 
     private final ORCGraphDataset dataset;
     private final String graphBasename;
     private final NodeIdMap nodeIdMap;
     private final long numNodes;
 
     public WriteNodeProperties(String dataset, String graphBasename, NodeIdMap nodeIdMap) {
         this.dataset = new ORCGraphDataset(dataset);
         this.graphBasename = graphBasename;
         this.nodeIdMap = nodeIdMap;
         this.numNodes = nodeIdMap.size64();
     }
 
     public static String[] PROPERTY_WRITERS = new String[]{"timestamps", "content_length", "content_is_skipped",
             "person_ids", "messages", "tag_names",};
 
     private static JSAPResult parseArgs(String[] args) {
         JSAPResult config = null;
         try {
             SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
                     new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"),
                     new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
                             "Basename of the output graph"),
                     new FlaggedOption("properties", JSAP.STRING_PARSER, "*", JSAP.NOT_REQUIRED, 'p', "properties",
                             "Properties to write, comma separated (default: all). Possible choices: "
                                     + String.join(",", PROPERTY_WRITERS)),});
             config = jsap.parse(args);
             if (jsap.messagePrinted()) {
                 System.exit(1);
             }
         } catch (JSAPException e) {
             System.err.println("Usage error: " + e.getMessage());
             System.exit(1);
         }
         return config;
     }
 
     public static void main(String[] argv) throws IOException, ClassNotFoundException, NoSuchMethodException,
             InvocationTargetException, IllegalAccessException {
         JSAPResult args = parseArgs(argv);
         String dataset = args.getString("dataset");
         String graphBasename = args.getString("graphBasename");
         NodeIdMap nodeIdMap = new NodeIdMap(graphBasename);
 
         Set<String> properties;
         if (args.getString("properties").equals("*")) {
             properties = Set.of(PROPERTY_WRITERS);
         } else {
             properties = new HashSet<>(Arrays.asList(args.getString("properties").split(",")));
         }
 
         WriteNodeProperties writer = new WriteNodeProperties(dataset, graphBasename, nodeIdMap);
         if (properties.contains("timestamps")) {
             writer.writeTimestamps();
         }
         if (properties.contains("content_length")) {
             writer.writeContentLength();
         }
         if (properties.contains("content_is_skipped")) {
             writer.writeContentIsSkipped();
         }
         if (properties.contains("person_ids")) {
             writer.writePersonIds();
         }
         if (properties.contains("messages")) {
             writer.writeMessages();
         }
         if (properties.contains("tag_names")) {
             writer.writeTagNames();
         }
     }
 
     public void writeContentLength() throws IOException {
         logger.info("Writing content lengths");
         long[][] valueArray = LongBigArrays.newBigArray(numNodes);
-        BigArrays.fill(valueArray, Long.MIN_VALUE);
+        BigArrays.fill(valueArray, -1);
         for (String tableName : new String[]{"content", "skipped_content"}) {
             SwhOrcTable table = dataset.getTable(tableName);
             table.readLongColumn("length", (swhid, value) -> {
                 long id = nodeIdMap.getNodeId(swhid);
                 BigArrays.set(valueArray, id, value);
             });
         }
         BinIO.storeLongs(valueArray, graphBasename + ".property.content.length.bin");
     }
 
     public void writeContentIsSkipped() throws IOException {
         LongArrayBitVector isSkippedBitVector = LongArrayBitVector.ofLength(numNodes);
         SwhOrcTable table = dataset.getTable("skipped_content");
         table.readIdColumn((swhid) -> {
             long id = nodeIdMap.getNodeId(swhid);
             isSkippedBitVector.set(id);
         });
         BinIO.storeObject(isSkippedBitVector, graphBasename + ".property.content.is_skipped.bin");
     }
 
     public void writeTimestamps() throws IOException {
         logger.info("Writing author/committer timestamps for release + revision");
         SwhOrcTable releaseTable = dataset.getTable("release");
         SwhOrcTable revisionTable = dataset.getTable("revision");
 
         long[][] timestampArray = LongBigArrays.newBigArray(numNodes);
         short[][] timestampOffsetArray = ShortBigArrays.newBigArray(numNodes);
 
         // Author timestamps
         BigArrays.fill(timestampArray, Long.MIN_VALUE);
         BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
         releaseTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
             long id = nodeIdMap.getNodeId(swhid);
             BigArrays.set(timestampArray, id, date);
             BigArrays.set(timestampOffsetArray, id, dateOffset);
         });
         revisionTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
             long id = nodeIdMap.getNodeId(swhid);
             BigArrays.set(timestampArray, id, date);
             BigArrays.set(timestampOffsetArray, id, dateOffset);
         });
         BinIO.storeLongs(timestampArray, graphBasename + ".property.author_timestamp.bin");
         BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.author_timestamp_offset.bin");
 
         // Committer timestamps
         BigArrays.fill(timestampArray, Long.MIN_VALUE);
         BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
         revisionTable.readTimestampColumn("committer_date", "committer_offset", (swhid, date, dateOffset) -> {
             long id = nodeIdMap.getNodeId(swhid);
             BigArrays.set(timestampArray, id, date);
             BigArrays.set(timestampOffsetArray, id, dateOffset);
         });
         BinIO.storeLongs(timestampArray, graphBasename + ".property.committer_timestamp.bin");
         BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.committer_timestamp_offset.bin");
     }
 
     public void writePersonIds() throws IOException {
         logger.info("Writing author/committer IDs for release + revision");
         Object2LongFunction<byte[]> personIdMap = NodeIdMap.loadMph(graphBasename + ".persons.mph");
         SwhOrcTable releaseTable = dataset.getTable("release");
         SwhOrcTable revisionTable = dataset.getTable("revision");
 
         int[][] personArray = IntBigArrays.newBigArray(numNodes);
 
         // Author IDs
         BigArrays.fill(personArray, -1);
         releaseTable.readBytes64Column("author", (swhid, personBase64) -> {
             long id = nodeIdMap.getNodeId(swhid);
             BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
         });
         revisionTable.readBytes64Column("author", (swhid, personBase64) -> {
             long id = nodeIdMap.getNodeId(swhid);
             BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
         });
         BinIO.storeInts(personArray, graphBasename + ".property.author_id.bin");
 
         // Committer IDs
         BigArrays.fill(personArray, -1);
         revisionTable.readBytes64Column("committer", (swhid, personBase64) -> {
             long id = nodeIdMap.getNodeId(swhid);
             BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
         });
         BinIO.storeInts(personArray, graphBasename + ".property.committer_id.bin");
     }
 
     public void writeMessages() throws IOException {
         logger.info("Writing messages for release + revision, and URLs for origins");
 
         long[][] messageOffsetArray = LongBigArrays.newBigArray(numNodes);
         BigArrays.fill(messageOffsetArray, -1);
 
         FastBufferedOutputStream messageStream = new FastBufferedOutputStream(
                 new FileOutputStream(graphBasename + ".property.message.bin"));
         AtomicLong offset = new AtomicLong(0L);
 
         SwhOrcTable releaseTable = dataset.getTable("release");
         releaseTable.readBytes64Column("message", (swhid, messageBase64) -> {
             long id = nodeIdMap.getNodeId(swhid);
             messageStream.write(messageBase64);
             messageStream.write('\n');
             BigArrays.set(messageOffsetArray, id, offset.longValue());
             offset.addAndGet(messageBase64.length + 1);
         });
 
         SwhOrcTable revisionTable = dataset.getTable("revision");
         revisionTable.readBytes64Column("message", (swhid, messageBase64) -> {
             long id = nodeIdMap.getNodeId(swhid);
             messageStream.write(messageBase64);
             messageStream.write('\n');
             BigArrays.set(messageOffsetArray, id, offset.longValue());
             offset.addAndGet(messageBase64.length + 1);
         });
 
         OriginOrcTable originTable = (OriginOrcTable) dataset.getTable("origin");
         originTable.readURLs((swhid, messageBase64) -> {
             long id = nodeIdMap.getNodeId(swhid);
             messageStream.write(messageBase64);
             messageStream.write('\n');
             BigArrays.set(messageOffsetArray, id, offset.longValue());
             offset.addAndGet(messageBase64.length + 1);
         });
 
         // TODO: check which one is optimal in terms of memory/disk usage, EF vs mapped file
         BinIO.storeLongs(messageOffsetArray, graphBasename + ".property.message.offset.bin");
         // EliasFanoLongBigList messageOffsetEF = new
         // EliasFanoLongBigList(LongBigArrayBigList.wrap(messageOffsetArray));
         // BinIO.storeObject(messageOffsetEF, graphBasename + ".property.message.offset.bin");
         messageStream.close();
     }
 
     public void writeTagNames() throws IOException {
         logger.info("Writing tag names for release");
 
         long[][] tagNameOffsetArray = LongBigArrays.newBigArray(numNodes);
         BigArrays.fill(tagNameOffsetArray, -1);
 
         FastBufferedOutputStream tagNameStream = new FastBufferedOutputStream(
                 new FileOutputStream(graphBasename + ".property.tag_name.bin"));
         AtomicLong offset = new AtomicLong(0L);
 
         SwhOrcTable releaseTable = dataset.getTable("release");
         releaseTable.readBytes64Column("name", (swhid, tagNameBase64) -> {
             long id = nodeIdMap.getNodeId(swhid);
             tagNameStream.write(tagNameBase64);
             tagNameStream.write('\n');
             BigArrays.set(tagNameOffsetArray, id, offset.longValue());
             offset.addAndGet(tagNameBase64.length + 1);
         });
         BinIO.storeLongs(tagNameOffsetArray, graphBasename + ".property.tag_name.offset.bin");
         // EliasFanoLongBigList tagNameOffsetEF = new
         // EliasFanoLongBigList(LongBigArrayBigList.wrap(tagNameOffsetArray));
         // BinIO.storeObject(tagNameOffsetEF, graphBasename + ".property.tag_name.offset.bin");
         tagNameStream.close();
     }
 }
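Reviewer note: writeContentLength() now pre-fills the array with -1 instead of Long.MIN_VALUE, matching the reader's `res >= 0` test in SwhGraphProperties.getContentLength() above. A self-contained sketch of that sentinel round-trip using the same fastutil big arrays — the node count and length value are illustrative:

```java
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.longs.LongBigArrays;

public class SentinelRoundTrip {
    public static void main(String[] args) {
        long numNodes = 4;
        long[][] lengths = LongBigArrays.newBigArray(numNodes);
        BigArrays.fill(lengths, -1);      // writer side: pre-fill with the "missing" sentinel
        BigArrays.set(lengths, 2, 1337);  // only node 2 has a known length

        for (long node = 0; node < numNodes; node++) {
            long raw = BigArrays.get(lengths, node);
            Long decoded = (raw >= 0) ? raw : null; // reader side: getContentLength()'s decoding
            System.out.println("node " + node + ": " + decoded);
        }
    }
}
```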
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
index cfb47a4..7b02d76 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java
@@ -1,79 +1,91 @@
 package org.softwareheritage.graph.utils;
 
-import it.unimi.dsi.big.webgraph.NodeIterator;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
+import it.unimi.dsi.logging.ProgressLogger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.softwareheritage.graph.SwhUnidirectionalGraph;
 import org.softwareheritage.graph.labels.DirEntry;
 
 import java.io.IOException;
+import java.util.concurrent.TimeUnit;
 
 public class DumpProperties {
+    final static Logger logger = LoggerFactory.getLogger(DumpProperties.class);
+
     public static void main(String[] args) throws IOException {
         String graphPath = args[0];
 
+        ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
         SwhUnidirectionalGraph graph;
         if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) {
-            graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath);
+            graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl);
         } else {
-            graph = SwhUnidirectionalGraph.loadLabelled(graphPath);
+            graph = SwhUnidirectionalGraph.loadLabelled(graphPath, pl);
         }
 
         graph.loadContentLength();
         graph.loadContentIsSkipped();
         graph.loadPersonIds();
         graph.loadAuthorTimestamps();
         graph.loadCommitterTimestamps();
         graph.loadMessages();
         graph.loadTagNames();
         graph.loadLabelNames();
 
-        NodeIterator it = graph.nodeIterator();
+        ArcLabelledNodeIterator it = graph.labelledNodeIterator();
         while (it.hasNext()) {
             long node = it.nextLong();
             System.out.format("%s: %s\n", node, graph.getSWHID(node));
 
-            var s = graph.labelledSuccessors(node);
-            long succ;
+            var s = it.successors();
             System.out.println(" successors:");
-            while ((succ = s.nextLong()) >= 0) {
+            for (long succ; (succ = s.nextLong()) >= 0;) {
                 DirEntry[] labels = (DirEntry[]) s.label().get();
                 if (labels.length > 0) {
                     for (DirEntry label : labels) {
                         System.out.format(" %s %s [perms: %s]\n", graph.getSWHID(succ),
                                 new String(graph.getLabelName(label.filenameId)), label.permission);
                     }
                 } else {
                     System.out.format(" %s\n", graph.getSWHID(succ));
                 }
             }
 
             switch (graph.getNodeType(node)) {
                 case CNT:
                     System.out.format(" length: %s\n", graph.getContentLength(node));
                     System.out.format(" is_skipped: %s\n", graph.isContentSkipped(node));
                     break;
                 case REV:
                     System.out.format(" author: %s\n", graph.getAuthorId(node));
                     System.out.format(" committer: %s\n", graph.getCommitterId(node));
                     System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
                             graph.getAuthorTimestampOffset(node));
                     System.out.format(" committer_date: %s (offset: %s)\n", graph.getCommitterTimestamp(node),
                             graph.getCommitterTimestampOffset(node));
                     byte[] msg = graph.getMessage(node);
                     if (msg != null) {
                         System.out.format(" message: %s\n", (new String(msg)).replace("\n", "\\n"));
                     }
                     break;
                 case REL:
                     System.out.format(" author: %s\n", graph.getAuthorId(node));
                     System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node),
                             graph.getAuthorTimestampOffset(node));
-                    System.out.format(" message: %s\n", (new String(graph.getMessage(node))).replace("\n", "\\n"));
-                    System.out.format(" tag name: %s\n", new String(graph.getTagName(node)));
+                    byte[] tagMsg = graph.getMessage(node);
+                    if (tagMsg != null) {
+                        System.out.format(" message: %s\n", (new String(tagMsg)).replace("\n", "\\n"));
+                    }
+                    byte[] tagName = graph.getTagName(node);
+                    if (tagName != null) {
+                        System.out.format(" tag name: %s\n", new String(tagName));
+                    }
                     break;
                 case ORI:
                     System.out.format(" url: %s\n", graph.getUrl(node));
             }
 
             System.out.println();
         }
     }
 }
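Reviewer note: the dump loop now takes successors from the ArcLabelledNodeIterator itself (it.successors()) instead of re-querying the graph per node with labelledSuccessors(node). A stripped-down sketch of that traversal pattern — the class name and graph basename are placeholders, and it assumes the single-argument loadLabelledMapped(String) overload used by the pre-patch code is still available:

```java
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
import org.softwareheritage.graph.SwhUnidirectionalGraph;

public class CountLabelledArcs {
    public static void main(String[] args) throws java.io.IOException {
        // args[0]: graph basename (placeholder). Pass a ProgressLogger as in
        // the patched DumpProperties if progress reporting is wanted.
        SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.loadLabelledMapped(args[0]);
        ArcLabelledNodeIterator it = graph.labelledNodeIterator();
        long arcs = 0;
        while (it.hasNext()) {
            it.nextLong();              // advance to the next node
            var s = it.successors();    // labelled successors of that node
            while (s.nextLong() >= 0) { // a negative id marks the end of the list
                arcs++;
            }
        }
        System.out.println("labelled arcs: " + arcs);
    }
}
```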