diff --git a/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java b/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java new file mode 100644 index 0000000..460f2eb --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java @@ -0,0 +1,77 @@ +package org.softwareheritage.graph; + +import it.unimi.dsi.big.webgraph.ImmutableGraph; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import it.unimi.dsi.big.webgraph.Transform; + +public class BidirectionalImmutableGraph extends ImmutableGraph { + private final ImmutableGraph forwardGraph; + private final ImmutableGraph backwardGraph; + + protected BidirectionalImmutableGraph(ImmutableGraph forwardGraph, ImmutableGraph backwardGraph) { + this.forwardGraph = forwardGraph; + this.backwardGraph = backwardGraph; + } + + @Override + public long numNodes() { + assert forwardGraph.numNodes() == backwardGraph.numNodes(); + return this.forwardGraph.numNodes(); + } + + @Override + public long numArcs() { + assert forwardGraph.numArcs() == backwardGraph.numArcs(); + return this.forwardGraph.numArcs(); + } + + @Override + public boolean randomAccess() { + return this.forwardGraph.randomAccess() && this.backwardGraph.randomAccess(); + } + + @Override + public boolean hasCopiableIterators() { + return forwardGraph.hasCopiableIterators() && backwardGraph.hasCopiableIterators(); + } + + @Override + public BidirectionalImmutableGraph copy() { + return new BidirectionalImmutableGraph(this.forwardGraph.copy(), this.backwardGraph.copy()); + } + + public BidirectionalImmutableGraph transpose() { + return new BidirectionalImmutableGraph(backwardGraph, forwardGraph); + } + + public BidirectionalImmutableGraph symmetrize() { + ImmutableGraph symmetric = Transform.union(forwardGraph, backwardGraph); + return new BidirectionalImmutableGraph(symmetric, symmetric); + } + + @Override + public long outdegree(long l) { + return forwardGraph.outdegree(l); + } + + public long indegree(long l) { + return backwardGraph.outdegree(l); + } + + @Override + public LazyLongIterator successors(long nodeId) { + return forwardGraph.successors(nodeId); + } + + public LazyLongIterator predecessors(long nodeId) { + return backwardGraph.successors(nodeId); + } + + public ImmutableGraph getForwardGraph() { + return forwardGraph; + } + + public ImmutableGraph getBackwardGraph() { + return backwardGraph; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/Graph.java b/java/src/main/java/org/softwareheritage/graph/Graph.java index c5f5513..8d9acf1 100644 --- a/java/src/main/java/org/softwareheritage/graph/Graph.java +++ b/java/src/main/java/org/softwareheritage/graph/Graph.java @@ -1,310 +1,304 @@ package org.softwareheritage.graph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.big.webgraph.Transform; import it.unimi.dsi.logging.ProgressLogger; import org.softwareheritage.graph.maps.NodeIdMap; import org.softwareheritage.graph.maps.NodeTypesMap; import java.io.IOException; /** * Main class storing the compressed graph and node id mappings. *

* The compressed graph is stored using the WebGraph * ecosystem. Additional mappings are necessary because Software Heritage uses string based persistent * identifiers (SWHID) while WebGraph uses integers internally. These two mappings (long id * ↔ SWHID) are used for the input (users refer to the graph using SWHID) and the output * (convert back to SWHID for users results). However, since graph traversal can be restricted * depending on the node type (see {@link AllowedEdges}), a long id → node type map is stored * as well to avoid a full SWHID lookup. * * @author The Software Heritage developers * @see org.softwareheritage.graph.AllowedEdges * @see org.softwareheritage.graph.maps.NodeIdMap * @see org.softwareheritage.graph.maps.NodeTypesMap */ public class Graph extends ImmutableGraph { - /** File extension for the SWHID to long node id map */ - public static final String SWHID_TO_NODE = ".swhid2node.bin"; - /** File extension for the long node id to SWHID map */ - public static final String NODE_TO_SWHID = ".node2swhid.bin"; - /** File extension for the long node id to node type map */ - public static final String NODE_TO_TYPE = ".node2type.map"; - - /** Compressed graph stored as a {@link it.unimi.dsi.big.webgraph.BVGraph} */ - ImmutableGraph graph; - /** Transposed compressed graph (used for backward traversals) */ - ImmutableGraph graphTransposed; + /** + * Bidirectional graph containing two compressed {@link it.unimi.dsi.big.webgraph.BVGraph} one for + * each direction + */ + BidirectionalImmutableGraph graph; + /** Path and basename of the compressed graph */ String path; /** Mapping long id ↔ SWHIDs */ NodeIdMap nodeIdMap; /** Mapping long id → node types */ NodeTypesMap nodeTypesMap; /** * Constructor. * * @param path path and basename of the compressed graph to load */ private Graph(String path) throws IOException { loadInternal(path, null, LoadMethod.MAPPED); } /** * Loading mechanisms */ enum LoadMethod { MEMORY, MAPPED, OFFLINE, } protected Graph loadInternal(String path, ProgressLogger pl, LoadMethod method) throws IOException { this.path = path; + ImmutableGraph direct = null; + ImmutableGraph transposed = null; if (method == LoadMethod.MEMORY) { - this.graph = ImmutableGraph.load(path, pl); - this.graphTransposed = ImmutableGraph.load(path + "-transposed", pl); + direct = ImmutableGraph.load(path, pl); + transposed = ImmutableGraph.load(path + "-transposed", pl); } else if (method == LoadMethod.MAPPED) { - this.graph = ImmutableGraph.loadMapped(path, pl); - this.graphTransposed = ImmutableGraph.loadMapped(path + "-transposed", pl); + direct = ImmutableGraph.load(path, pl); + transposed = ImmutableGraph.loadMapped(path + "-transposed", pl); } else if (method == LoadMethod.OFFLINE) { - this.graph = ImmutableGraph.loadOffline(path, pl); - this.graphTransposed = ImmutableGraph.loadOffline(path + "-transposed", pl); + direct = ImmutableGraph.loadOffline(path, pl); + transposed = ImmutableGraph.loadOffline(path + "-transposed", pl); } + this.graph = new BidirectionalImmutableGraph(direct, transposed); this.nodeTypesMap = new NodeTypesMap(path); this.nodeIdMap = new NodeIdMap(path, numNodes()); return this; } protected Graph() { } public static Graph load(String path, ProgressLogger pl) throws IOException { return new Graph().loadInternal(path, pl, LoadMethod.MEMORY); } public static Graph loadMapped(String path, ProgressLogger pl) throws IOException { return new Graph().loadInternal(path, pl, LoadMethod.MAPPED); } public static Graph loadOffline(String path, ProgressLogger pl) throws IOException { return new Graph().loadInternal(path, null, LoadMethod.OFFLINE); } public static Graph load(String path) throws IOException { return new Graph().loadInternal(path, null, LoadMethod.MEMORY); } public static Graph loadMapped(String path) throws IOException { return new Graph().loadInternal(path, null, LoadMethod.MAPPED); } public static Graph loadOffline(String path) throws IOException { return new Graph().loadInternal(path, null, LoadMethod.OFFLINE); } /** * Constructor used for copy() */ - protected Graph(ImmutableGraph graph, ImmutableGraph graphTransposed, String path, NodeIdMap nodeIdMap, - NodeTypesMap nodeTypesMap) { + protected Graph(BidirectionalImmutableGraph graph, String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) { this.graph = graph; - this.graphTransposed = graphTransposed; this.path = path; this.nodeIdMap = nodeIdMap; this.nodeTypesMap = nodeTypesMap; } /** * Return a flyweight copy of the graph. */ @Override public Graph copy() { - return new Graph(this.graph.copy(), this.graphTransposed.copy(), this.path, this.nodeIdMap, this.nodeTypesMap); + return new Graph(this.graph.copy(), this.path, this.nodeIdMap, this.nodeTypesMap); } @Override public boolean randomAccess() { - return graph.randomAccess() && graphTransposed.randomAccess(); + return graph.randomAccess(); } /** * Return a transposed version of the graph. */ public Graph transpose() { - return new Graph(this.graphTransposed, this.graph, this.path, this.nodeIdMap, this.nodeTypesMap); + return new Graph(this.graph.transpose(), this.path, this.nodeIdMap, this.nodeTypesMap); } /** * Return a symmetric version of the graph. */ public Graph symmetrize() { - ImmutableGraph symmetric = Transform.union(graph, graphTransposed); - return new Graph(symmetric, symmetric, this.path, this.nodeIdMap, this.nodeTypesMap); + return new Graph(this.graph.symmetrize(), this.path, this.nodeIdMap, this.nodeTypesMap); } /** * Cleans up graph resources after use. */ public void cleanUp() throws IOException { nodeIdMap.close(); } /** * Returns number of nodes in the graph. * * @return number of nodes in the graph */ @Override public long numNodes() { return graph.numNodes(); } /** * Returns number of edges in the graph. * * @return number of edges in the graph */ @Override public long numArcs() { return graph.numArcs(); } /** * Returns lazy iterator of successors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of successors of the node, specified as a * WebGraph LazyLongIterator */ @Override public LazyLongIterator successors(long nodeId) { return graph.successors(nodeId); } /** * Returns lazy iterator of successors of a node while following a specific set of edge types. * * @param nodeId node specified as a long id * @param allowedEdges the specification of which edges can be traversed * @return lazy iterator of successors of the node, specified as a * WebGraph LazyLongIterator */ public LazyLongIterator successors(long nodeId, AllowedEdges allowedEdges) { if (allowedEdges.restrictedTo == null) { // All edges are allowed, bypass edge check return this.successors(nodeId); } else { LazyLongIterator allSuccessors = this.successors(nodeId); Graph thisGraph = this; return new LazyLongIterator() { @Override public long nextLong() { long neighbor; while ((neighbor = allSuccessors.nextLong()) != -1) { if (allowedEdges.isAllowed(thisGraph.getNodeType(nodeId), thisGraph.getNodeType(neighbor))) { return neighbor; } } return -1; } @Override public long skip(final long n) { long i; for (i = 0; i < n && nextLong() != -1; i++) ; return i; } }; } } /** * Returns the outdegree of a node. * * @param nodeId node specified as a long id * @return outdegree of a node */ @Override public long outdegree(long nodeId) { return graph.outdegree(nodeId); } /** * Returns lazy iterator of predecessors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of predecessors of the node, specified as a * WebGraph LazyLongIterator */ public LazyLongIterator predecessors(long nodeId) { - return this.transpose().successors(nodeId); + return graph.predecessors(nodeId); } /** * Returns the indegree of a node. * * @param nodeId node specified as a long id * @return indegree of a node */ public long indegree(long nodeId) { - return this.transpose().outdegree(nodeId); + return graph.indegree(nodeId); } /** - * Returns the underlying BVGraph. + * Returns the underlying BidirectionalImmutableGraph. * - * @return WebGraph BVGraph + * @return WebGraph ImmutableGraph */ public ImmutableGraph getGraph() { return this.graph; } /** * Returns the graph full path. * * @return graph full path */ public String getPath() { return path; } /** * Converts {@link SWHID} node to long. * * @param swhid node specified as a {@link SWHID} * @return internal long node id * @see SWHID */ public long getNodeId(SWHID swhid) { return nodeIdMap.getNodeId(swhid); } /** * Converts long id node to {@link SWHID}. * * @param nodeId node specified as a long id * @return external SWHID * @see SWHID */ public SWHID getSWHID(long nodeId) { return nodeIdMap.getSWHID(nodeId); } /** * Returns node type. * * @param nodeId node specified as a long id * @return corresponding node type * @see org.softwareheritage.graph.Node.Type */ public Node.Type getNodeType(long nodeId) { return nodeTypesMap.getType(nodeId); } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java index 46a566b..5d7641a 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java @@ -1,188 +1,192 @@ package org.softwareheritage.graph.maps; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.util.ByteBufferLongBigList; -import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.SWHID; import java.io.FileInputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; /** * Mapping between internal long node id and external SWHID. *

* The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using * an mmap()-ed .order file containing the graph permutation. * * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the * {@link NodeMapBuilder} class, then it is loaded here using mmap(). * * @author The Software Heritage developers * @see NodeMapBuilder */ public class NodeIdMap { /** Fixed length of binary SWHID buffer */ public static final int SWHID_BIN_SIZE = 22; + /** File extension for the SWHID to long node id map */ + public static final String SWHID_TO_NODE = ".swhid2node.bin"; + /** File extension for the long node id to SWHID map */ + public static final String NODE_TO_SWHID = ".node2swhid.bin"; + /** Graph path and basename */ String graphPath; /** Number of ids to map */ long nbIds; /** mmap()-ed NODE_TO_SWHID file */ MapFile nodeToSwhMap; /** Minimal perfect hash (MPH) function SWHID -> initial order */ Object2LongFunction mph; /** mmap()-ed long list with the permutation initial order -> graph order */ LongBigList orderMap; /** FileInputStream containing the permutation */ FileInputStream orderInputStream; /** * Constructor. * * @param graphPath full graph path * @param nbNodes number of nodes in the graph */ public NodeIdMap(String graphPath, long nbNodes) throws IOException { this.graphPath = graphPath; this.nbIds = nbNodes; // node -> SWHID - this.nodeToSwhMap = new MapFile(graphPath + Graph.NODE_TO_SWHID, SWHID_BIN_SIZE); + this.nodeToSwhMap = new MapFile(graphPath + NODE_TO_SWHID, SWHID_BIN_SIZE); // SWHID -> node this.mph = loadMph(graphPath + ".mph"); this.orderInputStream = new FileInputStream(graphPath + ".order"); this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel()); } @SuppressWarnings("unchecked") public static Object2LongFunction loadMph(String path) throws IOException { Object obj; try { obj = BinIO.loadObject(path); } catch (ClassNotFoundException e) { throw new IOException(e.getMessage()); } Object2LongFunction res = (Object2LongFunction) obj; // Backward-compatibility for old maps parametrized with . // New maps should be parametrized with , which is faster. try { // Try to call it with bytes, will fail if it's a O2LF. res.getLong("42".getBytes(StandardCharsets.UTF_8)); } catch (ClassCastException e) { class StringCompatibleByteFunction implements Object2LongFunction, Size64 { private final Object2LongFunction legacyFunction; public StringCompatibleByteFunction(Object2LongFunction legacyFunction) { this.legacyFunction = legacyFunction; } @Override public long getLong(Object o) { byte[] bi = (byte[]) o; return legacyFunction.getLong(new String(bi, StandardCharsets.UTF_8)); } @Override public int size() { return legacyFunction.size(); } @Override public long size64() { return (legacyFunction instanceof Size64) ? ((Size64) legacyFunction).size64() : legacyFunction.size(); } } Object2LongFunction mphLegacy = (Object2LongFunction) obj; return new StringCompatibleByteFunction(mphLegacy); /* * res = (o -> { byte[] bi = (byte[]) o; return mphLegacy.getLong(new String(bi, * StandardCharsets.UTF_8)); }); */ } // End of backward-compatibility block return res; } /** * Converts byte-form SWHID to corresponding long node id. Low-level function, does not check if the * SWHID is valid. * * @param swhid node represented as bytes * @return corresponding node as a long id */ public long getNodeId(byte[] swhid) { // 1. Hash the SWHID with the MPH to get its original ID long origNodeId = mph.getLong(swhid); // 2. Use the order permutation to get the position in the permuted graph return this.orderMap.getLong(origNodeId); } /** * Converts SWHID to corresponding long node id. * * @param swhid node represented as a {@link SWHID} * @param checkExists if true, error if the SWHID is not present in the graph, if false the check * will be skipped and invalid data will be returned for non-existing SWHIDs. * @return corresponding node as a long id * @see SWHID */ public long getNodeId(SWHID swhid, boolean checkExists) { // Convert the SWHID to bytes and call getNodeId() long nodeId = getNodeId(swhid.toString().getBytes(StandardCharsets.US_ASCII)); // Check that the position effectively corresponds to a real node using the reverse map. // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid. if (!checkExists || getSWHID(nodeId).equals(swhid)) { return nodeId; } else { throw new IllegalArgumentException("Unknown SWHID: " + swhid); } } public long getNodeId(SWHID swhid) { return getNodeId(swhid, true); } /** * Converts a node long id to corresponding SWHID. * * @param nodeId node as a long id * @return corresponding node as a {@link SWHID} * @see SWHID */ public SWHID getSWHID(long nodeId) { /* * Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's * swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid */ if (nodeId < 0 || nodeId >= nbIds) { throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); } return SWHID.fromBytes(nodeToSwhMap.readAtLine(nodeId)); } /** * Closes the mapping files. */ public void close() throws IOException { orderInputStream.close(); nodeToSwhMap.close(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java index 3c9f6ef..2050bba 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java @@ -1,217 +1,216 @@ package org.softwareheritage.graph.maps; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SWHID; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.Scanner; import java.util.concurrent.TimeUnit; /** * Create maps needed at runtime by the graph service, in particular: *

*

* * @author The Software Heritage developers */ public class NodeMapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class); /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException { if (args.length != 2) { logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV"); System.exit(1); } String graphPath = args[0]; String tmpDir = args[1]; logger.info("starting maps generation..."); precomputeNodeIdMap(graphPath, tmpDir); logger.info("maps generation completed"); } /** * Computes and dumps on disk mapping files. * * @param graphPath path of the compressed graph */ // Suppress warning for Object2LongFunction cast @SuppressWarnings("unchecked") static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS); plSWHID2Node.itemsName = "swhid→node"; plNode2SWHID.itemsName = "node→swhid"; /* * avg speed for swhid→node is sometime skewed due to write to the sort pipe hanging when sort is * sorting; hence also desplay local speed */ plSWHID2Node.displayLocalSpeed = true; // first half of SWHID->node mapping: SWHID -> WebGraph MPH (long) Object2LongFunction mphMap = null; try { logger.info("loading MPH function..."); mphMap = (Object2LongFunction) BinIO.loadObject(graphPath + ".mph"); logger.info("MPH function loaded"); } catch (ClassNotFoundException e) { logger.error("unknown class object in .mph file: " + e); System.exit(2); } long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); plSWHID2Node.expectedUpdates = nbIds; plNode2SWHID.expectedUpdates = nbIds; // second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); logger.info("loading BFS order file..."); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); logger.info("BFS order file loaded"); if (loaded != nbIds) { logger.error("graph contains " + nbIds + " nodes, but read " + loaded); System.exit(2); } /* * Create mapping SWHID -> WebGraph node id, by sequentially reading nodes, hashing them with MPH, * and permuting according to BFS order */ FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator swhidIterator = new LineIterator(buffer); /* * The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically * sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the * sorting job to /usr/bin/sort via pipes */ ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); // for the binary format of swhidToNodeMap, see Python module swh.graph.swhid:SwhidToIntMap // for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap try (DataOutputStream swhidToNodeMap = new DataOutputStream( - new BufferedOutputStream(new FileOutputStream(graphPath + Graph.SWHID_TO_NODE))); + new BufferedOutputStream(new FileOutputStream(graphPath + NodeIdMap.SWHID_TO_NODE))); BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream( - new FileOutputStream(graphPath + Graph.NODE_TO_SWHID))) { + new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) { /* * background handler for sort output, it will be fed SWHID/node pairs while swhidToNodeMap is being * filled, and will itself fill nodeToSwhidMap as soon as data from sort is ready */ SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID); outputHandler.start(); /* * Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to * efficiently check edge restrictions. */ final int log2NbTypes = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); final int nbBitsPerNodeType = log2NbTypes; LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); plSWHID2Node.start("filling swhid2node map"); for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) { String swhidStr = swhidIterator.next().toString(); SWHID swhid = new SWHID(swhidStr); byte[] swhidBin = swhid.toBytes(); long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII)); long nodeId = BigArrays.get(bfsMap, mphId); swhidToNodeMap.write(swhidBin, 0, swhidBin.length); swhidToNodeMap.writeLong(nodeId); sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII)); nodeTypesMap.set(nodeId, swhid.getType().ordinal()); plSWHID2Node.lightUpdate(); } plSWHID2Node.done(); sort_stdin.close(); // write type map logger.info("storing type map"); - BinIO.storeObject(nodeTypesMap, graphPath + Graph.NODE_TO_TYPE); + BinIO.storeObject(nodeTypesMap, graphPath + NodeTypesMap.NODE_TO_TYPE); logger.info("type map stored"); // wait for nodeToSwhidMap filling try { logger.info("waiting for node2swhid map..."); int sortExitCode = sort.waitFor(); if (sortExitCode != 0) { logger.error("sort returned non-zero exit code: " + sortExitCode); System.exit(2); } outputHandler.join(); } catch (InterruptedException e) { logger.error("processing of sort output failed with: " + e); System.exit(2); } } } private static class SortOutputHandler extends Thread { private Scanner input; private OutputStream output; private ProgressLogger pl; SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) { this.input = new Scanner(input, StandardCharsets.US_ASCII); this.output = output; this.pl = pl; } public void run() { boolean sortDone = false; logger.info("node2swhid: waiting for sort output..."); while (input.hasNextLine()) { if (!sortDone) { sortDone = true; this.pl.start("filling node2swhid map"); } String line = input.nextLine(); // format: SWHID NODE_ID SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID try { output.write((byte[]) swhid.toBytes()); } catch (IOException e) { logger.error("writing to node->SWHID map failed with: " + e); } this.pl.lightUpdate(); } this.pl.done(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java index c835e02..befe094 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java @@ -1,52 +1,54 @@ package org.softwareheritage.graph.maps; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; -import org.softwareheritage.graph.Graph; import org.softwareheritage.graph.Node; import java.io.IOException; /** * Mapping between long node id and SWH node type as described in the * data model. *

* The type mapping is pre-computed and dumped on disk in the {@link NodeMapBuilder} class, then it * is loaded in-memory here using fastutil LongBigList. * To be space-efficient, the mapping is stored as a bitmap using minimum number of bits per * {@link Node.Type}. * * @author The Software Heritage developers */ public class NodeTypesMap { + /** File extension for the long node id to node type map */ + public static final String NODE_TO_TYPE = ".node2type.map"; + /** * Array storing for each node its type */ public LongBigList nodeTypesMap; /** * Constructor. * * @param graphPath path and basename of the compressed graph */ public NodeTypesMap(String graphPath) throws IOException { try { - nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + Graph.NODE_TO_TYPE); + nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + NODE_TO_TYPE); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Unknown class object: " + e); } } /** * Returns node type from a node long id. * * @param nodeId node as a long id * @return corresponding {@link Node.Type} value * @see org.softwareheritage.graph.Node.Type */ public Node.Type getType(long nodeId) { long type = nodeTypesMap.getLong(nodeId); return Node.Type.fromInt((int) type); } }