diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java --- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java @@ -124,6 +124,11 @@ return getProperties().getMessage(nodeId); } + /** @see SwhGraphProperties#getMessageBase64(long) */ + default byte[] getMessageBase64(long nodeId) { + return getProperties().getMessageBase64(nodeId); + } + /** @see SwhGraphProperties#getUrl(long) */ default String getUrl(long nodeId) { return getProperties().getUrl(nodeId); diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java --- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java +++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java @@ -274,6 +274,13 @@ /** Get the message of the given revision or release node */ public byte[] getMessage(long nodeId) { + // getMessageBase64 may return null; passing that to decode() would throw NullPointerException. + byte[] encodedMessage = getMessageBase64(nodeId); + return (encodedMessage == null) ? null : Base64.getDecoder().decode(encodedMessage); + } + + /** Get the message of the given revision or release node, encoded as a base64 byte array */ + public byte[] getMessageBase64(long nodeId) { if (messageBuffer == null || messageOffsets == null) { throw new IllegalStateException("Messages not loaded"); } @@ -281,7 +288,7 @@ if (startOffset == -1) { return null; } - return Base64.getDecoder().decode(getLine(messageBuffer, startOffset)); + return getLine(messageBuffer, startOffset); } /** Get the URL of the given origin node */ diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java b/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java --- a/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java @@ -8,6 +8,9 @@ /* For each origin and each person, 
outputs a line "origin_id,person_id", * if that person contributed to the origin. * + * A .csv table containing "origin_id,origin_url_base64" is also written + * to the given path. + * * This takes the output of TopoSort on stdin. * */ @@ -17,6 +20,7 @@ import it.unimi.dsi.big.webgraph.LazyLongIterator; import org.softwareheritage.graph.*; +import java.io.PrintWriter; import java.io.IOException; import java.util.HashMap; import java.util.HashSet; @@ -30,16 +34,20 @@ private static boolean optimizeReuse = true; public static void main(String[] args) throws IOException, ClassNotFoundException { - if (args.length != 1) { - System.err.println("Syntax: java org.softwareheritage.graph.utils.FindEarliestRevision "); + if (args.length != 2) { + System.err.println( + "Syntax: java org.softwareheritage.graph.utils.ListOriginContributors <path/to/graph> <path/to/origin_urls.csv>"); System.exit(1); } String graphBasename = args[0]; + PrintWriter originUrlsFileWriter = new PrintWriter(args[1]); System.err.println("Loading graph " + graphBasename + " ..."); SwhBidirectionalGraph underlyingGraph = SwhBidirectionalGraph.loadMapped(graphBasename); System.err.println("Loading person ids"); underlyingGraph.loadPersonIds(); + System.err.println("Loading messages"); + underlyingGraph.loadMessages(); System.err.println("Selecting subgraph."); Subgraph graph = new Subgraph(underlyingGraph, new AllowedNodes("rev,rel,snp,ori")); System.err.println("Graph loaded."); @@ -61,6 +69,7 @@ HashMap pendingSuccessors = new HashMap<>(); System.out.println("origin_id,person_id"); + originUrlsFileWriter.println("origin_id,origin_url_base64"); while (stdin.hasNextLine()) { String cells[] = stdin.nextLine().strip().split(",", -1); SWHID nodeSWHID = new SWHID(cells[0]); @@ -137,6 +146,10 @@ nodeContributors.forEach((contributorId) -> { System.out.format("%d,%d\n", nodeId, contributorId); }); + byte[] url = underlyingGraph.getMessageBase64(nodeId); + if (url != null) { + originUrlsFileWriter.format("%d,%s\n", nodeId, new String(url, java.nio.charset.StandardCharsets.US_ASCII)); + } } if 
(successorCount > 0) { @@ -147,5 +160,7 @@ pendingSuccessors.put(nodeId, successorCount); } } + + originUrlsFileWriter.close(); } } diff --git a/swh/graph/luigi/origin_contributors.py b/swh/graph/luigi/origin_contributors.py --- a/swh/graph/luigi/origin_contributors.py +++ b/swh/graph/luigi/origin_contributors.py @@ -30,6 +30,7 @@ local_graph_path = luigi.PathParameter() topological_order_path = luigi.PathParameter() origin_contributors_path = luigi.PathParameter() + origin_urls_path = luigi.PathParameter() graph_name = luigi.Parameter(default="graph") def requires(self) -> List[luigi.Task]: @@ -50,14 +51,23 @@ def run(self) -> None: - """Runs org.softwareheritage.graph.utils.TopoSort and compresses""" + """Runs org.softwareheritage.graph.utils.ListOriginContributors and compresses""" + import tempfile + class_name = "org.softwareheritage.graph.utils.ListOriginContributors" - script = f""" - zstdcat {self.topological_order_path} \ - | java {class_name} '{self.local_graph_path}/{self.graph_name}' \ - | pv --line-mode --wait \ - | zstdmt -19 - """ - run_script(script, self.origin_contributors_path) + with tempfile.NamedTemporaryFile( + prefix="origin_urls_", suffix=".csv" + ) as origin_urls_fd: + script = f""" + zstdcat {self.topological_order_path} \ + | java {class_name} '{self.local_graph_path}/{self.graph_name}' '{origin_urls_fd.name}' \ + | pv --line-mode --wait \ + | zstdmt -19 + """ # noqa + run_script(script, self.origin_contributors_path) + run_script( + f"pv '{origin_urls_fd.name}' | zstdmt -19", + self.origin_urls_path, + ) class ExportDeanonymizationTable(luigi.Task): diff --git a/swh/graph/tests/test_origin_contributors.py b/swh/graph/tests/test_origin_contributors.py --- a/swh/graph/tests/test_origin_contributors.py +++ b/swh/graph/tests/test_origin_contributors.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import base64 import datetime from pathlib import Path import subprocess @@ -37,6 +38,21 @@ 0,2 """ +assert ( + 
base64.b64decode("aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg=") + == b"https://example.com/swh/graph" +) +assert ( + base64.b64decode("aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGgy") + == b"https://example.com/swh/graph2" +) + +ORIGIN_URLS = """\ +origin_id,origin_url_base64 +2,aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg= +0,aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGgy +""" + DEANONYMIZATION_TABLE = """\ sha256_base64,base64,escaped 8qhF7WQ2bmeoRbZipAaqtNw6QdOCDcpggLWCQLzITsI=,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe @@ -65,6 +81,7 @@ topological_order_path = tmpdir / "topo_order.csv.zst" origin_contributors_path = tmpdir / "origin_contributors.csv.zst" + origin_urls_path = tmpdir / "origin_urls.csv.zst" subprocess.run( ["zstdmt", "-o", topological_order_path], @@ -76,15 +93,18 @@ local_graph_path=DATA_DIR / "compressed", topological_order_path=topological_order_path, origin_contributors_path=origin_contributors_path, + origin_urls_path=origin_urls_path, graph_name="example", ) task.run() csv_text = subprocess.check_output(["zstdcat", origin_contributors_path]).decode() - assert csv_text == ORIGIN_CONTRIBUTORS + urls_text = subprocess.check_output(["zstdcat", origin_urls_path]).decode() + assert urls_text == ORIGIN_URLS + def test_export_deanonymization_table(tmpdir, swh_storage_postgresql, swh_storage): tmpdir = Path(tmpdir)