Page MenuHomeSoftware Heritage

D8971.id32395.diff
No OneTemporary

D8971.id32395.diff

diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
@@ -124,6 +124,11 @@
return getProperties().getMessage(nodeId);
}
+ /** @see SwhGraphProperties#getMessageBase64(long) */
+ default byte[] getMessageBase64(long nodeId) {
+ return getProperties().getMessageBase64(nodeId);
+ }
+
/** @see SwhGraphProperties#getUrl(long) */
default String getUrl(long nodeId) {
return getProperties().getUrl(nodeId);
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
@@ -274,6 +274,11 @@
/** Get the message of the given revision or release node */
public byte[] getMessage(long nodeId) {
+ return Base64.getDecoder().decode(getMessageBase64(nodeId));
+ }
+
+ /** Get the message of the given revision or release node as its raw base64-encoded bytes (not decoded) */
+ public byte[] getMessageBase64(long nodeId) {
if (messageBuffer == null || messageOffsets == null) {
throw new IllegalStateException("Messages not loaded");
}
@@ -281,7 +286,7 @@
if (startOffset == -1) {
return null;
}
- return Base64.getDecoder().decode(getLine(messageBuffer, startOffset));
+ return getLine(messageBuffer, startOffset);
}
/** Get the URL of the given origin node */
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java b/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java
--- a/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java
@@ -8,6 +8,9 @@
/* For each origin and each person, outputs a line "origin_id,person_id",
* if that person contributed to the origin.
*
+ * A CSV table with the header "origin_id,origin_url_base64" is also written
+ * to the path given as the second command-line argument.
+ *
* This takes the output of TopoSort on stdin.
*
*/
@@ -17,6 +20,7 @@
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.*;
+import java.io.PrintWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
@@ -30,16 +34,20 @@
private static boolean optimizeReuse = true;
public static void main(String[] args) throws IOException, ClassNotFoundException {
- if (args.length != 1) {
- System.err.println("Syntax: java org.softwareheritage.graph.utils.FindEarliestRevision <path/to/graph>");
+ if (args.length != 2) {
+ System.err.println(
+ "Syntax: java org.softwareheritage.graph.utils.ListOriginContributors <path/to/graph> <path/to/origin_urls.csv>");
System.exit(1);
}
String graphBasename = args[0];
+ PrintWriter originUrlsFileWriter = new PrintWriter(args[1]);
System.err.println("Loading graph " + graphBasename + " ...");
SwhBidirectionalGraph underlyingGraph = SwhBidirectionalGraph.loadMapped(graphBasename);
System.err.println("Loading person ids");
underlyingGraph.loadPersonIds();
+ System.err.println("Loading messages");
+ underlyingGraph.loadMessages();
System.err.println("Selecting subgraph.");
Subgraph graph = new Subgraph(underlyingGraph, new AllowedNodes("rev,rel,snp,ori"));
System.err.println("Graph loaded.");
@@ -61,6 +69,7 @@
HashMap<Long, Long> pendingSuccessors = new HashMap<>();
System.out.println("origin_id,person_id");
+ originUrlsFileWriter.println("origin_id,origin_url_base64");
while (stdin.hasNextLine()) {
String cells[] = stdin.nextLine().strip().split(",", -1);
SWHID nodeSWHID = new SWHID(cells[0]);
@@ -137,6 +146,10 @@
nodeContributors.forEach((contributorId) -> {
System.out.format("%d,%d\n", nodeId, contributorId);
});
+ byte[] url = underlyingGraph.getMessageBase64(nodeId);
+ if (url != null) {
+ originUrlsFileWriter.format("%d,%s\n", nodeId, new String(url));
+ }
}
if (successorCount > 0) {
@@ -147,5 +160,7 @@
pendingSuccessors.put(nodeId, successorCount);
}
}
+
+ originUrlsFileWriter.flush();
}
}
diff --git a/swh/graph/luigi/origin_contributors.py b/swh/graph/luigi/origin_contributors.py
--- a/swh/graph/luigi/origin_contributors.py
+++ b/swh/graph/luigi/origin_contributors.py
@@ -30,6 +30,7 @@
local_graph_path = luigi.PathParameter()
topological_order_path = luigi.PathParameter()
origin_contributors_path = luigi.PathParameter()
+ origin_urls_path = luigi.PathParameter()
graph_name = luigi.Parameter(default="graph")
def requires(self) -> List[luigi.Task]:
@@ -50,14 +51,23 @@
def run(self) -> None:
"""Runs org.softwareheritage.graph.utils.TopoSort and compresses"""
+ import tempfile
+
class_name = "org.softwareheritage.graph.utils.ListOriginContributors"
- script = f"""
- zstdcat {self.topological_order_path} \
- | java {class_name} '{self.local_graph_path}/{self.graph_name}' \
- | pv --line-mode --wait \
- | zstdmt -19
- """
- run_script(script, self.origin_contributors_path)
+ with tempfile.NamedTemporaryFile(
+ prefix="origin_urls_", suffix=".csv"
+ ) as origin_urls_fd:
+ script = f"""
+ zstdcat {self.topological_order_path} \
+ | java {class_name} '{self.local_graph_path}/{self.graph_name}' '{origin_urls_fd.name}' \
+ | pv --line-mode --wait \
+ | zstdmt -19
+ """ # noqa
+ run_script(script, self.origin_contributors_path)
+ run_script(
+ f"pv '{origin_urls_fd.name}' | zstdmt -19",
+ self.origin_urls_path,
+ )
class ExportDeanonymizationTable(luigi.Task):
diff --git a/swh/graph/tests/test_origin_contributors.py b/swh/graph/tests/test_origin_contributors.py
--- a/swh/graph/tests/test_origin_contributors.py
+++ b/swh/graph/tests/test_origin_contributors.py
@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import base64
import datetime
from pathlib import Path
import subprocess
@@ -37,6 +38,21 @@
0,2
"""
+assert (
+ base64.b64decode("aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg=")
+ == b"https://example.com/swh/graph"
+)
+assert (
+ base64.b64decode("aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGgy")
+ == b"https://example.com/swh/graph2"
+)
+
+ORIGIN_URLS = """\
+origin_id,origin_url_base64
+2,aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg=
+0,aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGgy
+"""
+
DEANONYMIZATION_TABLE = """\
sha256_base64,base64,escaped
8qhF7WQ2bmeoRbZipAaqtNw6QdOCDcpggLWCQLzITsI=,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe <jdoe@example.org>
@@ -65,6 +81,7 @@
topological_order_path = tmpdir / "topo_order.csv.zst"
origin_contributors_path = tmpdir / "origin_contributors.csv.zst"
+ origin_urls_path = tmpdir / "origin_urls.csv.zst"
subprocess.run(
["zstdmt", "-o", topological_order_path],
@@ -76,15 +93,18 @@
local_graph_path=DATA_DIR / "compressed",
topological_order_path=topological_order_path,
origin_contributors_path=origin_contributors_path,
+ origin_urls_path=origin_urls_path,
graph_name="example",
)
task.run()
csv_text = subprocess.check_output(["zstdcat", origin_contributors_path]).decode()
-
assert csv_text == ORIGIN_CONTRIBUTORS
+ urls_text = subprocess.check_output(["zstdcat", origin_urls_path]).decode()
+ assert urls_text == ORIGIN_URLS
+
def test_export_deanonymization_table(tmpdir, swh_storage_postgresql, swh_storage):
tmpdir = Path(tmpdir)

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 8:34 PM (2 h, 45 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214818

Event Timeline