Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123329
D8971.id32395.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
8 KB
Subscribers
None
D8971.id32395.diff
View Options
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
@@ -124,6 +124,11 @@
return getProperties().getMessage(nodeId);
}
+ /** @see SwhGraphProperties#getMessageBase64(long) */
+ default byte[] getMessageBase64(long nodeId) {
+ return getProperties().getMessageBase64(nodeId);
+ }
+
/** @see SwhGraphProperties#getUrl(long) */
default String getUrl(long nodeId) {
return getProperties().getUrl(nodeId);
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
--- a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
@@ -274,6 +274,11 @@
/** Get the message of the given revision or release node */
public byte[] getMessage(long nodeId) {
+ return Base64.getDecoder().decode(getMessageBase64(nodeId));
+ }
+
+ /** Get the message of the given revision or release node as its raw, still base64-encoded bytes (not decoded) */
+ public byte[] getMessageBase64(long nodeId) {
if (messageBuffer == null || messageOffsets == null) {
throw new IllegalStateException("Messages not loaded");
}
@@ -281,7 +286,7 @@
if (startOffset == -1) {
return null;
}
- return Base64.getDecoder().decode(getLine(messageBuffer, startOffset));
+ return getLine(messageBuffer, startOffset);
}
/** Get the URL of the given origin node */
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java b/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java
--- a/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java
@@ -8,6 +8,9 @@
/* For each origin and each person, outputs a line "origin_id,person_id",
* if that person contributed to the origin.
*
+ * A CSV table with the columns "origin_id,origin_url_base64" is also written
+ * to the path given as the second command-line argument.
+ *
* This takes the output of TopoSort on stdin.
*
*/
@@ -17,6 +20,7 @@
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import org.softwareheritage.graph.*;
+import java.io.PrintWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
@@ -30,16 +34,20 @@
private static boolean optimizeReuse = true;
public static void main(String[] args) throws IOException, ClassNotFoundException {
- if (args.length != 1) {
- System.err.println("Syntax: java org.softwareheritage.graph.utils.FindEarliestRevision <path/to/graph>");
+ if (args.length != 2) {
+ System.err.println(
+ "Syntax: java org.softwareheritage.graph.utils.FindEarliestRevision <path/to/graph> <path/to/origin_urls.csv>");
System.exit(1);
}
String graphBasename = args[0];
+ PrintWriter originUrlsFileWriter = new PrintWriter(args[1]);
System.err.println("Loading graph " + graphBasename + " ...");
SwhBidirectionalGraph underlyingGraph = SwhBidirectionalGraph.loadMapped(graphBasename);
System.err.println("Loading person ids");
underlyingGraph.loadPersonIds();
+ System.err.println("Loading messages");
+ underlyingGraph.loadMessages();
System.err.println("Selecting subgraph.");
Subgraph graph = new Subgraph(underlyingGraph, new AllowedNodes("rev,rel,snp,ori"));
System.err.println("Graph loaded.");
@@ -61,6 +69,7 @@
HashMap<Long, Long> pendingSuccessors = new HashMap<>();
System.out.println("origin_id,person_id");
+ originUrlsFileWriter.println("origin_id,origin_url_base64");
while (stdin.hasNextLine()) {
String cells[] = stdin.nextLine().strip().split(",", -1);
SWHID nodeSWHID = new SWHID(cells[0]);
@@ -137,6 +146,10 @@
nodeContributors.forEach((contributorId) -> {
System.out.format("%d,%d\n", nodeId, contributorId);
});
+ byte[] url = underlyingGraph.getMessageBase64(nodeId);
+ if (url != null) {
+ originUrlsFileWriter.format("%d,%s\n", nodeId, new String(url));
+ }
}
if (successorCount > 0) {
@@ -147,5 +160,7 @@
pendingSuccessors.put(nodeId, successorCount);
}
}
+
+ originUrlsFileWriter.flush();
}
}
diff --git a/swh/graph/luigi/origin_contributors.py b/swh/graph/luigi/origin_contributors.py
--- a/swh/graph/luigi/origin_contributors.py
+++ b/swh/graph/luigi/origin_contributors.py
@@ -30,6 +30,7 @@
local_graph_path = luigi.PathParameter()
topological_order_path = luigi.PathParameter()
origin_contributors_path = luigi.PathParameter()
+ origin_urls_path = luigi.PathParameter()
graph_name = luigi.Parameter(default="graph")
def requires(self) -> List[luigi.Task]:
@@ -50,14 +51,23 @@
def run(self) -> None:
"""Runs org.softwareheritage.graph.utils.TopoSort and compresses"""
+ import tempfile
+
class_name = "org.softwareheritage.graph.utils.ListOriginContributors"
- script = f"""
- zstdcat {self.topological_order_path} \
- | java {class_name} '{self.local_graph_path}/{self.graph_name}' \
- | pv --line-mode --wait \
- | zstdmt -19
- """
- run_script(script, self.origin_contributors_path)
+ with tempfile.NamedTemporaryFile(
+ prefix="origin_urls_", suffix=".csv"
+ ) as origin_urls_fd:
+ script = f"""
+ zstdcat {self.topological_order_path} \
+ | java {class_name} '{self.local_graph_path}/{self.graph_name}' '{origin_urls_fd.name}' \
+ | pv --line-mode --wait \
+ | zstdmt -19
+ """ # noqa
+ run_script(script, self.origin_contributors_path)
+ run_script(
+ f"pv '{origin_urls_fd.name}' | zstdmt -19",
+ self.origin_urls_path,
+ )
class ExportDeanonymizationTable(luigi.Task):
diff --git a/swh/graph/tests/test_origin_contributors.py b/swh/graph/tests/test_origin_contributors.py
--- a/swh/graph/tests/test_origin_contributors.py
+++ b/swh/graph/tests/test_origin_contributors.py
@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import base64
import datetime
from pathlib import Path
import subprocess
@@ -37,6 +38,21 @@
0,2
"""
+assert (
+ base64.b64decode("aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg=")
+ == b"https://example.com/swh/graph"
+)
+assert (
+ base64.b64decode("aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGgy")
+ == b"https://example.com/swh/graph2"
+)
+
+ORIGIN_URLS = """\
+origin_id,origin_url_base64
+2,aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg=
+0,aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGgy
+"""
+
DEANONYMIZATION_TABLE = """\
sha256_base64,base64,escaped
8qhF7WQ2bmeoRbZipAaqtNw6QdOCDcpggLWCQLzITsI=,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe <jdoe@example.org>
@@ -65,6 +81,7 @@
topological_order_path = tmpdir / "topo_order.csv.zst"
origin_contributors_path = tmpdir / "origin_contributors.csv.zst"
+ origin_urls_path = tmpdir / "origin_urls.csv.zst"
subprocess.run(
["zstdmt", "-o", topological_order_path],
@@ -76,15 +93,18 @@
local_graph_path=DATA_DIR / "compressed",
topological_order_path=topological_order_path,
origin_contributors_path=origin_contributors_path,
+ origin_urls_path=origin_urls_path,
graph_name="example",
)
task.run()
csv_text = subprocess.check_output(["zstdcat", origin_contributors_path]).decode()
-
assert csv_text == ORIGIN_CONTRIBUTORS
+ urls_text = subprocess.check_output(["zstdcat", origin_urls_path]).decode()
+ assert urls_text == ORIGIN_URLS
+
def test_export_deanonymization_table(tmpdir, swh_storage_postgresql, swh_storage):
tmpdir = Path(tmpdir)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Dec 18, 8:34 PM (12 m, 48 s ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214818
Attached To
D8971: origin_contributors: Write table mapping origin ID to origin URL (base64-encoded)
Event Timeline
Log In to Comment