diff --git a/swh/dataset/graph.py b/swh/dataset/graph.py --- a/swh/dataset/graph.py +++ b/swh/dataset/graph.py @@ -184,13 +184,22 @@ function) ; - deflate the edges ; - count the number of edges and write it in graph.edges.count.txt ; + - count the number of occurrences of each edge type and write them + in graph.edges.stats.txt ; - concatenate all the (deflated) nodes from the export with the destination edges, and sort the output to get the list of unique graph nodes ; - count the number of unique graph nodes and write it in graph.nodes.count.txt ; + - count the number of occurrences of each node type and write them + in graph.nodes.stats.txt ; - compress and write the resulting nodes in graph.nodes.csv.zst. """ + + # Use awk as a replacement for `sort | uniq -c` to avoid buffering everything + # in memory + counter_command = "awk '{ t[$0]++ } END { for (i in t) print i,t[i] }'" + # Use bytes for the sorting algorithm (faster than being locale-specific) env = { **os.environ.copy(), @@ -212,15 +221,20 @@ "tee {export_path}/graph.edges.csv.zst |" "zstdcat |" "tee >( wc -l > {export_path}/graph.edges.count.txt ) |" + "tee >( cut -d: -f3,6 | {counter_command} | sort " + " > {export_path}/graph.edges.stats.txt ) |" "cut -d' ' -f2 | " "cat - <( zstdcat {export_path}/*/*.nodes.csv.zst ) | " "sort -u -S{sort_buffer_size} -T{buffer_path} | " "tee >( wc -l > {export_path}/graph.nodes.count.txt ) |" + "tee >( cut -d: -f3 | {counter_command} | sort " + " > {export_path}/graph.nodes.stats.txt ) |" "zstdmt > {export_path}/graph.nodes.csv.zst" ).format( export_path=shlex.quote(str(export_path)), buffer_path=shlex.quote(str(buffer_path)), sort_buffer_size=shlex.quote(sort_buffer_size), + counter_command=counter_command, ), ], env=env, diff --git a/swh/dataset/test/test_graph.py b/swh/dataset/test/test_graph.py --- a/swh/dataset/test/test_graph.py +++ b/swh/dataset/test/test_graph.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See
top-level LICENSE file for more information +import collections import hashlib from typing import Tuple @@ -484,9 +485,32 @@ output_nodes = list(filter(bool, output_nodes)) output_edges = list(filter(bool, output_edges)) - expected_nodes = set(input_nodes) | set(l.split()[1] for l in input_edges) + expected_nodes = set(input_nodes) | set(e.split()[1] for e in input_edges) assert output_nodes == sorted(expected_nodes) assert int((tmp_path / "graph.nodes.count.txt").read_text()) == len(expected_nodes) assert sorted(output_edges) == sorted(input_edges) assert int((tmp_path / "graph.edges.count.txt").read_text()) == len(input_edges) + + actual_node_stats = (tmp_path / "graph.nodes.stats.txt").read_text().strip() + expected_node_stats = "\n".join( + sorted( + "{} {}".format(k, v) + for k, v in collections.Counter( + node.split(":")[2] for node in expected_nodes + ).items() + ) + ) + assert actual_node_stats == expected_node_stats + + actual_edge_stats = (tmp_path / "graph.edges.stats.txt").read_text().strip() + expected_edge_stats = "\n".join( + sorted( + "{} {}".format(k, v) + for k, v in collections.Counter( + "{}:{}".format(edge.split(":")[2], edge.split(":")[5]) + for edge in input_edges + ).items() + ) + ) + assert actual_edge_stats == expected_edge_stats