Changeset View
Changeset View
Standalone View
Standalone View
swh/dataset/test/test_graph.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from base64 import b64encode | |||||
import collections | import collections | ||||
import hashlib | import hashlib | ||||
from typing import Tuple | from typing import Tuple | ||||
from unittest.mock import Mock, call | from unittest.mock import Mock, call | ||||
import pytest | import pytest | ||||
from swh.dataset.graph import process_messages, sort_graph_nodes | from swh.dataset.graph import process_messages, sort_graph_nodes | ||||
▲ Show 20 Lines • Show All 92 Lines • ▼ Show 20 Lines | |||||
def binhash(s: str) -> bytes:
    """Return the raw SHA-1 digest of ``s``.

    ``s`` is encoded with ``str.encode()`` (UTF-8 by default) before
    hashing. The result is the 20-byte binary digest, used in this module
    as a stand-in for object identifiers in test fixtures.
    """
    return hashlib.sha1(s.encode()).digest()
def hexhash(s: str) -> str:
    """Return the SHA-1 digest of ``s`` as a lowercase hexadecimal string.

    ``s`` is encoded with ``str.encode()`` (UTF-8 by default) before
    hashing. This is the hex form of what ``hashlib.sha1(...).digest()``
    yields for the same input, and is used to build the expected
    ``swh:1:<type>:<hex>`` identifiers in assertions.
    """
    return hashlib.sha1(s.encode()).hexdigest()
def b64e(s: str) -> str:
    """Return the standard base64 encoding of ``s`` as text.

    ``s`` is encoded to bytes with ``str.encode()`` (UTF-8 by default),
    base64-encoded, and the resulting ASCII bytes are decoded back to
    ``str``. Used to build the expected edge labels in assertions.
    """
    return b64encode(s.encode()).decode()
def test_export_origin_visit_status(exporter): | def test_export_origin_visit_status(exporter): | ||||
node_writer, edge_writer = exporter( | node_writer, edge_writer = exporter( | ||||
{ | { | ||||
"origin_visit_status": [ | "origin_visit_status": [ | ||||
{ | { | ||||
**TEST_ORIGIN_VISIT_STATUS, | **TEST_ORIGIN_VISIT_STATUS, | ||||
"origin": "ori1", | "origin": "ori1", | ||||
"snapshot": binhash("snp1"), | "snapshot": binhash("snp1"), | ||||
▲ Show 20 Lines • Show All 52 Lines • ▼ Show 20 Lines | node_writer, edge_writer = exporter( | ||||
} | } | ||||
) | ) | ||||
assert node_writer.mock_calls == [ | assert node_writer.mock_calls == [ | ||||
call(f"swh:1:snp:{hexhash('snp1')}\n"), | call(f"swh:1:snp:{hexhash('snp1')}\n"), | ||||
call(f"swh:1:snp:{hexhash('snp2')}\n"), | call(f"swh:1:snp:{hexhash('snp2')}\n"), | ||||
call(f"swh:1:snp:{hexhash('snp3')}\n"), | call(f"swh:1:snp:{hexhash('snp3')}\n"), | ||||
] | ] | ||||
assert edge_writer.mock_calls == [ | assert edge_writer.mock_calls == [ | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}\n"), | call( | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}\n"), | f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}" | ||||
call(f"swh:1:snp:{hexhash('snp2')} swh:1:rev:{hexhash('rev1')}\n"), | f" {b64e('refs/heads/master')}\n" | ||||
call(f"swh:1:snp:{hexhash('snp2')} swh:1:rev:{hexhash('rev2')}\n"), | ), | ||||
call(f"swh:1:snp:{hexhash('snp2')} swh:1:cnt:{hexhash('cnt1')}\n"), | call( | ||||
call(f"swh:1:snp:{hexhash('snp2')} swh:1:dir:{hexhash('dir1')}\n"), | f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}" | ||||
call(f"swh:1:snp:{hexhash('snp2')} swh:1:rel:{hexhash('rel1')}\n"), | f" {b64e('HEAD')}\n" | ||||
call(f"swh:1:snp:{hexhash('snp2')} swh:1:snp:{hexhash('snp1')}\n"), | ), | ||||
call( | |||||
f"swh:1:snp:{hexhash('snp2')} swh:1:rev:{hexhash('rev1')}" | |||||
f" {b64e('refs/heads/master')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp2')} swh:1:rev:{hexhash('rev2')}" | |||||
f" {b64e('HEAD')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp2')} swh:1:cnt:{hexhash('cnt1')}" | |||||
f" {b64e('bcnt')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp2')} swh:1:dir:{hexhash('dir1')}" | |||||
f" {b64e('bdir')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp2')} swh:1:rel:{hexhash('rel1')}" | |||||
f" {b64e('brel')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp2')} swh:1:snp:{hexhash('snp1')}" | |||||
f" {b64e('bsnp')}\n" | |||||
), | |||||
] | ] | ||||
def test_export_snapshot_aliases(exporter):
    """Check the edges written for a snapshot whose branches are chained aliases.

    The snapshot has one revision branch (``origin_branch``) and three
    aliases forming the chain ``alias3 -> alias2 -> alias1 -> origin_branch``.
    The test expects one edge per branch, each pointing from the snapshot to
    the revision targeted by ``origin_branch`` and labelled with the
    base64-encoded name of the branch it was produced for.
    """
    node_writer, edge_writer = exporter(
        {
            "snapshot": [
                {
                    "id": binhash("snp1"),
                    "branches": {
                        b"origin_branch": {
                            "target": binhash("rev1"),
                            "target_type": "revision",
                        },
                        b"alias1": {"target": b"origin_branch", "target_type": "alias"},
                        b"alias2": {"target": b"alias1", "target_type": "alias"},
                        b"alias3": {"target": b"alias2", "target_type": "alias"},
                    },
                },
            ]
        }
    )

    # Only the snapshot itself is written as a node.
    assert node_writer.mock_calls == [call(f"swh:1:snp:{hexhash('snp1')}\n")]

    # Same source and destination on every edge; only the label differs.
    assert edge_writer.mock_calls == [
        call(
            f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}"
            f" {b64e('origin_branch')}\n"
        ),
        call(
            f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}"
            f" {b64e('alias1')}\n"
        ),
        call(
            f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}"
            f" {b64e('alias2')}\n"
        ),
        call(
            f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}"
            f" {b64e('alias3')}\n"
        ),
    ]
def test_export_snapshot_no_pull_requests(exporter): | def test_export_snapshot_no_pull_requests(exporter): | ||||
snp = { | snp = { | ||||
"id": binhash("snp1"), | "id": binhash("snp1"), | ||||
"branches": { | "branches": { | ||||
b"refs/heads/master": { | b"refs/heads/master": { | ||||
"target": binhash("rev1"), | "target": binhash("rev1"), | ||||
Show All 12 Lines | snp = { | ||||
"target": binhash("rev5"), | "target": binhash("rev5"), | ||||
"target_type": "revision", | "target_type": "revision", | ||||
}, | }, | ||||
}, | }, | ||||
} | } | ||||
node_writer, edge_writer = exporter({"snapshot": [snp]}) | node_writer, edge_writer = exporter({"snapshot": [snp]}) | ||||
assert edge_writer.mock_calls == [ | assert edge_writer.mock_calls == [ | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}\n"), | call( | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev2')}\n"), | f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}" | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev3')}\n"), | f" {b64e('refs/heads/master')}\n" | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev4')}\n"), | ), | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev5')}\n"), | call( | ||||
f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev2')}" | |||||
f" {b64e('refs/pull/42')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev3')}" | |||||
f" {b64e('refs/merge-requests/lol')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev4')}" | |||||
f" {b64e('refs/tags/v1.0.0')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev5')}" | |||||
f" {b64e('refs/patch/123456abc')}\n" | |||||
), | |||||
] | ] | ||||
node_writer, edge_writer = exporter( | node_writer, edge_writer = exporter( | ||||
{"snapshot": [snp]}, config={"remove_pull_requests": True} | {"snapshot": [snp]}, config={"remove_pull_requests": True} | ||||
) | ) | ||||
assert edge_writer.mock_calls == [ | assert edge_writer.mock_calls == [ | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}\n"), | call( | ||||
call(f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev4')}\n"), | f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}" | ||||
f" {b64e('refs/heads/master')}\n" | |||||
), | |||||
call( | |||||
f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev4')}" | |||||
f" {b64e('refs/tags/v1.0.0')}\n" | |||||
), | |||||
] | ] | ||||
def test_export_releases(exporter): | def test_export_releases(exporter): | ||||
node_writer, edge_writer = exporter( | node_writer, edge_writer = exporter( | ||||
{ | { | ||||
"release": [ | "release": [ | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines | |||||
def test_export_directory(exporter):
    """Check the nodes and labelled edges written for directory objects.

    Two directories are exported: ``dir1`` with a file, a sub-directory and
    a revision entry, and an empty ``dir2``. The test expects one node per
    exported directory and one edge per entry of ``dir1``, each edge carrying
    the base64-encoded entry name followed by the entry permissions.
    """
    node_writer, edge_writer = exporter(
        {
            "directory": [
                {
                    "id": binhash("dir1"),
                    "entries": [
                        {
                            "type": "file",
                            "target": binhash("cnt1"),
                            "name": b"cnt1",
                            "perms": 0o644,
                        },
                        {
                            "type": "dir",
                            "target": binhash("dir2"),
                            "name": b"dir2",
                            "perms": 0o755,
                        },
                        {
                            "type": "rev",
                            "target": binhash("rev1"),
                            "name": b"rev1",
                            "perms": 0o160000,
                        },
                    ],
                },
                {"id": binhash("dir2"), "entries": []},
            ]
        }
    )

    assert node_writer.mock_calls == [
        call(f"swh:1:dir:{hexhash('dir1')}\n"),
        call(f"swh:1:dir:{hexhash('dir2')}\n"),
    ]

    # The f-strings render the permissions as plain decimal integers
    # (e.g. 0o644 is formatted as "420"), not as octal literals.
    assert edge_writer.mock_calls == [
        call(
            f"swh:1:dir:{hexhash('dir1')} swh:1:cnt:{hexhash('cnt1')}"
            f" {b64e('cnt1')} {0o644}\n"
        ),
        call(
            f"swh:1:dir:{hexhash('dir1')} swh:1:dir:{hexhash('dir2')}"
            f" {b64e('dir2')} {0o755}\n"
        ),
        call(
            f"swh:1:dir:{hexhash('dir1')} swh:1:rev:{hexhash('rev1')}"
            f" {b64e('rev1')} {0o160000}\n"
        ),
    ]
def test_export_content(exporter): | def test_export_content(exporter): | ||||
node_writer, edge_writer = exporter( | node_writer, edge_writer = exporter( | ||||
{ | { | ||||
"content": [ | "content": [ | ||||
{**TEST_CONTENT, "sha1_git": binhash("cnt1"),}, | {**TEST_CONTENT, "sha1_git": binhash("cnt1"),}, | ||||
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines | input_nodes = [ | ||||
for x in range(4) | for x in range(4) | ||||
] | ] | ||||
input_edges = [ | input_edges = [ | ||||
f"swh:1:ori:{hexhash('ori1')} swh:1:snp:{hexhash('snp1')}", | f"swh:1:ori:{hexhash('ori1')} swh:1:snp:{hexhash('snp1')}", | ||||
f"swh:1:ori:{hexhash('ori2')} swh:1:snp:{hexhash('snp2')}", | f"swh:1:ori:{hexhash('ori2')} swh:1:snp:{hexhash('snp2')}", | ||||
f"swh:1:ori:{hexhash('ori3')} swh:1:snp:{hexhash('snp3')}", | f"swh:1:ori:{hexhash('ori3')} swh:1:snp:{hexhash('snp3')}", | ||||
f"swh:1:ori:{hexhash('ori4')} swh:1:snp:{hexhash('snpX')}", # missing dest | f"swh:1:ori:{hexhash('ori4')} swh:1:snp:{hexhash('snpX')}", # missing dest | ||||
f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}", # dup | f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')} {b64e('dup1')}", | ||||
f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')}", # dup | f"swh:1:snp:{hexhash('snp1')} swh:1:rev:{hexhash('rev1')} {b64e('dup2')}", | ||||
f"swh:1:snp:{hexhash('snp3')} swh:1:cnt:{hexhash('cnt1')}", | f"swh:1:snp:{hexhash('snp3')} swh:1:cnt:{hexhash('cnt1')} {b64e('c1')}", | ||||
f"swh:1:snp:{hexhash('snp4')} swh:1:rel:{hexhash('rel1')}", | f"swh:1:snp:{hexhash('snp4')} swh:1:rel:{hexhash('rel1')} {b64e('r1')}", | ||||
f"swh:1:rel:{hexhash('rel1')} swh:1:rel:{hexhash('rel2')}", | f"swh:1:rel:{hexhash('rel1')} swh:1:rel:{hexhash('rel2')}", | ||||
f"swh:1:rel:{hexhash('rel2')} swh:1:rev:{hexhash('rev1')}", | f"swh:1:rel:{hexhash('rel2')} swh:1:rev:{hexhash('rev1')}", | ||||
f"swh:1:rel:{hexhash('rel3')} swh:1:rev:{hexhash('rev2')}", | f"swh:1:rel:{hexhash('rel3')} swh:1:rev:{hexhash('rev2')}", | ||||
f"swh:1:rel:{hexhash('rel4')} swh:1:dir:{hexhash('dir1')}", | f"swh:1:rel:{hexhash('rel4')} swh:1:dir:{hexhash('dir1')}", | ||||
f"swh:1:rev:{hexhash('rev1')} swh:1:rev:{hexhash('rev1')}", # dup | f"swh:1:rev:{hexhash('rev1')} swh:1:rev:{hexhash('rev1')}", # dup | ||||
f"swh:1:rev:{hexhash('rev1')} swh:1:rev:{hexhash('rev1')}", # dup | f"swh:1:rev:{hexhash('rev1')} swh:1:rev:{hexhash('rev1')}", # dup | ||||
f"swh:1:rev:{hexhash('rev1')} swh:1:rev:{hexhash('rev2')}", | f"swh:1:rev:{hexhash('rev1')} swh:1:rev:{hexhash('rev2')}", | ||||
f"swh:1:rev:{hexhash('rev2')} swh:1:rev:{hexhash('revX')}", # missing dest | f"swh:1:rev:{hexhash('rev2')} swh:1:rev:{hexhash('revX')}", # missing dest | ||||
f"swh:1:rev:{hexhash('rev3')} swh:1:rev:{hexhash('rev2')}", | f"swh:1:rev:{hexhash('rev3')} swh:1:rev:{hexhash('rev2')}", | ||||
f"swh:1:rev:{hexhash('rev4')} swh:1:dir:{hexhash('dir1')}", | f"swh:1:rev:{hexhash('rev4')} swh:1:dir:{hexhash('dir1')}", | ||||
f"swh:1:dir:{hexhash('dir1')} swh:1:cnt:{hexhash('cnt1')}", | f"swh:1:dir:{hexhash('dir1')} swh:1:cnt:{hexhash('cnt1')} {b64e('c1')} 42", | ||||
f"swh:1:dir:{hexhash('dir1')} swh:1:dir:{hexhash('dir1')}", | f"swh:1:dir:{hexhash('dir1')} swh:1:dir:{hexhash('dir1')} {b64e('d1')} 1337", | ||||
f"swh:1:dir:{hexhash('dir1')} swh:1:rev:{hexhash('rev1')}", | f"swh:1:dir:{hexhash('dir1')} swh:1:rev:{hexhash('rev1')} {b64e('r1')} 0", | ||||
] | ] | ||||
for obj_type, short_obj_type in short_type_mapping.items(): | for obj_type, short_obj_type in short_type_mapping.items(): | ||||
p = tmp_path / obj_type | p = tmp_path / obj_type | ||||
p.mkdir() | p.mkdir() | ||||
edges = [e for e in input_edges if e.startswith(f"swh:1:{short_obj_type}")] | edges = [e for e in input_edges if e.startswith(f"swh:1:{short_obj_type}")] | ||||
zstwrite(p / "00.edges.csv.zst", edges[0::2]) | zstwrite(p / "00.edges.csv.zst", edges[0::2]) | ||||
zstwrite(p / "01.edges.csv.zst", edges[1::2]) | zstwrite(p / "01.edges.csv.zst", edges[1::2]) | ||||
nodes = [n for n in input_nodes if n.startswith(f"swh:1:{short_obj_type}")] | nodes = [n for n in input_nodes if n.startswith(f"swh:1:{short_obj_type}")] | ||||
zstwrite(p / "00.nodes.csv.zst", nodes[0::2]) | zstwrite(p / "00.nodes.csv.zst", nodes[0::2]) | ||||
zstwrite(p / "01.nodes.csv.zst", nodes[1::2]) | zstwrite(p / "01.nodes.csv.zst", nodes[1::2]) | ||||
sort_graph_nodes(tmp_path, config={"sort_buffer_size": "1M"}) | sort_graph_nodes(tmp_path, config={"sort_buffer_size": "1M"}) | ||||
output_nodes = zstread(tmp_path / "graph.nodes.csv.zst").split("\n") | output_nodes = zstread(tmp_path / "graph.nodes.csv.zst").split("\n") | ||||
output_edges = zstread(tmp_path / "graph.edges.csv.zst").split("\n") | output_edges = zstread(tmp_path / "graph.edges.csv.zst").split("\n") | ||||
output_labels = zstread(tmp_path / "graph.labels.csv.zst").split("\n") | |||||
output_nodes = list(filter(bool, output_nodes)) | output_nodes = list(filter(bool, output_nodes)) | ||||
output_edges = list(filter(bool, output_edges)) | output_edges = list(filter(bool, output_edges)) | ||||
output_labels = list(filter(bool, output_labels)) | |||||
expected_nodes = set(input_nodes) | set(e.split()[1] for e in input_edges) | expected_nodes = set(input_nodes) | set(e.split()[1] for e in input_edges) | ||||
assert output_nodes == sorted(expected_nodes) | assert output_nodes == sorted(expected_nodes) | ||||
assert int((tmp_path / "graph.nodes.count.txt").read_text()) == len(expected_nodes) | assert int((tmp_path / "graph.nodes.count.txt").read_text()) == len(expected_nodes) | ||||
assert sorted(output_edges) == sorted(input_edges) | assert sorted(output_edges) == sorted(input_edges) | ||||
assert int((tmp_path / "graph.edges.count.txt").read_text()) == len(input_edges) | assert int((tmp_path / "graph.edges.count.txt").read_text()) == len(input_edges) | ||||
expected_labels = set(e[2] for e in [e.split() for e in input_edges] if len(e) > 2) | |||||
assert output_labels == sorted(expected_labels) | |||||
actual_node_stats = (tmp_path / "graph.nodes.stats.txt").read_text().strip() | actual_node_stats = (tmp_path / "graph.nodes.stats.txt").read_text().strip() | ||||
expected_node_stats = "\n".join( | expected_node_stats = "\n".join( | ||||
sorted( | sorted( | ||||
"{} {}".format(k, v) | "{} {}".format(k, v) | ||||
for k, v in collections.Counter( | for k, v in collections.Counter( | ||||
node.split(":")[2] for node in expected_nodes | node.split(":")[2] for node in expected_nodes | ||||
).items() | ).items() | ||||
) | ) | ||||
Show All 14 Lines |