diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java b/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java --- a/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ListOriginContributors.java @@ -5,8 +5,8 @@ * See top-level LICENSE file for more information */ -/* For each origin and each person, outputs a line "origin_id,person_id", - * if that person contributed to the origin. +/* For each origin and each contributor, outputs a line "origin_id,contributor_id", + * if that contributor contributed to the origin. * * A .csv table containing "origin_id,origin_url_base64" is also written * to the given path. @@ -68,7 +68,7 @@ */ HashMap pendingSuccessors = new HashMap<>(); - System.out.println("origin_id,person_id"); + System.out.println("origin_id,contributor_id"); originUrlsFileWriter.println("origin_id,origin_url_base64"); while (stdin.hasNextLine()) { String cells[] = stdin.nextLine().strip().split(",", -1); diff --git a/swh/graph/luigi/origin_contributors.py b/swh/graph/luigi/origin_contributors.py --- a/swh/graph/luigi/origin_contributors.py +++ b/swh/graph/luigi/origin_contributors.py @@ -108,8 +108,8 @@ class DeanonymizeOriginContributors(luigi.Task): """Generates a .csv.zst file similar to :class:`ListOriginContributors`'s, - but with ``person_base64`` and ``person_escaped`` columns in addition to - ``person_id``. + but with ``contributor_base64`` and ``contributor_escaped`` columns in addition to + ``contributor_id``. This assumes that :file:`graph.persons.csv.zst` is anonymized (SHA256 of names instead of names); which may not be true depending on how the swh-dataset export @@ -138,8 +138,8 @@ def output(self) -> luigi.Target: """.csv.zst file similar to :meth:`ListOriginContributors.output`'s, - but with ``person_base64`` and ``person_escaped`` columns in addition to - ``person_id``""" + but with ``contributor_base64`` and ``contributor_escaped`` columns in addition + to ``contributor_id``""" return luigi.LocalTarget(self.deanonymized_origin_contributors_path) def run(self) -> None: @@ -187,14 +187,16 @@ with pyzstd.open(tmp_output_path, "wt") as output_fd: csv_writer = csv.writer(output_fd, lineterminator="\n") # write header - csv_writer.writerow(("origin_id", "person_base64", "person_escaped")) + csv_writer.writerow( + ("origin_id", "contributor_base64", "contributor_escaped") + ) # Open input for reads as CSV with pyzstd.open(self.origin_contributors_path, "rt") as input_fd: # TODO: remove that cast once we dropped Python 3.7 support csv_reader = csv.reader(cast(Iterable[str], input_fd)) header = next(csv_reader) - assert header == ["origin_id", "person_id"], header + assert header == ["origin_id", "contributor_id"], header for (origin_id, person_id) in csv_reader: if person_id == "null": # FIXME: workaround for a bug in contribution graphs generated diff --git a/swh/graph/tests/test_origin_contributors.py b/swh/graph/tests/test_origin_contributors.py --- a/swh/graph/tests/test_origin_contributors.py +++ b/swh/graph/tests/test_origin_contributors.py @@ -30,7 +30,7 @@ # FIXME: do not hardcode ids here; they should be dynamically loaded # from the test graph ORIGIN_CONTRIBUTORS = """\ -origin_id,person_id +origin_id,contributor_id 2,0 2,2 0,0 @@ -67,7 +67,7 @@ """ DEANONYMIZED_ORIGIN_CONTRIBUTORS = """\ -origin_id,person_base64,person_escaped +origin_id,contributor_base64,contributor_escaped 2,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5jb20+,Jane Doe 2,Sm9obiBEb2UgPGpkb2VAZXhhbXBsZS5vcmc+,John Doe 0,SmFuZSBEb2UgPGpkb2VAZXhhbXBsZS5jb20+,Jane Doe @@ -139,7 +139,7 @@ ] ) - deanonymization_table_path = tmpdir / "person_sha256_to_names.csv.zst" + deanonymization_table_path = tmpdir / "contributor_sha256_to_names.csv.zst" task = ExportDeanonymizationTable( storage_dsn=swh_storage_postgresql.dsn, @@ -166,7 +166,7 @@ persons_path = tmpdir / "example.persons.csv.zst" origin_contributors_path = tmpdir / "origin_contributors.csv.zst" - deanonymization_table_path = tmpdir / "person_sha256_to_names.csv.zst" + deanonymization_table_path = tmpdir / "contributor_sha256_to_names.csv.zst" deanonymized_origin_contributors_path = ( tmpdir / "sensitive" / "origin_contributors.deanonymized.csv.zst" )