Add an "id" column to the exported "origin" table.

The ORC exporter now writes each origin row as (id, url), where id is the
hexadecimal SHA-1 digest of the origin URL encoded with str.encode().
The relational schema and the ORC export test are updated to match.

NOTE(review): line breaks, indentation and blank context lines were
reconstructed from the hunk line counts; verify against the target tree
before applying.

diff --git a/swh/dataset/exporters/orc.py b/swh/dataset/exporters/orc.py
--- a/swh/dataset/exporters/orc.py
+++ b/swh/dataset/exporters/orc.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information
 
 from datetime import datetime
+import hashlib
 import logging
 import math
 from types import TracebackType
@@ -220,7 +221,12 @@
 
     def process_origin(self, origin):
         origin_writer = self.get_writer_for("origin")
-        origin_writer.write((origin["url"],))
+        origin_writer.write(
+            (
+                hashlib.sha1(origin["url"].encode()).hexdigest(),
+                origin["url"],
+            )
+        )
 
     def process_origin_visit(self, visit):
         origin_visit_writer = self.get_writer_for("origin_visit")
diff --git a/swh/dataset/relational.py b/swh/dataset/relational.py
--- a/swh/dataset/relational.py
+++ b/swh/dataset/relational.py
@@ -6,6 +6,7 @@
 # fmt: off
 MAIN_TABLES = {
     "origin": [
+        ("id", "string"),
         ("url", "string"),
     ],
     "origin_visit": [
diff --git a/swh/dataset/test/test_orc.py b/swh/dataset/test/test_orc.py
--- a/swh/dataset/test/test_orc.py
+++ b/swh/dataset/test/test_orc.py
@@ -5,6 +5,7 @@
 
 import collections
 from contextlib import contextmanager
+import hashlib
 import math
 from pathlib import Path
 import tempfile
@@ -64,7 +65,7 @@
     obj_type = "origin"
     output = exporter({obj_type: TEST_OBJECTS[obj_type]})
     for obj in TEST_OBJECTS[obj_type]:
-        assert (obj.url,) in output[obj_type]
+        assert (hashlib.sha1(obj.url.encode()).hexdigest(), obj.url) in output[obj_type]
 
 
 def test_export_origin_visit():