diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -49,6 +49,21 @@
 }
 
 
+def sanitize_json(doc):
+    """Recursively removes NUL characters from strings, as PostgreSQL
+    does not allow them in text fields."""
+    if isinstance(doc, str):
+        return doc.replace("\x00", "")
+    elif not hasattr(doc, "__iter__"):
+        return doc
+    elif isinstance(doc, dict):
+        return {sanitize_json(k): sanitize_json(v) for (k, v) in doc.items()}
+    elif isinstance(doc, (list, tuple)):
+        return [sanitize_json(v) for v in doc]
+    else:
+        raise TypeError(f"Unexpected object type in sanitize_json: {doc}")
+
+
 def get_indexer_storage(cls: str, **kwargs) -> IndexerStorageInterface:
     """Instantiate an indexer storage implementation of class `cls` with
     arguments `kwargs`.
@@ -397,8 +412,12 @@
 
         db.mktemp_content_metadata(cur)
 
+        rows = [m.to_dict() for m in metadata]
+        for row in rows:
+            row["metadata"] = sanitize_json(row["metadata"])
+
         db.copy_to(
-            [m.to_dict() for m in metadata],
+            rows,
             "tmp_content_metadata",
             ["id", "metadata", "indexer_configuration_id"],
             cur,
@@ -447,8 +466,12 @@
 
         db.mktemp_directory_intrinsic_metadata(cur)
 
+        rows = [m.to_dict() for m in metadata]
+        for row in rows:
+            row["metadata"] = sanitize_json(row["metadata"])
+
         db.copy_to(
-            [m.to_dict() for m in metadata],
+            rows,
             "tmp_directory_intrinsic_metadata",
             ["id", "metadata", "mappings", "indexer_configuration_id"],
             cur,
@@ -487,8 +510,12 @@
 
         db.mktemp_origin_intrinsic_metadata(cur)
 
+        rows = [m.to_dict() for m in metadata]
+        for row in rows:
+            row["metadata"] = sanitize_json(row["metadata"])
+
         db.copy_to(
-            [m.to_dict() for m in metadata],
+            rows,
             "tmp_origin_intrinsic_metadata",
             [
                 "id",
@@ -625,8 +652,12 @@
 
         db.mktemp_origin_extrinsic_metadata(cur)
 
+        rows = [m.to_dict() for m in metadata]
+        for row in rows:
+            row["metadata"] = sanitize_json(row["metadata"])
+
         db.copy_to(
-            [m.to_dict() for m in metadata],
+            rows,
             "tmp_origin_extrinsic_metadata",
             [
                 "id",
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -526,6 +526,60 @@
     ]
     row_class = ContentMetadataRow
 
+    def test_add_with_null(
+        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+    ) -> None:
+        storage, data = swh_indexer_storage_with_data
+        etype = self.endpoint_type
+        tool = data.tools[self.tool_name]
+
+        # reset the journal, as conftest pre-fills it with mimetypes
+        storage.journal_writer.journal.objects = []  # type: ignore
+
+        query = [data.sha1_2, data.sha1_1]
+        data1 = self.row_class.from_dict(
+            {
+                "id": data.sha1_2,
+                "metadata": {"description": "with\u0000nul"},
+                "indexer_configuration_id": tool["id"],
+            }
+        )
+
+        # when
+        summary = endpoint(storage, etype, "add")([data1])
+        assert summary == expected_summary(1, etype)
+
+        # then
+        actual_data = list(endpoint(storage, etype, "get")(query))
+
+        # postgresql strips the NUL; other backends store the value verbatim
+        expected_data_postgresql = [
+            self.row_class.from_dict(
+                {
+                    "id": data.sha1_2,
+                    "metadata": {"description": "withnul"},
+                    "tool": tool,
+                }
+            )
+        ]
+        expected_data_verbatim = [
+            self.row_class.from_dict(
+                {
+                    "id": data.sha1_2,
+                    "metadata": {"description": "with\u0000nul"},
+                    "tool": tool,
+                }
+            )
+        ]
+
+        assert actual_data in (expected_data_postgresql, expected_data_verbatim)
+
+        journal_objects = storage.journal_writer.journal.objects  # type: ignore
+        actual_journal_data = [
+            obj for (obj_type, obj) in journal_objects if obj_type == self.endpoint_type
+        ]
+        assert sorted(actual_journal_data) == sorted(expected_data_verbatim)
+
 
 class TestIndexerStorageDirectoryIntrinsicMetadata(StorageETypeTester):
     """Test Indexer Storage directory_intrinsic_metadata related methods"""
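
Note for reviewers: a minimal standalone sketch of what sanitize_json does to a
nested metadata document before it is handed to db.copy_to. The function body is
the one added in the diff above; the sample document and its keys are invented
for illustration only.

    def sanitize_json(doc):
        """Recursively removes NUL characters from strings, as PostgreSQL
        does not allow them in text fields."""
        if isinstance(doc, str):
            return doc.replace("\x00", "")
        elif not hasattr(doc, "__iter__"):
            return doc
        elif isinstance(doc, dict):
            return {sanitize_json(k): sanitize_json(v) for (k, v) in doc.items()}
        elif isinstance(doc, (list, tuple)):
            return [sanitize_json(v) for v in doc]
        else:
            raise TypeError(f"Unexpected object type in sanitize_json: {doc}")


    # Hypothetical metadata document, for illustration only.
    doc = {
        "description": "with\u0000nul",      # NUL inside a string value
        "keywords": ["a\x00b", "c"],         # ... and inside a list item
        "size": 42,                          # non-iterable scalars pass through
        "author": {"name": ("x\x00", "y")},  # tuples come back as lists
    }

    assert sanitize_json(doc) == {
        "description": "withnul",
        "keywords": ["ab", "c"],
        "size": 42,
        "author": {"name": ["x", "y"]},
    }

Only the "metadata" column is sanitized before the copy to PostgreSQL; the test's
final assertion checks that the journal still receives the value with the NUL
intact, which is why it compares against expected_data_verbatim rather than
expected_data_postgresql.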