Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/__init__.py
Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines | SERVER_IMPLEMENTATIONS: Dict[str, str] = { | ||||
"postgresql": ".IndexerStorage", | "postgresql": ".IndexerStorage", | ||||
"remote": ".api.client.RemoteStorage", | "remote": ".api.client.RemoteStorage", | ||||
"memory": ".in_memory.IndexerStorage", | "memory": ".in_memory.IndexerStorage", | ||||
# deprecated | # deprecated | ||||
"local": ".IndexerStorage", | "local": ".IndexerStorage", | ||||
} | } | ||||
def sanitize_json(doc):
    """Strip NUL (``\\x00``) characters from every string reachable in *doc*.

    PostgreSQL does not accept NUL characters in ``text`` fields, so they
    are removed recursively: plain strings are cleaned in place, dicts are
    rebuilt with both keys and values sanitized, and lists/tuples are
    rebuilt as lists of sanitized items.  Non-iterable scalars (ints,
    floats, booleans, ``None``, ...) pass through unchanged.

    Raises:
        TypeError: for any other iterable type (sets, bytes, generators,
            ...), which has no well-defined JSON representation here.
    """
    if isinstance(doc, str):
        return doc.replace("\x00", "")
    if isinstance(doc, dict):
        return {
            sanitize_json(key): sanitize_json(value)
            for key, value in doc.items()
        }
    if isinstance(doc, (list, tuple)):
        return [sanitize_json(item) for item in doc]
    if hasattr(doc, "__iter__"):
        # Iterable but not a JSON-compatible container: refuse rather
        # than silently pass through something copy_to cannot serialize.
        raise TypeError(f"Unexpected object type in sanitize_json: {doc}")
    return doc
def get_indexer_storage(cls: str, **kwargs) -> IndexerStorageInterface: | def get_indexer_storage(cls: str, **kwargs) -> IndexerStorageInterface: | ||||
"""Instantiate an indexer storage implementation of class `cls` with arguments | """Instantiate an indexer storage implementation of class `cls` with arguments | ||||
`kwargs`. | `kwargs`. | ||||
Args: | Args: | ||||
cls: indexer storage class (local, remote or memory) | cls: indexer storage class (local, remote or memory) | ||||
kwargs: dictionary of arguments passed to the | kwargs: dictionary of arguments passed to the | ||||
indexer storage class constructor | indexer storage class constructor | ||||
▲ Show 20 Lines • Show All 332 Lines • ▼ Show 20 Lines | def content_metadata_add( | ||||
cur=None, | cur=None, | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
check_id_duplicates(metadata) | check_id_duplicates(metadata) | ||||
metadata.sort(key=lambda m: m.id) | metadata.sort(key=lambda m: m.id) | ||||
self.journal_writer.write_additions("content_metadata", metadata) | self.journal_writer.write_additions("content_metadata", metadata) | ||||
db.mktemp_content_metadata(cur) | db.mktemp_content_metadata(cur) | ||||
rows = [m.to_dict() for m in metadata] | |||||
for row in rows: | |||||
row["metadata"] = sanitize_json(row["metadata"]) | |||||
db.copy_to( | db.copy_to( | ||||
[m.to_dict() for m in metadata], | rows, | ||||
"tmp_content_metadata", | "tmp_content_metadata", | ||||
["id", "metadata", "indexer_configuration_id"], | ["id", "metadata", "indexer_configuration_id"], | ||||
cur, | cur, | ||||
) | ) | ||||
count = db.content_metadata_add_from_temp(cur) | count = db.content_metadata_add_from_temp(cur) | ||||
return { | return { | ||||
"content_metadata:add": count, | "content_metadata:add": count, | ||||
} | } | ||||
Show All 32 Lines | def directory_intrinsic_metadata_add( | ||||
cur=None, | cur=None, | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
check_id_duplicates(metadata) | check_id_duplicates(metadata) | ||||
metadata.sort(key=lambda m: m.id) | metadata.sort(key=lambda m: m.id) | ||||
self.journal_writer.write_additions("directory_intrinsic_metadata", metadata) | self.journal_writer.write_additions("directory_intrinsic_metadata", metadata) | ||||
db.mktemp_directory_intrinsic_metadata(cur) | db.mktemp_directory_intrinsic_metadata(cur) | ||||
rows = [m.to_dict() for m in metadata] | |||||
for row in rows: | |||||
row["metadata"] = sanitize_json(row["metadata"]) | |||||
db.copy_to( | db.copy_to( | ||||
[m.to_dict() for m in metadata], | rows, | ||||
"tmp_directory_intrinsic_metadata", | "tmp_directory_intrinsic_metadata", | ||||
["id", "metadata", "mappings", "indexer_configuration_id"], | ["id", "metadata", "mappings", "indexer_configuration_id"], | ||||
cur, | cur, | ||||
) | ) | ||||
count = db.directory_intrinsic_metadata_add_from_temp(cur) | count = db.directory_intrinsic_metadata_add_from_temp(cur) | ||||
return { | return { | ||||
"directory_intrinsic_metadata:add": count, | "directory_intrinsic_metadata:add": count, | ||||
} | } | ||||
Show All 22 Lines | def origin_intrinsic_metadata_add( | ||||
cur=None, | cur=None, | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
check_id_duplicates(metadata) | check_id_duplicates(metadata) | ||||
metadata.sort(key=lambda m: m.id) | metadata.sort(key=lambda m: m.id) | ||||
self.journal_writer.write_additions("origin_intrinsic_metadata", metadata) | self.journal_writer.write_additions("origin_intrinsic_metadata", metadata) | ||||
db.mktemp_origin_intrinsic_metadata(cur) | db.mktemp_origin_intrinsic_metadata(cur) | ||||
rows = [m.to_dict() for m in metadata] | |||||
for row in rows: | |||||
row["metadata"] = sanitize_json(row["metadata"]) | |||||
db.copy_to( | db.copy_to( | ||||
[m.to_dict() for m in metadata], | rows, | ||||
"tmp_origin_intrinsic_metadata", | "tmp_origin_intrinsic_metadata", | ||||
[ | [ | ||||
"id", | "id", | ||||
"metadata", | "metadata", | ||||
"indexer_configuration_id", | "indexer_configuration_id", | ||||
"from_directory", | "from_directory", | ||||
"mappings", | "mappings", | ||||
], | ], | ||||
▲ Show 20 Lines • Show All 120 Lines • ▼ Show 20 Lines | def origin_extrinsic_metadata_add( | ||||
cur=None, | cur=None, | ||||
) -> Dict[str, int]: | ) -> Dict[str, int]: | ||||
check_id_duplicates(metadata) | check_id_duplicates(metadata) | ||||
metadata.sort(key=lambda m: m.id) | metadata.sort(key=lambda m: m.id) | ||||
self.journal_writer.write_additions("origin_extrinsic_metadata", metadata) | self.journal_writer.write_additions("origin_extrinsic_metadata", metadata) | ||||
db.mktemp_origin_extrinsic_metadata(cur) | db.mktemp_origin_extrinsic_metadata(cur) | ||||
rows = [m.to_dict() for m in metadata] | |||||
for row in rows: | |||||
row["metadata"] = sanitize_json(row["metadata"]) | |||||
db.copy_to( | db.copy_to( | ||||
[m.to_dict() for m in metadata], | rows, | ||||
"tmp_origin_extrinsic_metadata", | "tmp_origin_extrinsic_metadata", | ||||
[ | [ | ||||
"id", | "id", | ||||
"metadata", | "metadata", | ||||
"indexer_configuration_id", | "indexer_configuration_id", | ||||
"from_remd_id", | "from_remd_id", | ||||
"mappings", | "mappings", | ||||
], | ], | ||||
▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines |
s/NUL/NULL/