Changeset View
Changeset View
Standalone View
Standalone View
swh/dataset/utils.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import sqlite3 | import sqlite3 | ||||
import subprocess | import subprocess | ||||
try: | |||||
# Plyvel shouldn't be a hard dependency if we want to use sqlite instead | |||||
import plyvel | |||||
except ImportError: | |||||
plyvel = None | |||||
class ZSTFile: | class ZSTFile: | ||||
""" | """ | ||||
Object-like wrapper around a ZST file. Uses a subprocess of the "zstd" | Object-like wrapper around a ZST file. Uses a subprocess of the "zstd" | ||||
command to compress and deflate the objects. | command to compress and deflate the objects. | ||||
""" | """ | ||||
def __init__(self, path: str, mode: str = "r"): | def __init__(self, path: str, mode: str = "r"): | ||||
def add(self, v: bytes) -> bool:
    """
    Insert the hexadecimal form of a value into the ``tmpset`` table.

    Args:
        v: The raw bytes to record; stored as ``v.hex()``.

    Returns:
        True if the INSERT succeeded, False if it raised
        ``sqlite3.IntegrityError``.
    """
    # NOTE(review): the table definition is outside this view; presumably
    # ``val`` carries a uniqueness constraint, so IntegrityError means the
    # value was already present -- confirm against the CREATE TABLE.
    hex_value = v.hex()
    try:
        self.db.execute("INSERT INTO tmpset(val) VALUES (?)", (hex_value,))
    except sqlite3.IntegrityError:
        return False
    return True
class LevelDBSet:
    """
    On-disk Set object for hashes using LevelDB as an indexer backend. Used to
    deduplicate objects when processing large queues with duplicates.

    The database is opened in ``__enter__`` and closed in ``__exit__``, so
    ``add`` is only usable inside a ``with`` block.
    """

    def __init__(self, db_path):
        """
        Args:
            db_path: Location of the LevelDB database; converted with
                ``str()`` when the database is opened.

        Raises:
            ImportError: if the optional ``plyvel`` module is unavailable.
        """
        # Check the optional dependency first so a failed construction
        # leaves no partially initialised state behind.
        if plyvel is None:
            raise ImportError("Plyvel library not found, required for LevelDBSet")
        self.db_path = db_path

    def __enter__(self):
        """Open the database (creating it if missing) and return self."""
        self.db = plyvel.DB(str(self.db_path), create_if_missing=True)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the database; exceptions from the block are not suppressed."""
        self.db.close()

    def add(self, v: bytes) -> bool:
        """
        Add an item to the set.

        Args:
            v: The value to add to the set.

        Returns:
            True if the value was added to the set, False if it was already
            present.
        """
        # Compare against None rather than relying on truthiness: a key
        # that is present with an empty value (b"") must still count as
        # present instead of being reported as new and overwritten.
        if self.db.get(v) is not None:
            return False
        self.db.put(v, b"T")
        return True
def remove_pull_requests(snapshot): | def remove_pull_requests(snapshot): | ||||
""" | """ | ||||
Heuristic to filter out pull requests in snapshots: remove all branches | Heuristic to filter out pull requests in snapshots: remove all branches | ||||
that start with refs/ but do not start with refs/heads or refs/tags. | that start with refs/ but do not start with refs/heads or refs/tags. | ||||
""" | """ | ||||
# Copy the items with list() to remove items during iteration | # Copy the items with list() to remove items during iteration | ||||
for branch_name, branch in list(snapshot["branches"].items()): | for branch_name, branch in list(snapshot["branches"].items()): | ||||
original_branch_name = branch_name | original_branch_name = branch_name | ||||
Show All 10 Lines |
Use this instead:
```
try:
    import plyvel
except ImportError:
    plyvel = None
```
It spares invoking the import machinery every time you use LevelDBSet.