Changeset View
Changeset View
Standalone View
Standalone View
swh/dataset/utils.py
| # Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
| # See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
| # License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
| # See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
| import sqlite3 | import sqlite3 | ||||
| import subprocess | import subprocess | ||||
| try: | |||||
| # Plyvel shouldn't be a hard dependency if we want to use sqlite instead | |||||
| import plyvel | |||||
| except ImportError: | |||||
| plyvel = None | |||||
| class ZSTFile: | class ZSTFile: | ||||
| """ | """ | ||||
| Object-like wrapper around a ZST file. Uses a subprocess of the "zstd" | Object-like wrapper around a ZST file. Uses a subprocess of the "zstd" | ||||
| command to compress and deflate the objects. | command to compress and deflate the objects. | ||||
| """ | """ | ||||
| def __init__(self, path: str, mode: str = "r"): | def __init__(self, path: str, mode: str = "r"): | ||||
| ▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines | def add(self, v: bytes) -> bool: | ||||
| try: | try: | ||||
| self.db.execute("INSERT INTO tmpset(val) VALUES (?)", (v.hex(),)) | self.db.execute("INSERT INTO tmpset(val) VALUES (?)", (v.hex(),)) | ||||
| except sqlite3.IntegrityError: | except sqlite3.IntegrityError: | ||||
| return False | return False | ||||
| else: | else: | ||||
| return True | return True | ||||
| class LevelDBSet: | |||||
| """ | |||||
| On-disk Set object for hashes using LevelDB as an indexer backend. Used to | |||||
| deduplicate objects when processing large queues with duplicates. | |||||
| """ | |||||
| def __init__(self, db_path): | |||||
| self.db_path = db_path | |||||
| if plyvel is None: | |||||
| raise ImportError("Plyvel library not found, required for LevelDBSet") | |||||
| def __enter__(self): | |||||
vlorentz: Use this instead:
```
try:
    import plyvel
except ImportError:
    plyvel = None
```
it… | |||||
| self.db = plyvel.DB(str(self.db_path), create_if_missing=True) | |||||
| return self | |||||
| def __exit__(self, exc_type, exc_val, exc_tb): | |||||
| self.db.close() | |||||
| def add(self, v: bytes) -> bool: | |||||
| """ | |||||
| Add an item to the set. | |||||
| Args: | |||||
| v: The value to add to the set. | |||||
| Returns: | |||||
| True if the value was added to the set, False if it was already present. | |||||
| """ | |||||
| if self.db.get(v): | |||||
| return False | |||||
| else: | |||||
| self.db.put(v, b"T") | |||||
| return True | |||||
| def remove_pull_requests(snapshot): | def remove_pull_requests(snapshot): | ||||
| """ | """ | ||||
| Heuristic to filter out pull requests in snapshots: remove all branches | Heuristic to filter out pull requests in snapshots: remove all branches | ||||
| that start with refs/ but do not start with refs/heads or refs/tags. | that start with refs/ but do not start with refs/heads or refs/tags. | ||||
| """ | """ | ||||
| # Copy the items with list() to remove items during iteration | # Copy the items with list() to remove items during iteration | ||||
| for branch_name, branch in list(snapshot["branches"].items()): | for branch_name, branch in list(snapshot["branches"].items()): | ||||
| original_branch_name = branch_name | original_branch_name = branch_name | ||||
| Show All 10 Lines | |||||
Use this instead:
```
try:
    import plyvel
except ImportError:
    plyvel = None
```
it spares invoking the import machinery every time you use LevelDBSet