Changeset View
Changeset View
Standalone View
Standalone View
swh/objstorage/interface.py
# Copyright (C) 2015-2022 The Software Heritage developers | # Copyright (C) 2015-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from typing import Dict | from typing import Dict, Iterable, Iterator, List, Optional | ||||
from typing_extensions import Protocol, runtime_checkable | from typing_extensions import Protocol, runtime_checkable | ||||
from swh.core.api import remote_api_endpoint | from swh.core.api import remote_api_endpoint | ||||
from swh.objstorage.objstorage import DEFAULT_LIMIT | from swh.objstorage.constants import DEFAULT_LIMIT | ||||
ObjId = bytes | |||||
"""Type of object ids, which should be a sha1 hash.""" | |||||
@runtime_checkable | @runtime_checkable | ||||
class ObjStorageInterface(Protocol): | class ObjStorageInterface(Protocol): | ||||
"""High-level API to manipulate the Software Heritage object storage. | """High-level API to manipulate the Software Heritage object storage. | ||||
Conceptually, the object storage offers the following methods: | Conceptually, the object storage offers the following methods: | ||||
Show All 23 Lines | def check_config(self, *, check_write): | ||||
can succeed. | can succeed. | ||||
Returns: | Returns: | ||||
True if the configuration check worked, an exception if it didn't. | True if the configuration check worked, an exception if it didn't. | ||||
""" | """ | ||||
... | ... | ||||
@remote_api_endpoint("content/contains") | @remote_api_endpoint("content/contains") | ||||
def __contains__(self, obj_id): | def __contains__(self, obj_id: ObjId) -> bool: | ||||
"""Indicate if the given object is present in the storage. | """Indicate if the given object is present in the storage. | ||||
Args: | Args: | ||||
obj_id (bytes): object identifier. | obj_id: object identifier. | ||||
Returns: | Returns: | ||||
True if and only if the object is present in the current object | True if and only if the object is present in the current object | ||||
storage. | storage. | ||||
""" | """ | ||||
... | ... | ||||
@remote_api_endpoint("content/add") | @remote_api_endpoint("content/add") | ||||
def add(self, content, obj_id, check_presence=True): | def add(self, content: bytes, obj_id: ObjId, check_presence: bool = True) -> ObjId: | ||||
"""Add a new object to the object storage. | """Add a new object to the object storage. | ||||
Args: | Args: | ||||
content (bytes): object's raw content to add in storage. | content: object's raw content to add in storage. | ||||
obj_id (bytes): checksum of [bytes] using [ID_HASH_ALGO] | obj_id: checksum of [bytes] using [ID_HASH_ALGO] | ||||
algorithm. It is trusted to match the bytes. | algorithm. It is trusted to match the bytes. | ||||
check_presence (bool): indicate if the presence of the | check_presence (bool): indicate if the presence of the | ||||
content should be verified before adding the file. | content should be verified before adding the file. | ||||
Returns: | Returns: | ||||
the id (bytes) of the object into the storage. | the id (bytes) of the object into the storage. | ||||
""" | """ | ||||
... | ... | ||||
@remote_api_endpoint("content/add/batch") | @remote_api_endpoint("content/add/batch") | ||||
def add_batch(self, contents, check_presence=True) -> Dict: | def add_batch(self, contents, check_presence=True) -> Dict: | ||||
"""Add a batch of new objects to the object storage. | """Add a batch of new objects to the object storage. | ||||
Args: | Args: | ||||
contents: mapping from obj_id to object contents | contents: mapping from obj_id to object contents | ||||
Returns: | Returns: | ||||
the summary of objects added to the storage (count of objects, | the summary of objects added to the storage (count of objects, | ||||
count of bytes) | count of bytes) | ||||
""" | """ | ||||
... | ... | ||||
def restore(self, content, obj_id): | def restore(self, content: bytes, obj_id: ObjId): | ||||
"""Restore a content that has been corrupted. | """Restore a content that has been corrupted. | ||||
This function is identical to add but does not check if | This function is identical to add but does not check if | ||||
the object id is already in the file system. | the object id is already in the file system. | ||||
The default implementation provided by the current class is | The default implementation provided by the current class is | ||||
suitable for most cases. | suitable for most cases. | ||||
Args: | Args: | ||||
content (bytes): object's raw content to add in storage | content: object's raw content to add in storage | ||||
obj_id (bytes): checksum of `bytes` as computed by | obj_id: dict of hashes of the content (or only the sha1, for legacy clients) | ||||
ID_HASH_ALGO. When given, obj_id will be trusted to | |||||
match bytes. If missing, obj_id will be computed on | |||||
the fly. | |||||
""" | """ | ||||
... | ... | ||||
@remote_api_endpoint("content/get") | @remote_api_endpoint("content/get") | ||||
def get(self, obj_id): | def get(self, obj_id: ObjId) -> bytes: | ||||
"""Retrieve the content of a given object. | """Retrieve the content of a given object. | ||||
Args: | Args: | ||||
obj_id (bytes): object id. | obj_id: object id. | ||||
Returns: | Returns: | ||||
the content of the requested object as bytes. | the content of the requested object as bytes. | ||||
Raises: | Raises: | ||||
ObjNotFoundError: if the requested object is missing. | ObjNotFoundError: if the requested object is missing. | ||||
""" | """ | ||||
... | ... | ||||
@remote_api_endpoint("content/get/batch") | @remote_api_endpoint("content/get/batch") | ||||
def get_batch(self, obj_ids): | def get_batch(self, obj_ids: List[ObjId]) -> Iterator[Optional[bytes]]: | ||||
"""Retrieve objects' raw content in bulk from storage. | """Retrieve objects' raw content in bulk from storage. | ||||
Note: This function does have a default implementation in | Note: This function does have a default implementation in | ||||
ObjStorage that is suitable for most cases. | ObjStorage that is suitable for most cases. | ||||
For object storages that need to do the minimal number of | For object storages that need to do the minimal number of | ||||
requests possible (ex: remote object storages), that method | requests possible (ex: remote object storages), that method | ||||
can be overridden to perform a more efficient operation. | can be overridden to perform a more efficient operation. | ||||
Args: | Args: | ||||
obj_ids ([bytes]): list of object ids. | obj_ids: list of object ids. | ||||
Returns: | Returns: | ||||
list of resulting contents, or None if the content could | list of resulting contents, or None if the content could | ||||
not be retrieved. No exception is raised, so that a failure on | not be retrieved. No exception is raised, so that a failure on | ||||
one content does not cancel the whole request. | one content does not cancel the whole request. | ||||
""" | """ | ||||
... | ... | ||||
@remote_api_endpoint("content/check") | @remote_api_endpoint("content/check") | ||||
def check(self, obj_id): | def check(self, obj_id: ObjId) -> None: | ||||
"""Perform an integrity check for a given object. | """Perform an integrity check for a given object. | ||||
Verify that the file object is in place and that the content matches | Verify that the file object is in place and that the content matches | ||||
the object id. | the object id. | ||||
Args: | Args: | ||||
obj_id (bytes): object identifier. | obj_id: object identifier. | ||||
Raises: | Raises: | ||||
ObjNotFoundError: if the requested object is missing. | ObjNotFoundError: if the requested object is missing. | ||||
Error: if the requested object is corrupted. | Error: if the requested object is corrupted. | ||||
""" | """ | ||||
... | ... | ||||
@remote_api_endpoint("content/delete") | @remote_api_endpoint("content/delete") | ||||
def delete(self, obj_id): | def delete(self, obj_id: ObjId): | ||||
"""Delete an object. | """Delete an object. | ||||
Args: | Args: | ||||
obj_id (bytes): object identifier. | obj_id: object identifier. | ||||
Raises: | Raises: | ||||
ObjNotFoundError: if the requested object is missing. | ObjNotFoundError: if the requested object is missing. | ||||
""" | """ | ||||
... | ... | ||||
# Management methods | # Management methods | ||||
@remote_api_endpoint("content/get/random") | @remote_api_endpoint("content/get/random") | ||||
def get_random(self, batch_size): | def get_random(self, batch_size: int) -> Iterable[ObjId]: | ||||
"""Get random ids of existing contents. | """Get random ids of existing contents. | ||||
This method is used in order to get random ids to perform | This method is used in order to get random ids to perform | ||||
content integrity verifications on random contents. | content integrity verifications on random contents. | ||||
Args: | Args: | ||||
batch_size (int): Number of ids that will be given | batch_size: Number of ids that will be given | ||||
Yields: | Yields: | ||||
An iterable of ids (bytes) of contents that are in the | ids of contents that are in the current object storage. | ||||
current object storage. | |||||
""" | """ | ||||
... | ... | ||||
def __iter__(self): | def __iter__(self) -> Iterator[ObjId]: | ||||
... | ... | ||||
def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT): | def list_content( | ||||
self, last_obj_id: Optional[ObjId] = None, limit: int = DEFAULT_LIMIT | |||||
) -> Iterator[ObjId]: | |||||
"""Generates known object ids. | """Generates known object ids. | ||||
Args: | Args: | ||||
last_obj_id (bytes): object id from which to iterate | last_obj_id: object id from which to iterate | ||||
(excluded). | (excluded). | ||||
limit (int): max number of object ids to generate. | limit (int): max number of object ids to generate. | ||||
Generates: | Generates: | ||||
obj_id (bytes): object ids. | obj_id: object ids. | ||||
""" | """ | ||||
... | ... |