class HTTPReadOnlyObjStorage(ObjStorage):
    """Simple read-only ObjStorage retrieving objects from an HTTP server.

    Each object is fetched from ``<url>/<hexadecimal object id>``.

    For example, can be used to retrieve objects from S3:

        objstorage:
          cls: http
          url: https://softwareheritage.s3.amazonaws.com/content/

    Args:
        url: base URL under which the objects are served; a trailing ``/``
            is appended when missing.
        compression: optional key into ``decompressors`` selecting how the
            fetched payload is decompressed; a falsy value returns the
            payload unchanged.
        kwargs: forwarded unchanged to ``ObjStorage.__init__``.

    Raises:
        ValueError: if ``url`` is not provided.
    """

    def __init__(self, url=None, compression=None, **kwargs):
        super().__init__(**kwargs)
        if url is None:
            # Fail with an explicit message instead of the AttributeError
            # that ``None.endswith`` would otherwise trigger below.
            raise ValueError("HTTPReadOnlyObjStorage requires a 'url' argument")
        self.session = requests.sessions.Session()
        self.root_path = url
        if not self.root_path.endswith("/"):
            # urljoin() replaces the last path segment unless the base ends
            # with a slash, so make sure it does.
            self.root_path += "/"
        self.compression = compression

    def check_config(self, *, check_write):
        """Check the configuration for this object storage"""
        return True

    def __contains__(self, obj_id):
        """Return True if a HEAD request for ``obj_id`` answers with 200."""
        # requests does not follow redirects for HEAD by default whereas
        # get() does; follow them here so membership agrees with get().
        resp = self.session.head(self._path(obj_id), allow_redirects=True)
        return resp.status_code == 200

    def __iter__(self):
        raise exc.NonIterableObjStorage("__iter__")

    def __len__(self):
        raise exc.NonIterableObjStorage("__len__")

    def add(self, content, obj_id=None, check_presence=True):
        raise exc.ReadOnlyObjStorage("add")

    def delete(self, obj_id):
        raise exc.ReadOnlyObjStorage("delete")

    def restore(self, content, obj_id=None):
        raise exc.ReadOnlyObjStorage("restore")

    def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT):
        # Report the method that was actually called (this used to say
        # "__len__", a copy-paste leftover).
        raise exc.NonIterableObjStorage("list_content")

    def get(self, obj_id):
        """Fetch the object ``obj_id`` and return its (decompressed) bytes.

        Raises:
            exc.ObjNotFoundError: if the request fails for any reason or the
                server answers with an error status.
            exc.Error: if decompression leaves trailing data.
        """
        try:
            resp = self.session.get(self._path(obj_id))
            resp.raise_for_status()
        except Exception as err:
            # Every failure is reported as "not found" (kept for backward
            # compatibility); chain the cause so it stays visible.
            raise exc.ObjNotFoundError(obj_id) from err

        ret: bytes = resp.content
        if self.compression:
            d = decompressors[self.compression]()
            ret = d.decompress(ret)
            if d.unused_data:
                hex_obj_id = hashutil.hash_to_hex(obj_id)
                raise exc.Error("Corrupt object %s: trailing data found" % hex_obj_id)
        return ret

    def check(self, obj_id):
        """Check the content integrity of ``obj_id``.

        Raises:
            exc.Error: if the hash computed from the fetched content differs
                from ``obj_id`` (errors raised by :meth:`get` propagate too).
        """
        obj_content = self.get(obj_id)
        content_obj_id = compute_hash(obj_content)
        if content_obj_id != obj_id:
            raise exc.Error(obj_id)

    def _path(self, obj_id):
        """Return the URL of ``obj_id``: the root URL joined with its hex id."""
        return urljoin(self.root_path, hashutil.hash_to_hex(obj_id))
class ReadOnlyObjStorage(Error):
    """Raised when a mutating method is called on a read-only object storage.

    Args:
        method: name of the method that was refused.
        args: extra positional arguments stored on the exception.
    """

    def __init__(self, method, *args):
        # Keep ``method`` in ``self.args``: copy and pickle rebuild an
        # exception by calling ``cls(*self.args)``, which raises TypeError
        # when a required positional argument is missing from ``args``.
        super().__init__(method, *args)
        self.method = method

    def __str__(self):
        return "This object storage is Read-Only: cannot use %s" % self.method


class NonIterableObjStorage(Error):
    """Raised when an enumeration method is called on a non-iterable storage.

    Args:
        method: name of the method that was refused.
        args: extra positional arguments stored on the exception.
    """

    def __init__(self, method, *args):
        # Keep ``method`` in ``self.args`` so the exception can be rebuilt
        # by copy/pickle, which call ``cls(*self.args)``.
        super().__init__(method, *args)
        self.method = method

    def __str__(self):
        return "This object storage is not iterable: cannot use %s" % self.method
def get_objstorage(cls: str, args=None, **kwargs):
    """Instantiate the object storage registered under the key ``cls``.

    Args:
        cls: objstorage class unique key; deprecated keys listed in
            ``_STORAGE_CLASSES_DEPRECATED`` are mapped to their replacement
            (with a ``DeprecationWarning``) before the lookup in
            ``_STORAGE_CLASSES``.
        args: deprecated; when not None it is used *instead of* ``kwargs``
            as the keyword arguments of the storage class.
        kwargs: keyword arguments passed to the storage class; they must
            match the ones accepted by its ``__init__``.

    Returns:
        whatever the entry registered under ``cls`` returns when called
        with the keyword arguments.

    Raises:
        ValueError: if ``cls`` is not a key of ``_STORAGE_CLASSES``.
    """
    replacement = _STORAGE_CLASSES_DEPRECATED.get(cls)
    if replacement is not None:
        warnings.warn(
            f"{cls} objstorage class is deprecated, "
            f"use {replacement} class instead.",
            DeprecationWarning,
        )
        cls = replacement

    # Guard clause: bail out on unknown (or unimportable) backends first.
    if cls not in _STORAGE_CLASSES:
        reason = _STORAGE_CLASSES_MISSING.get(cls, "unknown name")
        raise ValueError("Storage class {} is not available: {}".format(cls, reason))

    if args is not None:
        warnings.warn(
            'Explicit "args" key is deprecated for objstorage initialization, '
            "use class arguments keys directly instead.",
            DeprecationWarning,
        )
        # TODO: when removing this, drop the "args" backwards compatibility
        # from swh.objstorage.api.server configuration checker
        kwargs = args

    factory = _STORAGE_CLASSES[cls]
    return factory(**kwargs)
def test_http_objstorage_check():
    """check() accepts intact objects and rejects content stored under a wrong id."""
    sto_front, sto_back, objids = build_objstorage()
    for objid in objids:
        assert sto_front.check(objid) is None  # no Exception means OK

    # create an invalid object in the in-memory objstorage
    invalid_content = b"p0wn3d content"
    # The id must be bytes, like every other object id used in these tests:
    # the mock callbacks decode the requested id with bytes.fromhex(), so a
    # str id risks failing at lookup time rather than at the integrity check.
    fake_objid = b"\x01" * 20
    id_added = sto_back.add(invalid_content, fake_objid)
    assert id_added == fake_objid

    # the http objstorage should report it as invalid
    with pytest.raises(exc.Error) as excinfo:
        sto_front.check(id_added)
    # ObjNotFoundError is a subclass of Error; make sure the failure is the
    # integrity check and not a failed fetch.
    assert not isinstance(excinfo.value, exc.ObjNotFoundError)