diff --git a/swh/objstorage/backends/http.py b/swh/objstorage/backends/http.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/backends/http.py
@@ -0,0 +1,125 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+from urllib.parse import urljoin
+
+import requests
+
+from swh.model import hashutil
+from swh.objstorage import exc
+from swh.objstorage.objstorage import (
+    DEFAULT_LIMIT,
+    ObjStorage,
+    compute_hash,
+    decompressors,
+)
+
+LOGGER = logging.getLogger(__name__)
+LOGGER.setLevel(logging.ERROR)
+
+
+class HTTPReadOnlyObjStorage(ObjStorage):
+    """Simple ObjStorage retrieving objects from an HTTP server.
+
+    Each object is fetched from the base URL joined with the hexadecimal form
+    of its id. Write and listing methods raise instead of doing any work.
+
+    For example, can be used to retrieve objects from S3:
+
+    objstorage:
+      cls: http
+      url: https://softwareheritage.s3.amazonaws.com/content/
+    """
+
+    def __init__(self, url=None, compression=None, **kwargs):
+        """Set up the HTTP session and the base URL.
+
+        Args:
+            url: base URL the object ids are appended to (required).
+            compression: key into ``decompressors`` used to decode fetched
+                payloads; a falsy value returns them unchanged.
+            **kwargs: forwarded to the parent ``ObjStorage`` constructor.
+
+        Raises:
+            ValueError: if ``url`` is not provided.
+        """
+        super().__init__(**kwargs)
+        if url is None:
+            # Fail early with an explicit message: the base URL is mandatory.
+            raise ValueError("HTTPReadOnlyObjStorage requires a 'url'")
+        self.session = requests.sessions.Session()
+        # urljoin() drops the last path segment of a base lacking a final "/"
+        self.root_path = url if url.endswith("/") else url + "/"
+        self.compression = compression
+
+    def check_config(self, *, check_write):
+        """Check the configuration for this object storage"""
+        return True
+
+    def __contains__(self, obj_id):
+        """Tell whether a HEAD request on the object URL answers 200."""
+        resp = self.session.head(self._path(obj_id))
+        return resp.status_code == 200
+
+    def __iter__(self):
+        raise exc.NonIterableObjStorage("__iter__")
+
+    def __len__(self):
+        raise exc.NonIterableObjStorage("__len__")
+
+    def add(self, content, obj_id=None, check_presence=True):
+        raise exc.ReadOnlyObjStorage("add")
+
+    def delete(self, obj_id):
+        raise exc.ReadOnlyObjStorage("delete")
+
+    def restore(self, content, obj_id=None):
+        raise exc.ReadOnlyObjStorage("restore")
+
+    def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT):
+        raise exc.NonIterableObjStorage("list_content")
+
+    def get(self, obj_id):
+        """Fetch an object over HTTP and return its content.
+
+        The payload is decompressed first when ``compression`` is set.
+
+        Raises:
+            exc.ObjNotFoundError: if the request fails for any reason.
+            exc.Error: if data remains after the compressed payload.
+        """
+        try:
+            resp = self.session.get(self._path(obj_id))
+            resp.raise_for_status()
+        except Exception as err:
+            # Every failure is reported as a missing object; chaining keeps
+            # the underlying network or server error visible in tracebacks.
+            raise exc.ObjNotFoundError(obj_id) from err
+
+        ret: bytes = resp.content
+        if self.compression:
+            d = decompressors[self.compression]()
+            ret = d.decompress(ret)
+            if d.unused_data:
+                hex_obj_id = hashutil.hash_to_hex(obj_id)
+                raise exc.Error("Corrupt object %s: trailing data found" % hex_obj_id)
+        return ret
+
+    def check(self, obj_id):
+        """Check the content integrity.
+
+        Raises:
+            exc.Error: if the hash computed from the fetched content differs
+                from ``obj_id``.
+        """
+        obj_content = self.get(obj_id)
+        content_obj_id = compute_hash(obj_content)
+        if content_obj_id != obj_id:
+            raise exc.Error(obj_id)
+
+    def _path(self, obj_id):
+        """Return the object URL: base URL joined with the hex-encoded id."""
+        return urljoin(self.root_path, hashutil.hash_to_hex(obj_id))
diff --git a/swh/objstorage/exc.py b/swh/objstorage/exc.py
--- a/swh/objstorage/exc.py
+++ b/swh/objstorage/exc.py
@@ -21,3 +21,25 @@
     def __str__(self):
         args = self.args
         return "An unexpected error occurred in the api backend: %s" % args
+
+
+class ReadOnlyObjStorage(Error):
+    """Raised when a write method is called on a read-only object storage."""
+
+    def __init__(self, method, *args):
+        super().__init__(*args)
+        self.method = method
+
+    def __str__(self):
+        return "This object storage is Read-Only: cannot use %s" % self.method
+
+
+class NonIterableObjStorage(Error):
+    """Raised when a listing method is called on a non-iterable storage."""
+
+    def __init__(self, method, *args):
+        super().__init__(*args)
+        self.method = method
+
+    def __str__(self):
+        return "This object storage is not iterable: cannot use %s" % self.method
diff --git a/swh/objstorage/factory.py b/swh/objstorage/factory.py
--- a/swh/objstorage/factory.py
+++ b/swh/objstorage/factory.py
@@ -8,6 +8,7 @@
 
 from swh.objstorage.api.client import RemoteObjStorage
 from swh.objstorage.backends.generator import RandomGeneratorObjStorage
+from swh.objstorage.backends.http import HTTPReadOnlyObjStorage
 from swh.objstorage.backends.in_memory import InMemoryObjStorage
 from swh.objstorage.backends.pathslicing import PathSlicingObjStorage
 from swh.objstorage.backends.seaweedfs import SeaweedFilerObjStorage
@@ -24,6 +25,7 @@
     "memory": InMemoryObjStorage,
     "seaweedfs": 
SeaweedFilerObjStorage,
     "random": RandomGeneratorObjStorage,
+    "http": HTTPReadOnlyObjStorage,
 }
 
 _STORAGE_CLASSES_MISSING = {}
diff --git a/swh/objstorage/tests/test_objstorage_http.py b/swh/objstorage/tests/test_objstorage_http.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/tests/test_objstorage_http.py
@@ -0,0 +1,117 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+import requests_mock
+from requests_mock.contrib import fixture
+
+from swh.objstorage import exc
+from swh.objstorage.factory import get_objstorage
+
+
+def build_objstorage():
+    """Build an HTTPReadOnlyObjStorage suitable for tests
+
+    This instantiates 2 ObjStorage, one HTTPReadOnlyObjStorage (the "front" one
+    being under test), and one InMemoryObjStorage (which actually stores the
+    test content), and installs a request mock fixture to route HTTP requests
+    from the HTTPReadOnlyObjStorage to query the InMemoryObjStorage.
+
+    Also fills the backend storage with 100 objects.
+
+    Returns:
+        a ``(sto_front, sto_back, objids)`` tuple.
+    """
+    sto_back = get_objstorage(cls="memory")
+    objids = [sto_back.add(f"some content {i}".encode()) for i in range(100)]
+
+    url = "http://127.0.0.1/content/"
+    sto_front = get_objstorage(cls="http", url=url)
+    # NOTE(review): the mock fixture is never cleaned up, so it stays
+    # installed after the calling test returns.
+    mock = fixture.Fixture()
+    mock.setUp()
+
+    def get_cb(request, context):
+        dirname, basename = request.path.rsplit("/", 1)
+        objid = bytes.fromhex(basename)
+        if dirname == "/content" and objid in sto_back:
+            return sto_back.get(objid)
+        context.status_code = 404
+        return b"Not Found"
+
+    def head_cb(request, context):
+        dirname, basename = request.path.rsplit("/", 1)
+        objid = bytes.fromhex(basename)
+        if dirname != "/content" or objid not in sto_back:
+            context.status_code = 404
+            return b"Not Found"
+        return b"Found"
+
+    mock.register_uri(requests_mock.GET, requests_mock.ANY, content=get_cb)
+    mock.register_uri(requests_mock.HEAD, requests_mock.ANY, content=head_cb)
+
+    return sto_front, sto_back, objids
+
+
+def test_http_objstorage():
+    sto_front, sto_back, objids = build_objstorage()
+
+    for objid in objids:
+        assert objid in sto_front
+        assert sto_front.get(objid) == sto_back.get(objid)
+        assert sto_front.get(objid).decode().startswith("some content ")
+
+
+def test_http_objstorage_missing():
+    sto_front, sto_back, objids = build_objstorage()
+
+    assert b"\x00" * 20 not in sto_front
+
+
+def test_http_objstorage_get_missing():
+    sto_front, sto_back, objids = build_objstorage()
+
+    with pytest.raises(exc.ObjNotFoundError):
+        sto_front.get(b"\x00" * 20)
+
+
+def test_http_objstorage_check():
+    sto_front, sto_back, objids = build_objstorage()
+    for objid in objids:
+        assert sto_front.check(objid) is None  # no Exception means OK
+
+    # create an invalid object in the in-memory objstorage; object ids are
+    # bytes, like every other id used in this module
+    invalid_content = b"p0wn3d content"
+    fake_objid = b"\x01" * 20
+    id_added = sto_back.add(invalid_content, fake_objid)
+    assert id_added == fake_objid
+
+    # the http objstorage should report it as invalid
+    with pytest.raises(exc.Error) as excinfo:
+        sto_front.check(id_added)
+    # ... and the failure must be an integrity error, not a failed lookup
+    assert not isinstance(excinfo.value, exc.ObjNotFoundError)
+
+
+def test_http_objstorage_read_only():
+    """add(), restore() and delete() are all refused by the backend."""
+    front, _back, _ids = build_objstorage()
+    with pytest.raises(exc.ReadOnlyObjStorage):
+        front.add(b"")
+    with pytest.raises(exc.ReadOnlyObjStorage):
+        front.restore(b"")
+    with pytest.raises(exc.ReadOnlyObjStorage):
+        front.delete(b"\x00" * 20)
+
+
+def test_http_objstorage_not_iterable():
+    """len() and iter() are both refused by the backend."""
+    front, _back, _ids = build_objstorage()
+    with pytest.raises(exc.NonIterableObjStorage):
+        len(front)
+    with pytest.raises(exc.NonIterableObjStorage):
+        iter(front)