diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,7 +1,7 @@
 [mypy]
 namespace_packages = True
 warn_unused_ignores = True
-
+exclude = swh/objstorage/misc
 
 # 3rd party libraries without stubs (yet)
 
diff --git a/swh/objstorage/backends/http.py b/swh/objstorage/backends/http.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/backends/http.py
@@ -0,0 +1,122 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+from urllib.parse import urljoin
+
+import requests
+
+from swh.model import hashutil
+from swh.objstorage.exc import Error, ObjNotFoundError
+from swh.objstorage.objstorage import (
+    DEFAULT_LIMIT,
+    ObjStorage,
+    compute_hash,
+    decompressors,
+)
+
+LOGGER = logging.getLogger(__name__)
+LOGGER.setLevel(logging.ERROR)
+
+
+class HTTPReadOnlyObjStorage(ObjStorage):
+    """Simple ObjStorage retrieving objects from an HTTP server.
+
+    Objects are fetched from ``<url>/<hex object id>``. The storage is
+    read-only: every method that would write, delete or enumerate objects
+    raises NotImplementedError.
+
+    For example, can be used to retrieve objects from S3:
+
+    objstorage:
+      cls: http
+      url: https://softwareheritage.s3.amazonaws.com/content/
+    """
+
+    def __init__(self, url=None, compression=None, **kwargs):
+        """Create the storage.
+
+        Args:
+            url: base URL under which objects are served; required.
+            compression: optional key into ``decompressors`` naming how
+                served objects are compressed.
+            **kwargs: forwarded to the ObjStorage constructor.
+
+        Raises:
+            ValueError: if no url is given.
+        """
+        super().__init__(**kwargs)
+        if url is None:
+            # Fail with an explicit message rather than an AttributeError
+            # on None further down.
+            raise ValueError("HTTPReadOnlyObjStorage requires a 'url'")
+        self.session = requests.sessions.Session()
+        # urljoin() replaces the last path segment unless the base ends
+        # with a slash, so make sure it does.
+        self.root_path = url if url.endswith("/") else url + "/"
+        self.compression = compression
+
+    def check_config(self, *, check_write):
+        """Check the configuration for this object storage"""
+        # FIXME: hopefully this blew up during instantiation
+        return True
+
+    def __contains__(self, obj_id):
+        """Tell whether a HEAD request for the object answers 200."""
+        resp = self.session.head(self._path(obj_id))
+        return resp.status_code == 200
+
+    def __iter__(self):
+        raise NotImplementedError()
+
+    def __len__(self):
+        raise NotImplementedError()
+
+    def add(self, content, obj_id=None, check_presence=True):
+        raise NotImplementedError()
+
+    def restore(self, content, obj_id=None):
+        raise NotImplementedError()
+
+    def get(self, obj_id):
+        """Fetch an object and return its (decompressed) content.
+
+        Raises:
+            ObjNotFoundError: if the request fails for any reason.
+            Error: if data remains after the compressed stream.
+        """
+        try:
+            resp = self.session.get(self._path(obj_id))
+            resp.raise_for_status()
+        except Exception as exc:
+            # Keep the original failure attached so network errors can be
+            # told apart from a genuine 404 when debugging.
+            raise ObjNotFoundError(obj_id) from exc
+
+        ret: bytes = resp.content
+        if self.compression:
+            d = decompressors[self.compression]()
+            ret = d.decompress(ret)
+            if d.unused_data:
+                hex_obj_id = hashutil.hash_to_hex(obj_id)
+                raise Error("Corrupt object %s: trailing data found" % hex_obj_id)
+        return ret
+
+    def check(self, obj_id):
+        """Raise Error if the fetched content does not hash to obj_id."""
+        obj_content = self.get(obj_id)
+        content_obj_id = compute_hash(obj_content)
+        if content_obj_id != obj_id:
+            raise Error(obj_id)
+
+    def delete(self, obj_id):
+        raise NotImplementedError()
+
+    def list_content(self, last_obj_id=None, limit=DEFAULT_LIMIT):
+        raise NotImplementedError()
+
+    def _path(self, obj_id):
+        """Return the URL of the object identified by obj_id."""
+        return urljoin(self.root_path, hashutil.hash_to_hex(obj_id))
diff --git a/swh/objstorage/factory.py b/swh/objstorage/factory.py
--- a/swh/objstorage/factory.py
+++ b/swh/objstorage/factory.py
@@ -8,6 +8,7 @@
 from swh.objstorage.api.client import RemoteObjStorage
 from swh.objstorage.backends.generator import RandomGeneratorObjStorage
+from swh.objstorage.backends.http import HTTPReadOnlyObjStorage
 from swh.objstorage.backends.in_memory import InMemoryObjStorage
 from swh.objstorage.backends.pathslicing import PathSlicingObjStorage
 from swh.objstorage.backends.seaweed import WeedObjStorage
 
@@ -24,6 +25,7 @@
     "memory": InMemoryObjStorage,
     "weed": WeedObjStorage,
     "random": RandomGeneratorObjStorage,
+    "http": HTTPReadOnlyObjStorage,
 }
 
 _STORAGE_CLASSES_MISSING = {}
diff --git a/swh/objstorage/tests/test_objstorage_http.py b/swh/objstorage/tests/test_objstorage_http.py
new file mode 100644
--- /dev/null
+++ b/swh/objstorage/tests/test_objstorage_http.py
@@ -0,0 +1,103 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+import requests_mock
+from requests_mock.contrib import fixture
+
+from swh.objstorage import exc
+from swh.objstorage.factory import get_objstorage
+
+
+@pytest.fixture
+def build_objstorage():
+    """Provide an HTTP objstorage backed by a mocked HTTP server.
+
+    Yields a ``(sto_front, sto_back, objids)`` tuple: the HTTP objstorage
+    under test, the in-memory objstorage answering its requests, and the
+    ids of the 100 objects stored in the latter.
+    """
+    sto_back = get_objstorage(cls="memory")
+    objids = [sto_back.add(f"some content {i}".encode()) for i in range(100)]
+
+    url = "http://127.0.0.1/content/"
+    sto_front = get_objstorage(cls="http", url=url)
+
+    def get_cb(request, context):
+        dirname, basename = request.path.rsplit("/", 1)
+        objid = bytes.fromhex(basename)
+        if dirname == "/content" and objid in sto_back:
+            return sto_back.get(objid)
+        context.status_code = 404
+        return b"Not Found"
+
+    def head_cb(request, context):
+        dirname, basename = request.path.rsplit("/", 1)
+        objid = bytes.fromhex(basename)
+        if dirname != "/content" or objid not in sto_back:
+            context.status_code = 404
+            return b"Not Found"
+        return b"Found"
+
+    mock = fixture.Fixture()
+    mock.setUp()
+    try:
+        mock.register_uri(requests_mock.GET, requests_mock.ANY, content=get_cb)
+        mock.register_uri(requests_mock.HEAD, requests_mock.ANY, content=head_cb)
+        yield sto_front, sto_back, objids
+    finally:
+        # Undo the global patching of requests so it cannot leak into
+        # other tests.
+        mock.cleanUp()
+
+
+def test_http_objstorage(build_objstorage):
+    sto_front, sto_back, objids = build_objstorage
+
+    for objid in objids:
+        assert objid in sto_front
+        assert sto_front.get(objid) == sto_back.get(objid)
+        assert sto_front.get(objid).decode().startswith("some content ")
+
+
+def test_http_objstorage_missing(build_objstorage):
+    sto_front, sto_back, objids = build_objstorage
+
+    assert b"\x00" * 20 not in sto_front
+
+
+def test_http_objstorage_check(build_objstorage):
+    sto_front, sto_back, objids = build_objstorage
+    sto_back.allow_delete = True
+    for objid in objids:
+        assert sto_front.check(objid) is None  # no Exception means OK
+
+    # create an invalid object in the in-memory objstorage
+    invalid_content = b"p0wn3d content"
+    # object ids are bytes, as decoded by the mock callbacks
+    fake_objid = b"\x01" * 20
+    id_added = sto_back.add(invalid_content, fake_objid)
+    assert id_added == fake_objid
+
+    # the http objstorage should report it as invalid
+    with pytest.raises(exc.Error) as excinfo:
+        sto_front.check(id_added)
+    # ... because of its content, not because it could not be fetched
+    assert not isinstance(excinfo.value, exc.ObjNotFoundError)
+
+
+def test_http_objstorage_not_implemented(build_objstorage):
+    sto_front, sto_back, objids = build_objstorage
+
+    with pytest.raises(NotImplementedError):
+        sto_front.add(b"")
+    with pytest.raises(NotImplementedError):
+        sto_front.restore(b"")
+    with pytest.raises(NotImplementedError):
+        sto_front.delete(b"\x00" * 20)
+    with pytest.raises(NotImplementedError):
+        len(sto_front)
+    with pytest.raises(NotImplementedError):
+        iter(sto_front)