#!/usr/bin/env python3

# ======================================================================
# File: bin/swh-objstorage-azure
# ======================================================================

# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# NOT FOR PRODUCTION

import click

from swh.objstorage import get_objstorage
from swh.objstorage.cloud.objstorage_azure import AzureCloudObjStorage
from swh.core import config, hashutil


class AzureAccess(config.SWHConfig):
    """Orchestration class to try out and check the objstorage_azure
    implementation.

    It wires together two storages built from the parsed configuration:

    - ``azure_cloud_storage``: the AzureCloudObjStorage that contents
      are written to and read back from;
    - ``read_objstorage``: the objstorage (built by ``get_objstorage``
      from the ``storage`` configuration entry) contents are read from.

    """
    DEFAULT_CONFIG = {
        # Output storage
        'storage_account_name': ('str', 'account-name-as-access-key'),
        'storage_secret_key': ('str', 'associated-secret-key'),
        'container_name': ('str', 'sample-container'),

        # Input storage
        'storage': ('dict', {'cls': 'pathslicing',
                             'args': {'root': '/srv/softwareheritage/objects',
                                      'slicing': '0:2/2:4/4:6'}}),
    }

    CONFIG_BASE_FILENAME = 'objstorage/azure'

    def __init__(self):
        super().__init__()
        self.config = self.parse_config_file()

        container_name = self.config['container_name']

        self.azure_cloud_storage = AzureCloudObjStorage(
            account_name=self.config['storage_account_name'],
            api_secret_key=self.config['storage_secret_key'],
            container_name=container_name)

        self.read_objstorage = get_objstorage(**self.config['storage'])

    def _to_id(self, hex_obj_id):
        """Convert a hexadecimal identifier to the id form expected by
        the storages (via ``hashutil.hex_to_hash``).

        """
        return hashutil.hex_to_hash(hex_obj_id)

    def list_contents(self):
        """Print every entry yielded by iterating the azure storage."""
        for c in self.azure_cloud_storage:
            print(c)

    def send_one_content(self, hex_obj_id):
        """Read one content from the input storage and add it to the
        azure storage under the same id.

        """
        obj_id = self._to_id(hex_obj_id)
        obj_content = self.read_objstorage.get(obj_id)

        self.azure_cloud_storage.add(content=obj_content,
                                     obj_id=obj_id)

    def check_integrity(self, hex_obj_id):
        """Check the integrity of one content in the azure storage.

        Returns nothing; the underlying ``check`` raises on problem.

        """
        obj_id = self._to_id(hex_obj_id)
        self.azure_cloud_storage.check(obj_id)  # will raise if problem

    def check_presence(self, hex_obj_id):
        """Return whether the content is present in the azure storage."""
        obj_id = self._to_id(hex_obj_id)
        return obj_id in self.azure_cloud_storage

    def download(self, hex_obj_id):
        """Return the content stored in the azure storage for this id."""
        obj_id = self._to_id(hex_obj_id)
        return self.azure_cloud_storage.get(obj_id)


@click.command()
def tryout():
    """Exercise the azure objstorage end to end with a sample content.

    Sends one sample content, checks presence of known and unknown
    ids, downloads it back, lists the container's blobs and finally
    checks the sample's integrity.

    """
    obj_azure = AzureAccess()

    # hex_sample_id = '00000008e22217b439f3e582813bd875e7141a0e'
    hex_sample_id = '0001001d2879dd009fc11d0c5f0691940989a76b'

    check_presence = obj_azure.check_presence(hex_sample_id)
    print('presence first time should be False:', check_presence)
    obj_azure.send_one_content(hex_sample_id)

    check_presence = obj_azure.check_presence(hex_sample_id)
    print('presence True:', check_presence)
    check_presence = obj_azure.check_presence(
        'dfeffffeffff17b439f3e582813bd875e7141a0e')
    print('presence False:', check_presence)

    print()
    print('Download a blob')
    blob_content = obj_azure.download(hex_sample_id)
    print(blob_content)

    print()
    try:
        obj_azure.download(hex_sample_id.replace('0', 'f'))
    except Exception:
        # Best-effort: any failure to fetch the unknown id is the expected
        # outcome here.  `Exception` (rather than a bare `except:`) keeps
        # Ctrl-C / SystemExit from being silently swallowed.
        print('Expected `blob does not exist`!')

    print()
    print('blobs:')
    obj_azure.list_contents()

    print()
    print('content of %s' % hex_sample_id)
    print(obj_azure.download(hex_sample_id))

    # Done last, once the content has been sent and read back.
    obj_azure.check_integrity(hex_sample_id)


if __name__ == '__main__':
    tryout()


# ======================================================================
# File: swh/objstorage/cloud/objstorage_azure.py
# ======================================================================

# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import gzip
import zlib

from swh.core import hashutil
from swh.objstorage.objstorage import ObjStorage, compute_hash
from swh.objstorage.exc import ObjNotFoundError, Error

from azure.common import AzureMissingResourceHttpError
from azure.storage.blob import BlockBlobService


class AzureCloudObjStorage(ObjStorage):
    """ObjStorage with azure abilities

    Each content is stored gzip-compressed as one blob of the configured
    container, named after the hexadecimal form of its object id.

    """
    def __init__(self, account_name, api_secret_key, container_name):
        self.block_blob_service = BlockBlobService(
            account_name=account_name,
            account_key=api_secret_key)
        self.container_name = container_name

    def __contains__(self, obj_id):
        """Return whether a blob named after obj_id exists in the
        container.

        """
        hex_obj_id = hashutil.hash_to_hex(obj_id)
        return self.block_blob_service.exists(
            container_name=self.container_name,
            blob_name=hex_obj_id)

    def __iter__(self):
        """Iterate over the objects present in the storage

        Yields the blob names as listed by the container, i.e. the
        hexadecimal names used by add(), not the ids that
        __contains__/get/check take as input.

        """
        for obj in self.block_blob_service.list_blobs(self.container_name):
            yield obj.name

    def __len__(self):
        """Compute the number of objects in the current object storage.

        Returns:
            number of objects contained in the storage.

        """
        return sum(1 for i in self)

    def add(self, content, obj_id=None, check_presence=True):
        """Add an obj in storage if it's not there already.

        Args:
            content: raw bytes of the object to store.
            obj_id: id of the object; computed from content with
                compute_hash when None.
            check_presence: when true, do not send the content if a blob
                for obj_id already exists.

        Returns:
            the id of the object.

        """
        if obj_id is None:
            # Checksum is missing, compute it on the fly.
            obj_id = compute_hash(content)

        if check_presence and obj_id in self:
            return obj_id

        hex_obj_id = hashutil.hash_to_hex(obj_id)

        # Send the gzipped content
        self.block_blob_service.create_blob_from_bytes(
            container_name=self.container_name,
            blob_name=hex_obj_id,
            blob=gzip.compress(content))

        return obj_id

    def restore(self, content, obj_id=None):
        """Add the content unconditionally, overwriting any existing
        blob for that id.

        """
        return self.add(content, obj_id, check_presence=False)

    def _get_compressed(self, obj_id):
        """Fetch the stored (still gzip-compressed) bytes for obj_id.

        Raises:
            ObjNotFoundError: if no blob exists for obj_id.

        """
        hex_obj_id = hashutil.hash_to_hex(obj_id)
        try:
            blob = self.block_blob_service.get_blob_to_bytes(
                container_name=self.container_name,
                blob_name=hex_obj_id)
        except AzureMissingResourceHttpError as e:
            # The Azure SDK signals a missing blob by raising rather than
            # by returning a falsy value; translate it to the storage's
            # own not-found error.
            raise ObjNotFoundError(
                'Content %s not found!' % hex_obj_id) from e

        if not blob:
            raise ObjNotFoundError('Content %s not found!' % hex_obj_id)

        return blob.content

    def get(self, obj_id):
        """Return the decompressed content stored for obj_id.

        Raises:
            ObjNotFoundError: if no blob exists for obj_id.

        """
        return gzip.decompress(self._get_compressed(obj_id))

    def check(self, obj_id):
        """Check the content integrity.

        The stored blob must be valid gzip data and the hash of its
        decompressed content (compute_hash) must equal obj_id.

        Raises:
            ObjNotFoundError: if no blob exists for obj_id.
            Error: if the blob cannot be decompressed or its hash does
                not match obj_id.

        """
        compressed = self._get_compressed(obj_id)
        try:
            obj_content = gzip.decompress(compressed)
        except (OSError, EOFError, zlib.error) as e:
            # Only the decompression is guarded, so a transport failure
            # while fetching is never misreported as a corrupt content.
            raise Error(obj_id) from e

        content_obj_id = compute_hash(obj_content)
        if content_obj_id != obj_id:
            raise Error(obj_id)