# ---------------------------------------------------------------------------
# File: swh/storage/vault/cache.py
# ---------------------------------------------------------------------------
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import pickle

from swh.core import hashutil
from swh.objstorage import ObjStorage, get_objstorage
from swh.objstorage.objstorage_pathslicing import DIR_MODE

# Maps each supported bundle type to the sub-directory (below the cache
# root) in which bundles of that type are stored.
BUNDLE_TYPES = {
    'directory': 'd',
}


class TempStorage(ObjStorage):
    """Object storage backed by a pickled dict in the file `storage.dump`.

    The dict is reloaded from disk before every read and written back
    after every `add`. The dump path is relative, so it is resolved
    against the current working directory.

    NOTE(review): this looks like a development/debugging stand-in for a
    real object storage (VaultCache only references it in a comment).
    `pickle` must never be pointed at a dump file coming from an
    untrusted source.

    """

    def __init__(self):
        try:
            self._load()
        except Exception:
            # Best effort: a missing or unreadable dump means we start
            # from an empty storage. `Exception` (rather than a bare
            # except) lets KeyboardInterrupt/SystemExit propagate.
            self.o = {}
            self._save()

    def _load(self):
        """Replace the in-memory dict with the content of the dump file."""
        with open('storage.dump', 'rb') as f:
            self.o = pickle.load(f)

    def _save(self):
        """Write the in-memory dict to the dump file."""
        with open('storage.dump', 'wb') as f:
            pickle.dump(self.o, f)

    def __str__(self):
        self._load()
        return str(self.o)

    def __repr__(self):
        self._load()
        return repr(self.o)

    def __iter__(self):
        """Iterate over the ids currently present in the dump file."""
        self._load()
        yield from self.o

    def __contains__(self, id):
        self._load()
        return id in self.o

    def add(self, content, id):
        """Store `content` under `id`, persist the dict and return `id`."""
        self._load()
        self.o[id] = content
        self._save()
        return id

    def get(self, id):
        """Return the content stored under `id`.

        Raises:
            KeyError: if `id` is not in the storage.

        """
        self._load()
        return self.o[id]

    def check(self, id):
        """Integrity check placeholder: always reports success."""
        return True


class VaultCache():
    """The vault cache is an object storage that stores bundles

    The current implementation uses one PathSlicingObjStorage per bundle
    type. Each of them lives in its own sub-directory of the cache root
    (see BUNDLE_TYPES), so that the different types of bundle are stored
    in different folders.

    """

    def __init__(self, root):
        """Create the per-type directories and storages below `root`.

        Args:
            root: path of the directory that holds the whole cache.

        """
        self.storages = {}
        for bundle_type, subdir in BUNDLE_TYPES.items():
            storage_root = os.path.join(root, subdir)
            if not os.path.isdir(storage_root):
                os.makedirs(storage_root, DIR_MODE, exist_ok=True)
            self.storages[bundle_type] = get_objstorage(
                'pathslicing', {'root': storage_root,
                                'slicing': '0:1/0:5'}
            )
        # For local experiments the storages can be swapped for
        # TempStorage instances:
        #     {bundle_type: TempStorage() for bundle_type in BUNDLE_TYPES}

    def __contains__(self, obj_id):
        """Tell whether a bundle with this id is cached, whatever its type.

        Args:
            obj_id: hexadecimal id of the object, as in `is_cached`.

        """
        hash_id = hashutil.hex_to_hash(obj_id)
        return any(hash_id in storage for storage in self.storages.values())

    def add(self, obj_type, obj_id, content):
        """Store `content` as the bundle of type `obj_type` for `obj_id`.

        Returns:
            whatever the underlying storage's `add` returns.

        Raises:
            ValueError: if `obj_type` is not a known bundle type.

        """
        storage = self._get_storage(obj_type)
        return storage.add(content, hashutil.hex_to_hash(obj_id))

    def get(self, obj_type, obj_id):
        """Return the cached bundle of type `obj_type` for `obj_id`.

        Raises:
            ValueError: if `obj_type` is not a known bundle type.

        """
        storage = self._get_storage(obj_type)
        return storage.get(hashutil.hex_to_hash(obj_id))

    def is_cached(self, obj_type, obj_id):
        """Tell whether a bundle of type `obj_type` exists for `obj_id`.

        Raises:
            ValueError: if `obj_type` is not a known bundle type.

        """
        storage = self._get_storage(obj_type)
        return hashutil.hex_to_hash(obj_id) in storage

    def ls(self, obj_type):
        """Iterate over the ids of the cached bundles of type `obj_type`.

        Raises:
            ValueError: if `obj_type` is not a known bundle type (raised
                when the iteration starts, since this is a generator).

        """
        storage = self._get_storage(obj_type)
        yield from storage

    def _get_storage(self, obj_type):
        """Get the storage that corresponds to the object type

        Raises:
            ValueError: if `obj_type` is not a known bundle type.

        """
        try:
            return self.storages[obj_type]
        except (KeyError, TypeError):
            # TypeError covers unhashable values; formatting (instead of
            # concatenating) keeps the error a ValueError for non-str
            # types as well.
            raise ValueError('Wrong bundle type: {}'.format(obj_type))


# ---------------------------------------------------------------------------
# File: swh/storage/vault/cooker.py
# ---------------------------------------------------------------------------
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import abc
import io
import os
import tarfile
import tempfile

# Placeholder written in place of a file whose status is 'absent'.
SKIPPED_MESSAGE = (b'This content have not been retrieved in '
                   b'Software Heritage archive due to its size')


# Placeholder written in place of a file whose status is 'hidden'.
HIDDEN_MESSAGE = (b'This content is hidden')


class BaseVaultCooker(metaclass=abc.ABCMeta):
    """Abstract base class for the vault's bundle creators

    This class describes a common API for the cookers.

    """
    @abc.abstractmethod
    def cook(self, obj_id):
        """Cook the requested object into a bundle

        The type of the object represented by the id depends on the
        concrete class. Very likely, each type of bundle will have its
        own cooker class.

        Args:
            obj_id: id of the object to be cooked into a bundle.

        """
        raise NotImplementedError(
            'Vault cookers must implements a `cook` method')


class DirectoryVaultCooker(BaseVaultCooker):
    """Cooker to create a directory bundle """

    def __init__(self, storage, cache):
        """Initialize a cooker that creates directory bundles

        Args:
            storage: source storage where contents are retrieved.
            cache: destination storage where the cooked bundles are stored.

        """
        self.storage = storage
        self.cache = cache

    def cook(self, dir_id):
        """Cook the requested directory into a bundle

        The directory is rebuilt in a temporary folder, packed as a tar
        archive, stored in the cache under the 'directory' type, and
        finally a notification hook is called. The temporary folder is
        removed once the archive has been built, even on failure.

        Args:
            dir_id: the id of the directory to be cooked.

        """
        with tempfile.TemporaryDirectory(prefix='directory.',
                                         suffix='.cook') as tmp_dir:
            # Entry names are joined to `root` below, so it has to be
            # bytes. (presumably the storage returns names as bytes —
            # TODO confirm against directory_ls)
            root = bytes(tmp_dir, 'utf8')

            # Retrieve data from the database
            entries = list(self.storage.directory_ls(dir_id, recursive=True))
            dir_datas, file_datas = self._split_data(entries)

            # Recreate the directory
            self._create_tree(root, (entry['name'] for entry in dir_datas))
            self._create_files(root, file_datas)

            # Use the created directory to get the bundle data; it is
            # returned as bytes so it outlives the temporary folder.
            bundle_content = self._create_bundle_content(root)

        self.cache.add('directory', dir_id, bundle_content)

        # Make a notification that the bundle has been cooked
        self._notify_bundle_ready(dir_id)

    def _split_data(self, datas):
        """Separate the directory entries from the file entries.

        Entries whose 'type' is neither 'dir' nor 'file' are dropped.

        Returns:
            a pair (directory entries, file entries).

        """
        dir_datas = [entry for entry in datas if entry['type'] == 'dir']
        file_datas = [entry for entry in datas if entry['type'] == 'file']
        return dir_datas, file_datas

    def _create_tree(self, root, directory_paths):
        """Create a directory tree from the given paths

        The tree is created from `root` and each given path in
        `directory_paths` will be created.

        Args:
            root (bytes): directory below which the tree is created.
            directory_paths: iterable of bytes paths relative to `root`.

        """
        # Directories are sorted by depth so they are created in the
        # right order
        bsep = bytes(os.path.sep, 'utf8')
        dir_names = sorted(
            directory_paths,
            key=lambda x: len(x.split(bsep))
        )
        for dir_name in dir_names:
            # exist_ok: a path listed twice must not abort the cooking.
            os.makedirs(os.path.join(root, dir_name), exist_ok=True)

    def _create_files(self, root, file_datas):
        """Iterate over the file entries and delegate to the right method.

        'absent' and 'hidden' entries get a placeholder file; any other
        status gets the real content, fetched from the storage by sha1.

        """
        for file_data in file_datas:
            path = os.path.join(root, file_data['name'])
            status = file_data['status']
            if status == 'absent':
                self._create_file_absent(path)
            elif status == 'hidden':
                self._create_file_hidden(path)
            else:
                content = list(
                    self.storage.content_get([file_data['sha1']]))[0]['data']
                self._create_file(path, content)

    def _create_file(self, path, content):
        """Create the given file and fill it with content."""
        with open(path, 'wb') as f:
            f.write(content)

    def _create_file_absent(self, path):
        """Create a file that indicates a skipped content

        Create the given file but fill it with a specific content to
        indicate that the content has not been retrieved by the
        software heritage archive due to its size.

        """
        self._create_file(path, SKIPPED_MESSAGE)

    def _create_file_hidden(self, path):
        """Create a file that indicates a hidden content

        Create the given file but fill it with a specific content to
        indicate that the content could not be retrieved due to
        privacy policy.

        """
        self._create_file(path, HIDDEN_MESSAGE)

    def _create_bundle_content(self, path):
        """Create a bundle from the given directory

        Args:
            path (str or bytes): location of the directory to package.

        Returns:
            bytes: the uncompressed tar archive of the directory. Inside
            the archive, the top-level entry is named after the last
            component of `path`.

        """
        # tarfile manipulates member names as str, so a bytes path has to
        # be decoded first.
        path = os.fsdecode(path)
        tar_buffer = io.BytesIO()
        # Leaving the `with` block closes the archive, which writes the
        # end-of-archive blocks; the underlying buffer stays open.
        with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
            tar.add(path, arcname=os.path.basename(path))
        return tar_buffer.getvalue()

    def _notify_bundle_ready(self, bundle_id):
        """Hook called once the bundle is in the cache; currently a no-op."""
        # TODO plug this method with the notification method once
        # done.
        pass