# File: swh/storage/vault/cache.py
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# Maps every supported bundle type to the one-character prefix that is
# prepended to an object id to build the id used by the underlying storage.
_TYPES_PREFIX = {
    'directory': 'd',
}


class VaultCache():
    """The vault cache is an object storage that stores bundles.

    The current implementation uses a PathSlicingObjStorage to store
    the bundles. The id of a content is prefixed to specify its type
    and to store different types of bundles in different folders.

    """

    def __init__(self, root):
        """Open (or create) the cache rooted at ``root``.

        Args:
            root: root directory handed to the 'pathslicing' object storage.

        """
        # Imported at call time so the class definition itself does not
        # depend on swh.objstorage being importable.
        from swh.objstorage import get_objstorage
        self.storage = get_objstorage('pathslicing', {'root': root,
                                                      'slicing': '0:2/2:6'})

    def __contains__(self, obj_id):
        """Tell whether ``obj_id`` (an already prefixed storage id) is stored.
        """
        return obj_id in self.storage

    def add(self, obj_type, obj_id, content):
        """Store ``content`` as the bundle of type ``obj_type`` for ``obj_id``.

        Returns:
            whatever the underlying storage's ``add`` returns.

        Raises:
            ValueError: if ``obj_id`` is None or ``obj_type`` is unknown.

        """
        storage_id = self._compute_id(obj_type, obj_id)
        return self.storage.add(content, obj_id=storage_id)

    def get(self, obj_type, obj_id, content=None):
        """Retrieve the bundle of type ``obj_type`` cooked for ``obj_id``.

        Args:
            obj_type: bundle type, one of the keys of ``_TYPES_PREFIX``.
            obj_id: id of the object the bundle was cooked from.
            content: unused; only kept (now optional) so that callers
                written against the previous signature keep working.

        Returns:
            whatever the underlying storage's ``get`` returns.

        Raises:
            ValueError: if ``obj_id`` is None or ``obj_type`` is unknown.

        """
        storage_id = self._compute_id(obj_type, obj_id)
        return self.storage.get(storage_id)

    def is_cached(self, obj_type, obj_id):
        """Tell whether a bundle of type ``obj_type`` exists for ``obj_id``.

        Raises:
            ValueError: if ``obj_id`` is None or ``obj_type`` is unknown.

        """
        storage_id = self._compute_id(obj_type, obj_id)
        return storage_id in self

    def _compute_id(self, obj_type, obj_id):
        """Compute an id to be used by the underlying storage.

        The id is the type prefix followed by ``obj_id``. Both ``str`` and
        ``bytes`` ids are supported; the result has the type of ``obj_id``.

        Raises:
            ValueError: if ``obj_id`` is None or ``obj_type`` is unknown.

        """
        if obj_id is None:
            raise ValueError('Object id must be specified for the Vault cache')
        try:
            prefix = _TYPES_PREFIX[obj_type]
        except KeyError:
            # Format the offending value instead of concatenating it: the
            # type may be None or any other non-string object.
            raise ValueError(
                'Invalid bundle type: %s' % (obj_type,)) from None
        if isinstance(obj_id, (bytes, bytearray)):
            # str + bytes would raise TypeError; object ids are commonly
            # raw bytes, so build a bytes id in that case.
            return prefix.encode('ascii') + bytes(obj_id)
        return prefix + obj_id


# File: swh/storage/vault/cooker.py
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this
# distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import io
import abc
import tarfile
import tempfile


# Placeholder written in place of a content whose status is 'absent'.
SKIPPED_MESSAGE = (b'This content have not been retrieved in '
                   b'Software Heritage archive due to its size')


# Placeholder written in place of a content whose status is 'hidden'.
HIDDEN_MESSAGE = (b'This content is hidden')


class BaseVaultCooker(metaclass=abc.ABCMeta):
    """Abstract base class for the vault's bundle creators.

    This class describes a common API for the cookers.

    """
    @abc.abstractmethod
    def cook(self, obj_id):
        """Cook the requested object into a bundle.

        The type of the object represented by the id depends on the
        concrete class. Very likely, each type of bundle will have its
        own cooker class.

        Args:
            obj_id: id of the object to be cooked into a bundle.

        """
        raise NotImplementedError(
            'Vault cookers must implement a `cook` method')


class DirectoryVaultCooker(BaseVaultCooker):
    """Cooker to create a directory bundle."""

    def __init__(self, storage, cache):
        """Initialize a cooker that creates directory bundles.

        Args:
            storage: source storage where contents are retrieved.
            cache: destination storage where the cooked bundles are stored.

        """
        self.storage = storage
        self.cache = cache

    def cook(self, dir_id):
        """Cook the requested directory into a bundle.

        The directory is rebuilt in a temporary location, packed into a
        tar archive, stored in the cache under the 'directory' type and
        the temporary location is removed afterwards.

        Args:
            dir_id (bytes): the id of the directory to be cooked.

        Returns:
            bytes that correspond to the bundle

        """
        # Retrieve data from the database before touching the filesystem.
        datas = list(self.storage.directory_ls(dir_id, recursive=True))
        dir_datas, file_datas = self._split_data(datas)

        # The context manager removes the scratch directory even when the
        # cooking fails half-way, so nothing is leaked on disk.
        with tempfile.TemporaryDirectory(prefix='directory.',
                                         suffix='.cook') as tmp_dir:
            # Entry names are joined to the root with os.path.join, which
            # requires both parts to be of the same type (bytes here).
            root = os.fsencode(tmp_dir)

            # Recreate the directory
            self._create_tree(root, [entry['name'] for entry in dir_datas])
            self._create_files(root, file_datas)

            # Use the created directory to get the bundle data
            bundle_content = self._create_bundle_content(root)

        self.cache.add('directory', dir_id, bundle_content)

        # Make a notification that the bundle has been cooked
        self._notify_bundle_ready(dir_id)
        return bundle_content

    def _split_data(self, datas):
        """Separate the directory entries from the file entries.

        Returns:
            a pair (directory entries, file entries); entries whose 'type'
            is neither 'dir' nor 'file' are dropped.

        """
        dir_datas = [entry for entry in datas if entry['type'] == 'dir']
        file_datas = [entry for entry in datas if entry['type'] == 'file']
        return dir_datas, file_datas

    def _create_tree(self, root, directory_paths):
        """Create a directory tree from the given paths.

        The tree is created from `root` and each given path in
        `directory_paths` will be created.

        """
        # makedirs creates the missing parents, and exist_ok makes the
        # operation order-independent and tolerant to duplicated paths.
        for dir_name in directory_paths:
            os.makedirs(os.path.join(root, dir_name), exist_ok=True)

    def _create_files(self, root, file_datas):
        """Iterate over the file entries and delegate to the right method.

        Entries with status 'absent' or 'hidden' get a placeholder file;
        any other status gets the content fetched from the storage.

        """
        for file_data in file_datas:
            path = os.path.join(root, file_data['name'])
            status = file_data['status']
            if status == 'absent':
                self._create_file_absent(path)
            elif status == 'hidden':
                self._create_file_hidden(path)
            else:
                content = list(
                    self.storage.content_get([file_data['sha1']]))[0]['data']
                self._create_file(path, content)

    def _create_file(self, path, content):
        """Create the given file and fill it with content (bytes)."""
        with open(path, 'wb') as f:
            f.write(content)

    def _create_file_absent(self, path):
        """Create a file that indicates a skipped content.

        Create the given file but fill it with a specific content to
        indicate that the content has not been retrieved by the
        software heritage archive due to its size.

        """
        self._create_file(path, SKIPPED_MESSAGE)

    def _create_file_hidden(self, path):
        """Create a file that indicates a hidden content.

        Create the given file but fill it with a specific content to
        indicate that the content could not be retrieved due to
        privacy policy.

        """
        self._create_file(path, HIDDEN_MESSAGE)

    def _create_bundle_content(self, path):
        """Create a bundle from the given directory.

        Args:
            path (bytes or str): location of the directory to package.

        Returns:
            bytes of an uncompressed tar archive whose single top-level
            entry is named after the last component of `path`.

        """
        # tarfile manipulates member names as str; a bytes arcname makes
        # TarFile.add fail, so decode the path first.
        path = os.fsdecode(path)
        tar_buffer = io.BytesIO()
        # Closing the archive writes the end-of-archive blocks; the buffer
        # must only be read once this is done.
        with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
            tar.add(path, arcname=os.path.basename(path))
        return tar_buffer.getvalue()

    def _notify_bundle_ready(self, bundle_id):
        """Hook called once the bundle for `bundle_id` is in the cache."""
        # TODO plug this method with the notification method once
        # done.
        pass