Page MenuHomeSoftware Heritage

D102.diff
No OneTemporary

D102.diff

diff --git a/swh/storage/vault/cache.py b/swh/storage/vault/cache.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/vault/cache.py
@@ -0,0 +1,64 @@
+# Copyright (C) 2016 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+
+from swh.core import hashutil
+from swh.objstorage import get_objstorage
+from swh.objstorage.objstorage_pathslicing import DIR_MODE
+
+BUNDLE_TYPES = {
+ 'directory': 'd',
+}
+
+
+class VaultCache():
+ """The vault cache is an object storage that stores bundles
+
+ The current implementation uses a PathSlicingObjStorage to store
+ the bundles. The id of a content if prefixed to specify its type
+ and store different types of bundle in different folders.
+
+ """
+
+ def __init__(self, root):
+ for subdir in BUNDLE_TYPES.values():
+ fp = os.path.join(root, subdir)
+ if not os.path.isdir(fp):
+ os.makedirs(fp, DIR_MODE, exist_ok=True)
+
+ self.storages = {
+ type: get_objstorage(
+ 'pathslicing', {'root': os.path.join(root, subdir),
+ 'slicing': '0:1/0:5'}
+ )
+ for type, subdir in BUNDLE_TYPES.items()
+ }
+
+ def __contains__(self, obj_id):
+ return obj_id in self.storage
+
+ def add(self, obj_type, obj_id, content):
+ storage = self._get_storage(obj_type)
+ return storage.add(content, obj_id)
+
+ def get(self, obj_type, obj_id):
+ storage = self._get_storage(obj_type)
+ return storage.get(hashutil.hex_to_hash(obj_id))
+
+ def is_cached(self, obj_type, obj_id):
+ storage = self._get_storage(obj_type)
+ return hashutil.hex_to_hash(obj_id) in storage
+
+ def ls(self, obj_type):
+ storage = self._get_storage(obj_type)
+ yield from storage
+
+ def _get_storage(self, obj_type):
+ """Get the storage that corresponds to the object type"""
+ try:
+ return self.storages[obj_type]
+ except:
+ raise ValueError('Wrong bundle type: ' + obj_type)
diff --git a/swh/storage/vault/conf.yaml b/swh/storage/vault/conf.yaml
new file mode 100644
diff --git a/swh/storage/vault/cooker.py b/swh/storage/vault/cooker.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/vault/cooker.py
@@ -0,0 +1,173 @@
+# Copyright (C) 2016 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import abc
+import io
+import os
+import tarfile
+import tempfile
+import itertools
+from swh.core import hashutil
+
+SKIPPED_MESSAGE = (b'This content have not been retrieved in '
+ b'Software Heritage archive due to its size')
+
+
+HIDDEN_MESSAGE = (b'This content is hidden')
+
+
+class BaseVaultCooker(metaclass=abc.ABCMeta):
+ """Abstract base class for the vault's bundle creators
+
+ This class describes a common API for the cookers.
+
+ """
+ @abc.abstractmethod
+ def cook(self, obj_id):
+ """Cook the requested object into a bundle
+
+ The type of the object represented by the id depends on the
+ concrete class. Very likely, each type of bundle will have its
+ own cooker class.
+
+ Args:
+ obj_id: id of the object to be cooked into a bundle.
+
+ """
+ raise NotImplementedError(
+ 'Vault cookers must implement a `cook` method')
+
+
+class DirectoryVaultCooker(BaseVaultCooker):
+ """Cooker to create a directory bundle """
+
+ def __init__(self, storage, cache):
+ """Initialize a cooker that create directory bundles
+
+ Args:
+ storage: source storage where content are retrieved.
+ cache: destination storage where the cooked bundle are stored.
+
+ """
+ self.storage = storage
+ self.cache = cache
+
+ def cook(self, dir_id):
+ """Cook the requested directory into a Bundle
+
+ Args:
+ dir_id (bytes): the id of the directory to be cooked.
+
+ Returns:
+ bytes that correspond to the bundle
+
+ """
+ root = bytes(tempfile.mkdtemp(prefix='directory.', suffix='.cook'),
+ 'utf8')
+ # Retrieve data from the database
+ data = list(self.storage.directory_ls(dir_id, recursive=True))
+ data1, data2 = itertools.tee(data, 2)
+ dir_data = (entry['name'] for entry in data1 if entry['type'] == 'dir')
+ file_data = (entry for entry in data2 if entry['type'] == 'file')
+
+ # Recreate the directory
+ self._create_tree(root, dir_data)
+ self._create_files(root, file_data)
+
+ # Use the created directory to get the bundle datas
+ bundle_content = self._create_bundle_content(
+ root,
+ hashutil.hash_to_hex(dir_id)
+ )
+ self._cache_bundle(dir_id, bundle_content)
+
+ # Make a notification that the bundle have been cooked
+ self._notify_bundle_ready(dir_id)
+
+ def _create_tree(self, root, directory_paths):
+ """Create a directory tree from the given paths
+
+ The tree is created from `root` and each given path in
+ `directory_paths` will be created.
+
+ """
+ # Directories are sorted by depth so they are created in the
+ # right order
+ bsep = bytes(os.path.sep, 'utf8')
+ dir_names = sorted(
+ directory_paths,
+ key=lambda x: len(x.split(bsep))
+ )
+ for dir_name in dir_names:
+ os.makedirs(os.path.join(root, dir_name))
+
+ def _create_files(self, root, file_datas):
+ """Iterates over the file datas and delegate to the right method.
+
+ """
+ # Then create the files
+ for file_data in file_datas:
+ path = os.path.join(root, file_data['name'])
+ status = file_data['status']
+ if status == 'absent':
+ self._create_file_absent(path)
+ elif status == 'hidden':
+ self._create_file_hidden(path)
+ else:
+ content = self._get_file_content(file_data['sha1'])
+ self._create_file(path, content)
+
+ def _get_file_content(self, obj_id):
+ content = list(self.storage.content_get([obj_id]))[0]['data']
+ return content
+
+ def _create_file(self, path, content):
+ """Create the given file and fill it with content."""
+ with open(path, 'wb') as f:
+ f.write(content)
+
+ def _create_file_absent(self, path):
+ """Create a file that indicates a skipped content
+
+ Create the given file but fill it with a specific content to
+ indicates that the content have not been retrieved by the
+ software heritage archive due to its size.
+
+ """
+ self._create_file(self, SKIPPED_MESSAGE)
+
+ def _create_file_hidden(self, path):
+ """Create a file that indicates an hidden content
+
+ Create the given file but fill it with a specific content to
+ indicates that the content could not be retrieved due to
+ privacy policy.
+
+ """
+ self._create_file(self, HIDDEN_MESSAGE)
+
+ def _create_bundle_content(self, path, hex_dir_id):
+ """Create a bundle from the given directory
+
+ Args:
+ path: location of the directory to package.
+ hex_dir_id: hex representation of the directory id
+
+ Returns:
+ a path to the newly created archive file.
+
+ """
+ tar_buffer = io.BytesIO()
+ tar = tarfile.open(fileobj=tar_buffer, mode='w')
+ tar.add(path.decode(), arcname=hex_dir_id)
+ return tar_buffer.getbuffer()
+
+ def _cache_bundle(self, dir_id, bundle_content):
+ self.cache.add('directory', dir_id, bundle_content)
+
+ def _notify_bundle_ready(self, bundle_id):
+ # TODO plug this method with the notification method once
+ # done.
+ pass

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 1:04 PM (9 h, 26 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217585

Event Timeline