Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/vault/cooker.py
# Copyright (C) 2016 The Software Heritage developers | # Copyright (C) 2016 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import abc | import abc | ||||
import io | import io | ||||
import os | import os | ||||
import tarfile | import tarfile | ||||
import tempfile | import tempfile | ||||
import itertools | import itertools | ||||
from swh.core import hashutil | from swh.core import hashutil | ||||
SKIPPED_MESSAGE = (b'This content have not been retrieved in ' | SKIPPED_MESSAGE = (b'This content have not been retrieved in ' | ||||
b'Software Heritage archive due to its size') | b'Software Heritage archive due to its size') | ||||
HIDDEN_MESSAGE = (b'This content is hidden') | HIDDEN_MESSAGE = (b'This content is hidden') | ||||
class BaseVaultCooker(metaclass=abc.ABCMeta): | class BaseVaultCooker(metaclass=abc.ABCMeta): | ||||
"""Abstract base class for the vault's bundle creators | """Abstract base class for the vault's bundle creators | ||||
This class describes a common API for the cookers. | This class describes a common API for the cookers. | ||||
""" | """ | ||||
def __init__(self, storage, cache): | |||||
self.storage = storage | |||||
self.cache = cache | |||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def cook(self, obj_id): | def cook(self, obj_id): | ||||
"""Cook the requested object into a bundle | """Cook the requested object into a bundle | ||||
The type of the object represented by the id depends on the | The type of the object represented by the id depends on the | ||||
concrete class. Very likely, each type of bundle will have its | concrete class. Very likely, each type of bundle will have its | ||||
own cooker class. | own cooker class. | ||||
Args: | Args: | ||||
obj_id: id of the object to be cooked into a bundle. | obj_id: id of the object to be cooked into a bundle. | ||||
""" | """ | ||||
raise NotImplementedError( | raise NotImplementedError( | ||||
'Vault cookers must implement a `cook` method') | 'Vault cookers must implement a `cook` method') | ||||
@abc.abstractmethod | |||||
def update_cache(self, id, bundle_content): | |||||
raise NotImplementedError('Vault cookers must implement a ' | |||||
'`update_cache` method') | |||||
@abc.abstractmethod | |||||
def notify_bundle_ready(self, notif_data, bundle_id): | |||||
raise NotImplementedError( | |||||
'Vault cookers must implement a `notify_bundle_ready` method') | |||||
class DirectoryVaultCooker(BaseVaultCooker): | class DirectoryVaultCooker(BaseVaultCooker): | ||||
"""Cooker to create a directory bundle """ | """Cooker to create a directory bundle """ | ||||
def __init__(self, storage, cache): | def __init__(self, storage, cache): | ||||
"""Initialize a cooker that create directory bundles | """Initialize a cooker that create directory bundles | ||||
Args: | Args: | ||||
Show All 9 Lines | def cook(self, dir_id): | ||||
Args: | Args: | ||||
dir_id (bytes): the id of the directory to be cooked. | dir_id (bytes): the id of the directory to be cooked. | ||||
Returns: | Returns: | ||||
bytes that correspond to the bundle | bytes that correspond to the bundle | ||||
""" | """ | ||||
root = bytes(tempfile.mkdtemp(prefix='directory.', suffix='.cook'), | # Create the bytes that corresponds to the compressed | ||||
'utf8') | # directory. | ||||
# Retrieve data from the database | directory_cooker = DirectoryCooker(self.storage) | ||||
bundle_content = directory_cooker.get_directory_bytes(dir_id) | |||||
# Cache the bundle | |||||
self._cache_bundle(dir_id, bundle_content) | |||||
# Make a notification that the bundle have been cooked | |||||
# NOT YET IMPLEMENTED see TODO in function. | |||||
self._notify_bundle_ready(dir_id) | |||||
def update_cache(self, dir_id, bundle_content): | |||||
self.cache.add('directory', dir_id, bundle_content) | |||||
def notify_bundle_ready(self, bundle_id): | |||||
# TODO plug this method with the notification method once | |||||
# done. | |||||
pass | |||||
class DirectoryCooker(): | |||||
"""Creates a cooked directory from its sha1_git in the db. | |||||
Warning: This is NOT a directly accessible cooker, but a low-level | |||||
one that effectuates the manipulations. | |||||
""" | |||||
def __init__(self, storage): | |||||
self.storage = storage | |||||
def get_directory_bytes(self, dir_id): | |||||
# Create temporary folder to retrieve the files into. | |||||
root = bytes(tempfile.mkdtemp(prefix='directory.', | |||||
suffix='.cook'), 'utf8') | |||||
# Retrieve data from the database. | |||||
data = list(self.storage.directory_ls(dir_id, recursive=True)) | data = list(self.storage.directory_ls(dir_id, recursive=True)) | ||||
# Split into files and directory data. | |||||
data1, data2 = itertools.tee(data, 2) | data1, data2 = itertools.tee(data, 2) | ||||
dir_data = (entry['name'] for entry in data1 if entry['type'] == 'dir') | dir_data = (entry['name'] for entry in data1 if entry['type'] == 'dir') | ||||
file_data = (entry for entry in data2 if entry['type'] == 'file') | file_data = (entry for entry in data2 if entry['type'] == 'file') | ||||
# Recreate the directory | # Recreate the directory's subtree and then the files into it. | ||||
self._create_tree(root, dir_data) | self._create_tree(root, dir_data) | ||||
self._create_files(root, file_data) | self._create_files(root, file_data) | ||||
# Use the created directory to get the bundle datas | # Use the created directory to make a bundle with the data as | ||||
# a compressed directory. | |||||
bundle_content = self._create_bundle_content( | bundle_content = self._create_bundle_content( | ||||
root, | root, | ||||
hashutil.hash_to_hex(dir_id) | hashutil.hash_to_hex(dir_id) | ||||
) | ) | ||||
self._cache_bundle(dir_id, bundle_content) | return bundle_content | ||||
# Make a notification that the bundle have been cooked | |||||
self._notify_bundle_ready(dir_id) | |||||
def _create_tree(self, root, directory_paths): | def _create_tree(self, root, directory_paths): | ||||
"""Create a directory tree from the given paths | """Create a directory tree from the given paths | ||||
The tree is created from `root` and each given path in | The tree is created from `root` and each given path in | ||||
`directory_paths` will be created. | `directory_paths` will be created. | ||||
""" | """ | ||||
# Directories are sorted by depth so they are created in the | # Directories are sorted by depth so they are created in the | ||||
# right order | # right order | ||||
bsep = bytes(os.path.sep, 'utf8') | bsep = bytes(os.path.sep, 'utf8') | ||||
dir_names = sorted( | dir_names = sorted( | ||||
directory_paths, | directory_paths, | ||||
key=lambda x: len(x.split(bsep)) | key=lambda x: len(x.split(bsep)) | ||||
) | ) | ||||
for dir_name in dir_names: | for dir_name in dir_names: | ||||
os.makedirs(os.path.join(root, dir_name)) | os.makedirs(os.path.join(root, dir_name)) | ||||
def _create_files(self, root, file_datas): | def _create_files(self, root, file_datas): | ||||
"""Iterates over the file datas and delegate to the right method. | """Create the files according to their status. | ||||
""" | """ | ||||
# Then create the files | # Then create the files | ||||
for file_data in file_datas: | for file_data in file_datas: | ||||
path = os.path.join(root, file_data['name']) | path = os.path.join(root, file_data['name']) | ||||
status = file_data['status'] | status = file_data['status'] | ||||
if status == 'absent': | if status == 'absent': | ||||
self._create_file_absent(path) | self._create_file_absent(path) | ||||
elif status == 'hidden': | elif status == 'hidden': | ||||
self._create_file_hidden(path) | self._create_file_hidden(path) | ||||
else: | else: | ||||
content = self._get_file_content(file_data['sha1']) | content = self._get_file_content(file_data['sha1']) | ||||
self._create_file(path, content) | self._create_file(path, content) | ||||
def _get_file_content(self, obj_id): | |||||
content = list(self.storage.content_get([obj_id]))[0]['data'] | |||||
return content | |||||
def _create_file(self, path, content): | def _create_file(self, path, content): | ||||
"""Create the given file and fill it with content.""" | """Create the given file and fill it with content. | ||||
""" | |||||
with open(path, 'wb') as f: | with open(path, 'wb') as f: | ||||
f.write(content) | f.write(content) | ||||
def _get_file_content(self, obj_id): | |||||
"""Get the content of the given file. | |||||
""" | |||||
content = list(self.storage.content_get([obj_id]))[0]['data'] | |||||
return content | |||||
def _create_file_absent(self, path): | def _create_file_absent(self, path): | ||||
"""Create a file that indicates a skipped content | """Create a file that indicates a skipped content | ||||
Create the given file but fill it with a specific content to | Create the given file but fill it with a specific content to | ||||
indicates that the content have not been retrieved by the | indicates that the content have not been retrieved by the | ||||
software heritage archive due to its size. | software heritage archive due to its size. | ||||
""" | """ | ||||
Show All 12 Lines | class DirectoryCooker(): | ||||
def _create_bundle_content(self, path, hex_dir_id): | def _create_bundle_content(self, path, hex_dir_id): | ||||
"""Create a bundle from the given directory | """Create a bundle from the given directory | ||||
Args: | Args: | ||||
path: location of the directory to package. | path: location of the directory to package. | ||||
hex_dir_id: hex representation of the directory id | hex_dir_id: hex representation of the directory id | ||||
Returns: | Returns: | ||||
a path to the newly created archive file. | bytes that represents the compressed directory as a | ||||
bundle. | |||||
""" | """ | ||||
tar_buffer = io.BytesIO() | tar_buffer = io.BytesIO() | ||||
tar = tarfile.open(fileobj=tar_buffer, mode='w') | tar = tarfile.open(fileobj=tar_buffer, mode='w') | ||||
tar.add(path.decode(), arcname=hex_dir_id) | tar.add(path.decode(), arcname=hex_dir_id) | ||||
return tar_buffer.getbuffer() | return tar_buffer.getbuffer() | ||||
def _cache_bundle(self, dir_id, bundle_content): | |||||
self.cache.add('directory', dir_id, bundle_content) | |||||
def _notify_bundle_ready(self, bundle_id): | |||||
# TODO plug this method with the notification method once | |||||
# done. | |||||
pass |