Changeset View
Standalone View
swh/storage/vault/cooker.py
- This file was added.
# Copyright (C) 2016 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import abc | |||||
import io | |||||
import os | |||||
import tarfile | |||||
import tempfile | |||||
import itertools | |||||
from swh.core import hashutil | |||||
# Placeholder payload written in place of a content that was not retrieved
# by the archive because of its size.
SKIPPED_MESSAGE = (
    b'This content have not been retrieved in '
    b'Software Heritage archive due to its size'
)
# Placeholder payload written in place of a content marked as hidden.
HIDDEN_MESSAGE = b'This content is hidden'
class BaseVaultCooker(metaclass=abc.ABCMeta):
    """Common interface shared by the vault's bundle creators (cookers).

    Concrete cookers subclass this class and provide their own `cook`.
    """

    @abc.abstractmethod
    def cook(self, obj_id):
        """Turn the requested object into a bundle.

        What kind of object the id designates is decided by the concrete
        class; each type of bundle is expected to come with its own cooker
        class.

        Args:
            obj_id: id of the object to be cooked into a bundle.

        Raises:
            NotImplementedError: whenever this base implementation is the
                one that ends up being executed.
        """
        message = 'Vault cookers must implement a `cook` method'
        raise NotImplementedError(message)
class DirectoryVaultCooker(BaseVaultCooker):
    """Cooker that packages a directory into a tar bundle."""

    def __init__(self, storage, cache):
        """Initialize a cooker that creates directory bundles.

        Args:
            storage: source storage where contents are retrieved.
            cache: destination storage where the cooked bundles are stored.
        """
        self.storage = storage
        self.cache = cache

    def cook(self, dir_id):
        """Cook the requested directory into a bundle.

        The directory is rebuilt on disk under a temporary location, packed
        into an in-memory tar archive and handed to the cache. The temporary
        tree is removed before returning, whether or not cooking succeeded.

        Args:
            dir_id (bytes): the id of the directory to be cooked.
        """
        # Retrieve data from the database. The listing is materialized
        # because it is traversed twice (once per entry type).
        entries = list(self.storage.directory_ls(dir_id, recursive=True))
        dir_names = [entry['name'] for entry in entries
                     if entry['type'] == 'dir']
        file_entries = [entry for entry in entries
                        if entry['type'] == 'file']

        # The context manager guarantees the scratch tree does not pile up
        # on disk; the bundle itself lives in memory so it outlives it.
        with tempfile.TemporaryDirectory(prefix='directory.',
                                         suffix='.cook') as tmp_dir:
            # Entry names are bytes, so the root must be bytes as well for
            # os.path.join to accept both.
            root = bytes(tmp_dir, 'utf8')

            # Recreate the directory
            self._create_tree(root, dir_names)
            self._create_files(root, file_entries)

            # Use the created directory to get the bundle data
            bundle_content = self._create_bundle_content(
                root,
                hashutil.hash_to_hex(dir_id)
            )

        self._cache_bundle(dir_id, bundle_content)

        # Make a notification that the bundle has been cooked
        self._notify_bundle_ready(dir_id)

    def _create_tree(self, root, directory_paths):
        """Create a directory tree from the given paths.

        Every path in `directory_paths` is created below `root`.

        Args:
            root: directory under which the tree is created.
            directory_paths: iterable of directory paths relative to `root`.
        """
        for dir_name in directory_paths:
            # makedirs creates missing parents, and exist_ok makes the
            # result independent of the order in which paths are listed.
            os.makedirs(os.path.join(root, dir_name), exist_ok=True)

    def _create_files(self, root, file_datas):
        """Iterate over the file data and delegate to the right method.

        Args:
            root: directory under which the files are created.
            file_datas: iterable of file entries; each one is read for its
                'name', 'status' and (for regular contents) 'sha1' keys.
        """
        for file_data in file_datas:
            path = os.path.join(root, file_data['name'])
            status = file_data['status']
            if status == 'absent':
                self._create_file_absent(path)
            elif status == 'hidden':
                self._create_file_hidden(path)
            else:
                content = self._get_file_content(file_data['sha1'])
                self._create_file(path, content)

    def _get_file_content(self, obj_id):
        """Fetch the data of a single content from the source storage.

        Args:
            obj_id: id of the content, passed as-is to `content_get`.

        Returns:
            the 'data' field of the first result returned by the storage.
        """
        # NOTE: this issues one storage call per file although content_get
        # takes a list of ids; batching is a possible later improvement.
        content = list(self.storage.content_get([obj_id]))[0]['data']
        return content

    def _create_file(self, path, content):
        """Create the given file and fill it with content."""
        with open(path, 'wb') as f:
            f.write(content)

    def _create_file_absent(self, path):
        """Create a file that indicates a skipped content.

        Create the given file but fill it with a specific content to
        indicate that the content has not been retrieved by the
        Software Heritage archive due to its size.
        """
        self._create_file(path, SKIPPED_MESSAGE)

    def _create_file_hidden(self, path):
        """Create a file that indicates a hidden content.

        Create the given file but fill it with a specific content to
        indicate that the content could not be retrieved due to
        privacy policy.
        """
        self._create_file(path, HIDDEN_MESSAGE)

    def _create_bundle_content(self, path, hex_dir_id):
        """Create a bundle from the given directory.

        Args:
            path (bytes): location of the directory to package.
            hex_dir_id: hex representation of the directory id; used as the
                name of the top-level entry inside the archive.

        Returns:
            a memoryview over the bytes of the newly created tar archive.
        """
        tar_buffer = io.BytesIO()
        # The archive must be closed before its bytes are read: closing is
        # what writes the end-of-archive blocks. The underlying buffer is
        # left open by tarfile since it was supplied by the caller.
        with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
            tar.add(path.decode(), arcname=hex_dir_id)
        return tar_buffer.getbuffer()

    def _cache_bundle(self, dir_id, bundle_content):
        """Store the cooked bundle in the cache under the 'directory' type."""
        self.cache.add('directory', dir_id, bundle_content)

    def _notify_bundle_ready(self, bundle_id):
        """Hook called once the bundle is cached; currently a no-op."""
        # TODO plug this method with the notification method once
        # done.
        pass
describes