Changeset View
Changeset View
Standalone View
Standalone View
swh/vault/to_disk.py
# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import collections
import functools
import itertools
import os

from swh.model import hashutil
from swh.model.from_disk import mode_to_perms, DentryPerms
# Placeholder body used for contents whose status is 'absent'.
SKIPPED_MESSAGE = (b'This content has not been retrieved in the '
                   b'Software Heritage archive due to its size.')

# Placeholder body used for contents whose status is 'hidden'.
HIDDEN_MESSAGE = b'This content is hidden.'
def get_filtered_files_content(storage, files_data):
    """Retrieve the files specified by files_data and apply filters for skipped
    and missing contents.

    All 'visible' contents are requested from the storage in a single
    content_get() call rather than one call per file.

    Args:
        storage: the storage from which to retrieve the objects
        files_data: iterable of file entries as returned by directory_ls()

    Yields:
        A copy of each entry given in files_data with a new 'content' key
        that points to the file content in bytes.

        The contents can be replaced by a specific message to indicate that
        they could not be retrieved (either due to privacy policy or because
        their sizes were too big for us to archive them).

    Raises:
        ValueError: if an entry has a status other than 'visible', 'absent'
            or 'hidden'.
    """
    # The entries are traversed twice (once to batch the fetch, once to
    # yield), so materialize them in case a one-shot iterator was given.
    files_data = list(files_data)

    contents_to_fetch = [f['sha1'] for f in files_data
                         if f['status'] == 'visible']
    contents_fetched = storage.content_get(contents_to_fetch)
    contents = {c['sha1']: c['data'] for c in contents_fetched}

    for file_data in files_data:
        status = file_data['status']
        if status == 'visible':
            content = contents[file_data['sha1']]
        elif status == 'absent':
            content = SKIPPED_MESSAGE
        elif status == 'hidden':
            content = HIDDEN_MESSAGE
        else:
            # Fail loudly instead of reusing the previous entry's content
            # (or hitting an unbound local on the first entry).
            raise ValueError('Unexpected content status %r for entry %r'
                             % (status, file_data.get('name')))
        # 'content' goes last so the resolved bytes always win over any
        # 'content' key already present in the entry.
        yield {**file_data, 'content': content}
def apply_chunked(func, input_list, chunk_size):
    """Apply func on input_list divided in chunks of size chunk_size.

    Args:
        func: callable taking a list of items and returning an iterable
        input_list: the items to process; any iterable is accepted, not
            only sequences supporting len() and slicing
        chunk_size: maximum number of items handed to func at once

    Yields:
        The items produced by func for each successive chunk, in order.

    Raises:
        ValueError: if chunk_size is not strictly positive (raised when the
            generator is first advanced).
    """
    if chunk_size <= 0:
        # A negative step would otherwise silently drop every item.
        raise ValueError('chunk_size must be strictly positive, got %r'
                         % (chunk_size,))
    items = iter(input_list)
    while True:
        chunk = list(itertools.islice(items, chunk_size))
        if not chunk:
            return
        yield from func(chunk)
class DirectoryBuilder:
    """Reconstructs the on-disk representation of a directory in the storage.

    Entry names coming from the storage are split on the bytes form of the
    path separator and joined to ``root``.
    NOTE(review): root is therefore presumably a bytes path as well --
    confirm against callers.
    """

    # Number of file entries whose contents are fetched per batch.
    CONTENT_CHUNK_SIZE = 1000

    def __init__(self, storage, root, dir_id):
        """Initialize the directory builder.

        Args:
            storage: the storage object
            root: the path where the directory should be reconstructed
            dir_id: the identifier of the directory in the storage
        """
        self.storage = storage
        self.root = root
        self.dir_id = dir_id

    def build(self):
        """Perform the reconstruction of the directory in the given root."""
        # Retrieve data from the database.
        data = self.storage.directory_ls(self.dir_id, recursive=True)

        # Split into files, revisions and directory data.
        entries = collections.defaultdict(list)
        for entry in data:
            entries[entry['type']].append(entry)

        # Recreate the directory's subtree and then the files into it.
        self._create_tree(entries['dir'])
        self._create_files(entries['file'])
        self._create_revisions(entries['rev'])

    def _create_tree(self, directories):
        """Create a directory tree from the given paths

        The tree is created from `root` and each given directory in
        `directories` will be created.
        """
        # Directories are sorted by depth (number of separators in the
        # name) so they are created in the right order.
        bsep = os.path.sep.encode()
        by_depth = sorted(directories,
                          key=lambda entry: entry['name'].count(bsep))
        for directory in by_depth:
            os.makedirs(os.path.join(self.root, directory['name']))

    def _create_files(self, files_data):
        """Create the files in the tree and fetch their contents."""
        fetch = functools.partial(get_filtered_files_content, self.storage)
        # Contents are fetched one chunk of entries at a time.
        files_data = apply_chunked(fetch, files_data,
                                   self.CONTENT_CHUNK_SIZE)

        for file_data in files_data:
            path = os.path.join(self.root, file_data['name'])
            self._create_file(path, file_data['content'], file_data['perms'])

    def _create_revisions(self, revs_data):
        """Create the revisions in the tree as broken symlinks to the target
        identifier."""
        for rev_data in revs_data:
            path = os.path.join(self.root, rev_data['name'])
            target = hashutil.hash_to_hex(rev_data['target'])
            # Mode 0o120000 makes _create_file take its symlink branch.
            self._create_file(path, target, mode=0o120000)

    def _create_file(self, path, content, mode=0o100644):
        """Create the given file and fill it with content.

        When `mode` maps to the symlink permissions, `content` is used as
        the link target instead of being written to a regular file.
        """
        perms = mode_to_perms(mode)
        if perms == DentryPerms.symlink:
            os.symlink(content, path)
        else:
            with open(path, 'wb') as f:
                f.write(content)
            os.chmod(path, perms.value)

    def _get_file_content(self, obj_id):
        """Get the content of the given file."""
        # NOTE(review): not called from anywhere in this class; kept so that
        # any external user of it keeps working.
        content = list(self.storage.content_get([obj_id]))[0]['data']
        return content