diff --git a/debian/control b/debian/control index 03ac675..d621418 100644 --- a/debian/control +++ b/debian/control @@ -1,34 +1,34 @@ Source: swh-vault Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, python3-click, python3-dateutil, python3-dulwich, python3-fastimport, python3-flask, python3-nose, python3-psycopg2, python3-setuptools, python3-swh.core (>= 0.0.28~), python3-swh.model (>= 0.0.18~), python3-swh.objstorage (>= 0.0.17~), python3-swh.scheduler (>= 0.0.26~), - python3-swh.storage (>= 0.0.100~), + python3-swh.storage (>= 0.0.106~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DVAU/ Package: python3-swh.vault Architecture: all Depends: python3-swh.core (>= 0.0.28~), python3-swh.model (>= 0.0.18~), python3-swh.objstorage (>= 0.0.17~), python3-swh.scheduler (>= 0.0.26~), - python3-swh.storage (>= 0.0.100~), + python3-swh.storage (>= 0.0.106~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Vault diff --git a/requirements-swh.txt b/requirements-swh.txt index f45ed70..e0638e0 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.40 swh.model >= 0.0.18 swh.objstorage >= 0.0.17 swh.scheduler >= 0.0.26 -swh.storage >= 0.0.100 +swh.storage >= 0.0.106 diff --git a/swh/vault/to_disk.py b/swh/vault/to_disk.py index dae1ffb..85d70f7 100644 --- a/swh/vault/to_disk.py +++ b/swh/vault/to_disk.py @@ -1,127 +1,126 @@ # Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import collections import os from swh.model import hashutil from swh.model.from_disk import mode_to_perms, DentryPerms +from swh.storage.algos.dir_iterators import dir_iterator SKIPPED_MESSAGE = (b'This content has not been retrieved in the ' b'Software Heritage archive due to its size.') HIDDEN_MESSAGE = (b'This content is hidden.') def get_filtered_files_content(storage, files_data): """Retrieve the files specified by files_data and apply filters for skipped and missing contents. Args: storage: the storage from which to retrieve the objects files_data: list of file entries as returned by directory_ls() Yields: The entries given in files_data with a new 'content' key that points to the file content in bytes. The contents can be replaced by a specific message to indicate that they could not be retrieved (either due to privacy policy or because their sizes were too big for us to archive it). """ contents_to_fetch = [f['sha1'] for f in files_data if f['status'] == 'visible'] contents_fetched = storage.content_get(contents_to_fetch) contents = {c['sha1']: c['data'] for c in contents_fetched} for file_data in files_data: if file_data['status'] == 'visible': content = contents[file_data['sha1']] elif file_data['status'] == 'absent': content = SKIPPED_MESSAGE elif file_data['status'] == 'hidden': content = HIDDEN_MESSAGE yield {'content': content, **file_data} def apply_chunked(func, input_list, chunk_size): """Apply func on input_list divided in chunks of size chunk_size""" for i in range(0, len(input_list), chunk_size): yield from func(input_list[i:i + chunk_size]) class DirectoryBuilder: """Reconstructs the on-disk representation of a directory in the storage. """ def __init__(self, storage, root, dir_id): """Initialize the directory builder. Args: storage: the storage object root: the path where the directory should be reconstructed dir_id: the identifier of the directory in the storage """ self.storage = storage self.root = root self.dir_id = dir_id def build(self): """Perform the reconstruction of the directory in the given root.""" # Retrieve data from the database. - data = self.storage.directory_ls(self.dir_id, recursive=True) - # Split into files, revisions and directory data. entries = collections.defaultdict(list) - for entry in data: + for entry in dir_iterator(self.storage, self.dir_id): entries[entry['type']].append(entry) # Recreate the directory's subtree and then the files into it. self._create_tree(entries['dir']) self._create_files(entries['file']) self._create_revisions(entries['rev']) def _create_tree(self, directories): """Create a directory tree from the given paths The tree is created from `root` and each given directory in `directories` will be created. """ # Directories are sorted by depth so they are created in the # right order bsep = os.path.sep.encode() directories = sorted(directories, - key=lambda x: len(x['name'].split(bsep))) + key=lambda x: len(x['path'].split(bsep))) for dir in directories: - os.makedirs(os.path.join(self.root, dir['name'])) + os.makedirs(os.path.join(self.root, dir['path'])) def _create_files(self, files_data): """Create the files in the tree and fetch their contents.""" f = functools.partial(get_filtered_files_content, self.storage) files_data = apply_chunked(f, files_data, 1000) for file_data in files_data: - path = os.path.join(self.root, file_data['name']) + path = os.path.join(self.root, file_data['path']) self._create_file(path, file_data['content'], file_data['perms']) def _create_revisions(self, revs_data): """Create the revisions in the tree as broken symlinks to the target identifier.""" for file_data in revs_data: - path = os.path.join(self.root, file_data['name']) + path = os.path.join(self.root, file_data['path']) self._create_file(path, hashutil.hash_to_hex(file_data['target']), mode=0o120000) def _create_file(self, path, content, mode=0o100644): """Create the given file and fill it with content.""" perms = mode_to_perms(mode) if perms == DentryPerms.symlink: os.symlink(content, path) else: with open(path, 'wb') as f: f.write(content) os.chmod(path, perms.value)