diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py
index 4d7a5c4..9e28f19 100644
--- a/swh/vault/cookers/base.py
+++ b/swh/vault/cookers/base.py
@@ -1,232 +1,128 @@
 # Copyright (C) 2016-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import abc
 import io
-import itertools
 import logging
-import os
 
 from swh.core import config
 from swh.model import hashutil
-from swh.model.from_disk import mode_to_perms, DentryPerms
 from swh.storage import get_storage
 from swh.vault.api.client import RemoteVaultClient
 
 
 DEFAULT_CONFIG_PATH = 'vault/cooker'
 DEFAULT_CONFIG = {
     'storage': ('dict', {
         'cls': 'remote',
         'args': {
             'url': 'http://localhost:5002/',
         },
     }),
     'vault_url': ('str', 'http://localhost:5005/'),
     'max_bundle_size': ('int', 2 ** 29),  # 512 MiB
 }
 
 
 class PolicyError(Exception):
     """Raised when the bundle violates the cooking policy."""
     pass
 
 
 class BundleTooLargeError(PolicyError):
     """Raised when the bundle is too large to be cooked."""
     pass
 
 
 class BytesIOBundleSizeLimit(io.BytesIO):
     def __init__(self, *args, size_limit=None, **kwargs):
         super().__init__(self, *args, **kwargs)
         self.size_limit = size_limit
 
     def write(self, chunk):
         if ((self.size_limit is not None
              and self.getbuffer().nbytes + len(chunk) > self.size_limit)):
             raise BundleTooLargeError(
                 "The requested bundle exceeds the maximum allowed "
                 "size of {} bytes.".format(self.size_limit))
         return super().write(chunk)
 
 
 class BaseVaultCooker(metaclass=abc.ABCMeta):
     """Abstract base class for the vault's bundle creators
 
     This class describes a common API for the cookers.
 
     To define a new cooker, inherit from this class and override:
     - CACHE_TYPE_KEY: key to use for the bundle to reference in cache
     - def cook(): cook the object into a bundle
     """
     CACHE_TYPE_KEY = None
 
     def __init__(self, obj_type, obj_id):
         """Initialize the cooker.
 
         The type of the object represented by the id depends on the
         concrete class. Very likely, each type of bundle will have its
         own cooker class.
 
         Args:
             storage: the storage object
             cache: the cache where to store the bundle
             obj_id: id of the object to be cooked into a bundle.
         """
         self.config = config.load_named_config(DEFAULT_CONFIG_PATH,
                                                DEFAULT_CONFIG)
         self.obj_type = obj_type
         self.obj_id = hashutil.hash_to_bytes(obj_id)
         self.backend = RemoteVaultClient(self.config['vault_url'])
         self.storage = get_storage(**self.config['storage'])
         self.max_bundle_size = self.config['max_bundle_size']
 
     @abc.abstractmethod
     def check_exists(self):
         """Checks that the requested object exists and can be cooked.
 
         Override this in the cooker implementation.
         """
         raise NotImplemented
 
     @abc.abstractmethod
     def prepare_bundle(self):
         """Implementation of the cooker. Yields chunks of the bundle bytes.
 
         Override this with the cooker implementation.
         """
         raise NotImplemented
 
     def write(self, chunk):
         self.fileobj.write(chunk)
 
     def cook(self):
         """Cook the requested object into a bundle
         """
         self.backend.set_status(self.obj_type, self.obj_id, 'pending')
         self.backend.set_progress(self.obj_type, self.obj_id, 'Processing...')
 
         self.fileobj = BytesIOBundleSizeLimit(size_limit=self.max_bundle_size)
         try:
             self.prepare_bundle()
             bundle = self.fileobj.getvalue()
         except PolicyError as e:
             self.backend.set_status(self.obj_type, self.obj_id, 'failed')
             self.backend.set_progress(self.obj_type, self.obj_id, str(e))
         except Exception as e:
             self.backend.set_status(self.obj_type, self.obj_id, 'failed')
             self.backend.set_progress(
                 self.obj_type, self.obj_id,
                 "Internal Server Error. This incident will be reported.")
             logging.exception("Bundle cooking failed.")
         else:
             # TODO: use proper content streaming instead of put_bundle()
             self.backend.put_bundle(self.CACHE_TYPE_KEY, self.obj_id, bundle)
             self.backend.set_status(self.obj_type, self.obj_id, 'done')
             self.backend.set_progress(self.obj_type, self.obj_id, None)
         finally:
             self.backend.send_notif(self.obj_type, self.obj_id)
-
-
-SKIPPED_MESSAGE = (b'This content has not been retrieved in the '
-                   b'Software Heritage archive due to its size.')
-
-HIDDEN_MESSAGE = (b'This content is hidden.')
-
-
-def get_filtered_file_content(storage, file_data):
-    """Retrieve the file specified by file_data and apply filters for skipped
-    and missing contents.
-
-    Args:
-        storage: the storage from which to retrieve the object
-        file_data: file entry descriptor as returned by directory_ls()
-
-    Returns:
-        Bytes containing the specified content. The content will be replaced by
-        a specific message to indicate that the content could not be retrieved
-        (either due to privacy policy or because its size was too big for us to
-        archive it).
-    """
-
-    assert file_data['type'] == 'file'
-
-    if file_data['status'] == 'absent':
-        return SKIPPED_MESSAGE
-    elif file_data['status'] == 'hidden':
-        return HIDDEN_MESSAGE
-    else:
-        return list(storage.content_get([file_data['sha1']]))[0]['data']
-
-
-class DirectoryBuilder:
-    """Creates a cooked directory from its sha1_git in the db.
-
-    Warning: This is NOT a directly accessible cooker, but a low-level
-    one that executes the manipulations.
-
-    """
-    def __init__(self, storage):
-        self.storage = storage
-
-    def build_directory(self, dir_id, root):
-        # Retrieve data from the database.
-        data = self.storage.directory_ls(dir_id, recursive=True)
-
-        # Split into files and directory data.
-        # TODO(seirl): also handle revision data.
-        data1, data2 = itertools.tee(data, 2)
-        dir_data = (entry['name'] for entry in data1 if entry['type'] == 'dir')
-        file_data = (entry for entry in data2 if entry['type'] == 'file')
-
-        # Recreate the directory's subtree and then the files into it.
-        self._create_tree(root, dir_data)
-        self._create_files(root, file_data)
-
-    def _create_tree(self, root, directory_paths):
-        """Create a directory tree from the given paths
-
-        The tree is created from `root` and each given path in
-        `directory_paths` will be created.
-
-        """
-        # Directories are sorted by depth so they are created in the
-        # right order
-        bsep = bytes(os.path.sep, 'utf8')
-        dir_names = sorted(
-            directory_paths,
-            key=lambda x: len(x.split(bsep)))
-        for dir_name in dir_names:
-            os.makedirs(os.path.join(root, dir_name))
-
-    def _create_files(self, root, file_datas):
-        """Create the files according to their status.
-
-        """
-        # Then create the files
-        for file_data in file_datas:
-            path = os.path.join(root, file_data['name'])
-            content = get_filtered_file_content(self.storage, file_data)
-            self._create_file(path, content, file_data['perms'])
-
-    def _create_file(self, path, content, mode=0o100644):
-        """Create the given file and fill it with content.
-
-        """
-        perms = mode_to_perms(mode)
-        if perms == DentryPerms.symlink:
-            os.symlink(content, path)
-        else:
-            with open(path, 'wb') as f:
-                f.write(content)
-            os.chmod(path, perms.value)
-
-    def _get_file_content(self, obj_id):
-        """Get the content of the given file.
-
-        """
-        content = list(self.storage.content_get([obj_id]))[0]['data']
-        return content
diff --git a/swh/vault/cookers/directory.py b/swh/vault/cookers/directory.py
index ca599bd..70d9da0 100644
--- a/swh/vault/cookers/directory.py
+++ b/swh/vault/cookers/directory.py
@@ -1,25 +1,27 @@
 # Copyright (C) 2016  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import tarfile
 import tempfile
 
-from swh.vault.cookers.base import BaseVaultCooker, DirectoryBuilder
 from swh.model import hashutil
+from swh.vault.cookers.base import BaseVaultCooker
+from swh.vault.to_disk import DirectoryBuilder
 
 
 class DirectoryCooker(BaseVaultCooker):
     """Cooker to create a directory bundle """
     CACHE_TYPE_KEY = 'directory'
 
     def check_exists(self):
         return not list(self.storage.directory_missing([self.obj_id]))
 
     def prepare_bundle(self):
-        directory_builder = DirectoryBuilder(self.storage)
         with tempfile.TemporaryDirectory(prefix='tmp-vault-directory-') as td:
-            directory_builder.build_directory(self.obj_id, td.encode())
+            directory_builder = DirectoryBuilder(
+                self.storage, td.encode(), self.obj_id)
+            directory_builder.build()
             tar = tarfile.open(fileobj=self.fileobj, mode='w')
             tar.add(td, arcname=hashutil.hash_to_hex(self.obj_id))
diff --git a/swh/vault/cookers/revision_flat.py b/swh/vault/cookers/revision_flat.py
index f358a5f..0fc8579 100644
--- a/swh/vault/cookers/revision_flat.py
+++ b/swh/vault/cookers/revision_flat.py
@@ -1,31 +1,32 @@
 # Copyright (C) 2016-2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import tarfile
 import tempfile
 from pathlib import Path
 
 from swh.model import hashutil
-from .base import BaseVaultCooker, DirectoryBuilder
+from swh.vault.cookers.base import BaseVaultCooker
+from swh.vault.to_disk import DirectoryBuilder
 
 
 class RevisionFlatCooker(BaseVaultCooker):
     """Cooker to create a revision_flat bundle """
     CACHE_TYPE_KEY = 'revision_flat'
 
     def check_exists(self):
         return not list(self.storage.revision_missing([self.obj_id]))
 
     def prepare_bundle(self):
-        directory_builder = DirectoryBuilder(self.storage)
         with tempfile.TemporaryDirectory(prefix='tmp-vault-revision-') as td:
             root = Path(td)
             for revision in self.storage.revision_log([self.obj_id]):
                 revdir = root / hashutil.hash_to_hex(revision['id'])
                 revdir.mkdir()
-                directory_builder.build_directory(revision['directory'],
-                                                  str(revdir).encode())
+                directory_builder = DirectoryBuilder(
+                    self.storage, str(revdir).encode(), revision['directory'])
+                directory_builder.build()
             tar = tarfile.open(fileobj=self.fileobj, mode='w')
             tar.add(td, arcname=hashutil.hash_to_hex(self.obj_id))
diff --git a/swh/vault/cookers/revision_gitfast.py b/swh/vault/cookers/revision_gitfast.py
index b806a3b..e157c2c 100644
--- a/swh/vault/cookers/revision_gitfast.py
+++ b/swh/vault/cookers/revision_gitfast.py
@@ -1,223 +1,224 @@
 # Copyright (C) 2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import collections
 import functools
 import os
 import time
 import zlib
 
 from fastimport.commands import (CommitCommand, ResetCommand, BlobCommand,
                                  FileDeleteCommand, FileModifyCommand)
 
-from .base import BaseVaultCooker, get_filtered_file_content
 from swh.model.from_disk import mode_to_perms
+from swh.vault.cookers.base import BaseVaultCooker
+from swh.vault.to_disk import get_filtered_file_content
 
 
 class RevisionGitfastCooker(BaseVaultCooker):
     """Cooker to create a git fast-import bundle """
     CACHE_TYPE_KEY = 'revision_gitfast'
 
     def check_exists(self):
         return not list(self.storage.revision_missing([self.obj_id]))
 
     def prepare_bundle(self):
         log = self.storage.revision_log([self.obj_id])
         self.gzobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
         self.fastexport(log)
         self.write(self.gzobj.flush())
 
     def write_cmd(self, cmd):
         chunk = bytes(cmd) + b'\n'
         super().write(self.gzobj.compress(chunk))
 
     def fastexport(self, log):
         """Generate all the git fast-import commands from a given log.
         """
         self.rev_by_id = {r['id']: r for r in log}
         self.rev_sorted = list(self._toposort(self.rev_by_id))
         self.obj_done = set()
         self.obj_to_mark = {}
         self.next_available_mark = 1
 
         last_progress_report = None
 
         for i, rev in enumerate(self.rev_sorted, 1):
             # Update progress if needed
             ct = time.time()
             if (last_progress_report is None
                     or last_progress_report + 2 <= ct):
                 last_progress_report = ct
                 pg = ('Computing revision {}/{}'
                       .format(i, len(self.rev_sorted)))
                 self.backend.set_progress(self.obj_type, self.obj_id, pg)
 
             # Compute the current commit
             self._compute_commit_command(rev)
 
     def _toposort(self, rev_by_id):
         """Perform a topological sort on the revision graph.
         """
         children = collections.defaultdict(list)  # rev -> children
         in_degree = {}  # rev -> numbers of parents left to compute
 
         # Compute the in_degrees and the parents of all the revisions.
         # Add the roots to the processing queue.
         queue = collections.deque()
         for rev_id, rev in rev_by_id.items():
             in_degree[rev_id] = len(rev['parents'])
             if not rev['parents']:
                 queue.append(rev_id)
             for parent in rev['parents']:
                 children[parent].append(rev_id)
 
         # Topological sort: yield the 'ready' nodes, decrease the in degree of
         # their children and add the 'ready' ones to the queue.
         while queue:
             rev_id = queue.popleft()
             yield rev_by_id[rev_id]
             for child in children[rev_id]:
                 in_degree[child] -= 1
                 if in_degree[child] == 0:
                     queue.append(child)
 
     def mark(self, obj_id):
         """Get the mark ID as bytes of a git object.
 
         If the object has not yet been marked, assign a new ID and add it to
         the mark dictionary.
         """
         if obj_id not in self.obj_to_mark:
             self.obj_to_mark[obj_id] = self.next_available_mark
             self.next_available_mark += 1
         return str(self.obj_to_mark[obj_id]).encode()
 
     def _compute_blob_command_content(self, file_data):
         """Compute the blob command of a file entry if it has not been
         computed yet.
         """
         obj_id = file_data['sha1']
         if obj_id in self.obj_done:
             return
         content = get_filtered_file_content(self.storage, file_data)
         self.write_cmd(BlobCommand(mark=self.mark(obj_id), data=content))
         self.obj_done.add(obj_id)
 
     def _author_tuple_format(self, author, date):
         # We never want to have None values here so we replace null entries
         # by ''.
         if author is not None:
             author_tuple = (author.get('name') or b'',
                             author.get('email') or b'')
         else:
             author_tuple = (b'', b'')
         if date is not None:
             date_tuple = (date.get('timestamp', {}).get('seconds') or 0,
                           (date.get('offset') or 0) * 60)
         else:
             date_tuple = (0, 0)
         return author_tuple + date_tuple
 
     def _compute_commit_command(self, rev):
         """Compute a commit command from a specific revision.
         """
         if 'parents' in rev and rev['parents']:
             from_ = b':' + self.mark(rev['parents'][0])
             merges = [b':' + self.mark(r) for r in rev['parents'][1:]]
             parent = self.rev_by_id[rev['parents'][0]]
         else:
             # We issue a reset command before all the new roots so that they
             # are not automatically added as children of the current branch.
             self.write_cmd(ResetCommand(b'refs/heads/master', None))
             from_ = None
             merges = None
             parent = None
 
         # Retrieve the file commands while yielding new blob commands if
         # needed.
         files = list(self._compute_file_commands(rev, parent))
 
         # Construct and write the commit command
         author = self._author_tuple_format(rev['author'], rev['date'])
         committer = self._author_tuple_format(rev['committer'],
                                               rev['committer_date'])
         self.write_cmd(CommitCommand(
             ref=b'refs/heads/master',
             mark=self.mark(rev['id']),
             author=author,
             committer=committer,
             message=rev['message'] or b'',
             from_=from_,
             merges=merges,
             file_iter=files))
 
     @functools.lru_cache(maxsize=4096)
     def _get_dir_ents(self, dir_id=None):
         """Get the entities of a directory as a dictionary (name -> entity).
 
         This function has a cache to avoid doing multiple requests to retrieve
         the same entities, as doing a directory_ls() is expensive.
         """
         data = (self.storage.directory_ls(dir_id)
                 if dir_id is not None else [])
         return {f['name']: f for f in data}
 
     def _compute_file_commands(self, rev, parent=None):
         """Compute all the file commands of a revision.
 
         Generate a diff of the files between the revision and its main parent
         to find the necessary file commands to apply.
         """
         # Initialize the stack with the root of the tree.
         cur_dir = rev['directory']
         parent_dir = parent['directory'] if parent else None
         stack = [(b'', cur_dir, parent_dir)]
 
         while stack:
             # Retrieve the current directory and the directory of the parent
             # commit in order to compute the diff of the trees.
             root, cur_dir_id, prev_dir_id = stack.pop()
             cur_dir = self._get_dir_ents(cur_dir_id)
             prev_dir = self._get_dir_ents(prev_dir_id)
 
             # Find subtrees to delete:
             #  - Subtrees that are not in the new tree (file or directory
             #    deleted).
             #  - Subtrees that do not have the same type in the new tree
             #    (file -> directory or directory -> file)
             # After this step, every node remaining in the previous directory
             # has the same type than the one in the current directory.
             for fname, f in prev_dir.items():
                 if ((fname not in cur_dir
                      or f['type'] != cur_dir[fname]['type'])):
                     yield FileDeleteCommand(path=os.path.join(root, fname))
 
             # Find subtrees to modify:
             #  - Leaves (files) will be added or modified using `filemodify`
             #  - Other subtrees (directories) will be added to the stack and
             #    processed in the next iteration.
             for fname, f in cur_dir.items():
                 # A file is added or modified if it was not in the tree, if its
                 # permissions changed or if its content changed.
                 if (f['type'] == 'file'
                     and (fname not in prev_dir
                          or f['sha1'] != prev_dir[fname]['sha1']
                          or f['perms'] != prev_dir[fname]['perms'])):
                     # Issue a blob command for the new blobs if needed.
                     self._compute_blob_command_content(f)
                     yield FileModifyCommand(
                         path=os.path.join(root, fname),
                         mode=mode_to_perms(f['perms']).value,
                         dataref=(b':' + self.mark(f['sha1'])),
                         data=None)
                 # A directory is added or modified if it was not in the tree or
                 # if its target changed.
                 elif f['type'] == 'dir':
                     f_prev_target = None
                     if fname in prev_dir and prev_dir[fname]['type'] == 'dir':
                         f_prev_target = prev_dir[fname]['target']
                     if f_prev_target is None or f['target'] != f_prev_target:
                         stack.append((os.path.join(root, fname),
                                       f['target'], f_prev_target))
diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py
index 50a63b3..025e9f9 100644
--- a/swh/vault/tests/test_cookers.py
+++ b/swh/vault/tests/test_cookers.py
@@ -1,422 +1,422 @@
 # Copyright (C) 2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import contextlib
 import datetime
 import gzip
 import io
 import os
 import pathlib
 import subprocess
 import tarfile
 import tempfile
 import unittest
 import unittest.mock
 
 import dulwich.fastexport
 import dulwich.index
 import dulwich.objects
 import dulwich.porcelain
 import dulwich.repo
 
 from swh.core.tests.db_testing import DbTestFixture
 from swh.loader.git.loader import GitLoader
 from swh.model import hashutil
 from swh.model.from_disk import Directory
 from swh.storage.tests.storage_testing import StorageTestFixture
 from swh.vault.cookers import DirectoryCooker, RevisionGitfastCooker
-from swh.vault.cookers.base import SKIPPED_MESSAGE, HIDDEN_MESSAGE
 from swh.vault.tests.vault_testing import VaultTestFixture, hash_content
+from swh.vault.to_disk import SKIPPED_MESSAGE, HIDDEN_MESSAGE
 
 
 class TestRepo:
     """A tiny context manager for a test git repository, with some utility
     functions to perform basic git stuff.
     """
     def __enter__(self):
         self.tmp_dir = tempfile.TemporaryDirectory(prefix='tmp-vault-repo-')
         self.repo_dir = self.tmp_dir.__enter__()
         self.repo = dulwich.repo.Repo.init(self.repo_dir)
         self.author = '"Test Author" <test@softwareheritage.org>'.encode()
         return pathlib.Path(self.repo_dir)
 
     def __exit__(self, exc, value, tb):
         self.tmp_dir.__exit__(exc, value, tb)
 
     def checkout(self, rev_sha):
         rev = self.repo[rev_sha]
         dulwich.index.build_index_from_tree(self.repo_dir,
                                             self.repo.index_path(),
                                             self.repo.object_store,
                                             rev.tree)
 
     def git_shell(self, *cmd, stdout=subprocess.DEVNULL, **kwargs):
         subprocess.check_call(('git', '-C', self.repo_dir) + cmd,
                               stdout=stdout, **kwargs)
 
     def commit(self, message='Commit test\n', ref=b'HEAD'):
         self.git_shell('add', '.')
         message = message.encode() + b'\n'
         return self.repo.do_commit(message=message, committer=self.author,
                                    ref=ref)
 
     def merge(self, parent_sha_list, message='Merge branches.'):
         self.git_shell('merge', '--allow-unrelated-histories',
                        '-m', message, *[p.decode() for p in parent_sha_list])
         return self.repo.refs[b'HEAD']
 
     def print_debug_graph(self, reflog=False):
         args = ['log', '--all', '--graph', '--decorate']
         if reflog:
             args.append('--reflog')
         self.git_shell(*args, stdout=None)
 
 
 class BaseTestCookers(VaultTestFixture, StorageTestFixture, DbTestFixture):
     """Base class of cookers unit tests"""
     def setUp(self):
         super().setUp()
         self.loader = GitLoader()
         self.loader.storage = self.storage
 
     def load(self, repo_path):
         """Load a repository in the test storage"""
         self.loader.load('fake_origin', repo_path, datetime.datetime.now())
 
     @contextlib.contextmanager
     def cook_extract_directory(self, obj_id):
         """Context manager that cooks a directory and extract it."""
         cooker = DirectoryCooker('directory', obj_id)
         cooker.storage = self.storage
         cooker.backend = unittest.mock.MagicMock()
         cooker.fileobj = io.BytesIO()
         assert cooker.check_exists()
         cooker.prepare_bundle()
         cooker.fileobj.seek(0)
         with tempfile.TemporaryDirectory(prefix='tmp-vault-extract-') as td:
             with tarfile.open(fileobj=cooker.fileobj, mode='r') as tar:
                 tar.extractall(td)
             yield pathlib.Path(td) / hashutil.hash_to_hex(obj_id)
 
     @contextlib.contextmanager
     def cook_extract_revision_gitfast(self, obj_id):
         """Context manager that cooks a revision and extract it."""
         cooker = RevisionGitfastCooker('revision_gitfast', obj_id)
         cooker.storage = self.storage
         cooker.backend = unittest.mock.MagicMock()
         cooker.fileobj = io.BytesIO()
         assert cooker.check_exists()
         cooker.prepare_bundle()
         cooker.fileobj.seek(0)
         fastexport_stream = gzip.GzipFile(fileobj=cooker.fileobj)
         test_repo = TestRepo()
         with test_repo as p:
             processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
             processor.import_stream(fastexport_stream)
             yield test_repo, p
 
 
 TEST_CONTENT = ("   test content\n"
                 "and unicode \N{BLACK HEART SUIT}\n"
                 " and trailing spaces   ")
 TEST_EXECUTABLE = b'\x42\x40\x00\x00\x05'
 
 
 class TestDirectoryCooker(BaseTestCookers, unittest.TestCase):
     def test_directory_simple(self):
         repo = TestRepo()
         with repo as rp:
             (rp / 'file').write_text(TEST_CONTENT)
             (rp / 'executable').write_bytes(TEST_EXECUTABLE)
             (rp / 'executable').chmod(0o755)
             (rp / 'link').symlink_to('file')
             (rp / 'dir1/dir2').mkdir(parents=True)
             (rp / 'dir1/dir2/file').write_text(TEST_CONTENT)
             c = repo.commit()
             self.load(str(rp))
 
             obj_id_hex = repo.repo[c].tree.decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
 
         with self.cook_extract_directory(obj_id) as p:
             self.assertEqual((p / 'file').stat().st_mode, 0o100644)
             self.assertEqual((p / 'file').read_text(), TEST_CONTENT)
             self.assertEqual((p / 'executable').stat().st_mode, 0o100755)
             self.assertEqual((p / 'executable').read_bytes(), TEST_EXECUTABLE)
             self.assertTrue((p / 'link').is_symlink)
             self.assertEqual(os.readlink(str(p / 'link')), 'file')
             self.assertEqual((p / 'dir1/dir2/file').stat().st_mode, 0o100644)
             self.assertEqual((p / 'dir1/dir2/file').read_text(), TEST_CONTENT)
 
             directory = Directory.from_disk(path=bytes(p))
             self.assertEqual(obj_id_hex, hashutil.hash_to_hex(directory.hash))
 
     def test_directory_filtered_objects(self):
         repo = TestRepo()
         with repo as rp:
             file_1, id_1 = hash_content(b'test1')
             file_2, id_2 = hash_content(b'test2')
             file_3, id_3 = hash_content(b'test3')
 
             (rp / 'file').write_bytes(file_1)
             (rp / 'hidden_file').write_bytes(file_2)
             (rp / 'absent_file').write_bytes(file_3)
 
             c = repo.commit()
             self.load(str(rp))
 
             obj_id_hex = repo.repo[c].tree.decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
 
         # FIXME: storage.content_update() should be changed to allow things
         # like that
         cur = self.storage.db._cursor(None)
         cur.execute("""update content set status = 'visible'
                        where sha1 = %s""", (id_1,))
         cur.execute("""update content set status = 'hidden'
                        where sha1 = %s""", (id_2,))
         cur.execute("""update content set status = 'absent'
                        where sha1 = %s""", (id_3,))
         cur.close()
 
         with self.cook_extract_directory(obj_id) as p:
             self.assertEqual((p / 'file').read_bytes(), b'test1')
             self.assertEqual((p / 'hidden_file').read_bytes(), HIDDEN_MESSAGE)
             self.assertEqual((p / 'absent_file').read_bytes(), SKIPPED_MESSAGE)
 
     def test_directory_bogus_perms(self):
         # Some early git repositories have 664/775 permissions... let's check
         # if all the weird modes are properly normalized in the directory
         # cooker.
         repo = TestRepo()
         with repo as rp:
             (rp / 'file').write_text(TEST_CONTENT)
             (rp / 'file').chmod(0o664)
             (rp / 'executable').write_bytes(TEST_EXECUTABLE)
             (rp / 'executable').chmod(0o775)
             (rp / 'wat').write_text(TEST_CONTENT)
             (rp / 'wat').chmod(0o604)
             c = repo.commit()
             self.load(str(rp))
 
             obj_id_hex = repo.repo[c].tree.decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
 
         with self.cook_extract_directory(obj_id) as p:
             self.assertEqual((p / 'file').stat().st_mode, 0o100644)
             self.assertEqual((p / 'executable').stat().st_mode, 0o100755)
             self.assertEqual((p / 'wat').stat().st_mode, 0o100644)
 
 
 class TestRevisionGitfastCooker(BaseTestCookers, unittest.TestCase):
     def test_revision_simple(self):
         #
         #     1--2--3--4--5--6--7
         #
         repo = TestRepo()
         with repo as rp:
             (rp / 'file1').write_text(TEST_CONTENT)
             repo.commit('add file1')
             (rp / 'file2').write_text(TEST_CONTENT)
             repo.commit('add file2')
             (rp / 'dir1/dir2').mkdir(parents=True)
             (rp / 'dir1/dir2/file').write_text(TEST_CONTENT)
             repo.commit('add dir1/dir2/file')
             (rp / 'bin1').write_bytes(TEST_EXECUTABLE)
             (rp / 'bin1').chmod(0o755)
             repo.commit('add bin1')
             (rp / 'link1').symlink_to('file1')
             repo.commit('link link1 to file1')
             (rp / 'file2').unlink()
             repo.commit('remove file2')
             (rp / 'bin1').rename(rp / 'bin')
             repo.commit('rename bin1 to bin')
             self.load(str(rp))
             obj_id_hex = repo.repo.refs[b'HEAD'].decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
 
         with self.cook_extract_revision_gitfast(obj_id) as (ert, p):
             ert.checkout(b'HEAD')
             self.assertEqual((p / 'file1').stat().st_mode, 0o100644)
             self.assertEqual((p / 'file1').read_text(), TEST_CONTENT)
             self.assertTrue((p / 'link1').is_symlink)
             self.assertEqual(os.readlink(str(p / 'link1')), 'file1')
             self.assertEqual((p / 'bin').stat().st_mode, 0o100755)
             self.assertEqual((p / 'bin').read_bytes(), TEST_EXECUTABLE)
             self.assertEqual((p / 'dir1/dir2/file').read_text(), TEST_CONTENT)
             self.assertEqual((p / 'dir1/dir2/file').stat().st_mode, 0o100644)
             self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex)
 
     def test_revision_two_roots(self):
         #
         #    1----3---4
         #        /
         #   2----
         #
         repo = TestRepo()
         with repo as rp:
             (rp / 'file1').write_text(TEST_CONTENT)
             c1 = repo.commit('Add file1')
             del repo.repo.refs[b'refs/heads/master']  # git update-ref -d HEAD
             (rp / 'file2').write_text(TEST_CONTENT)
             repo.commit('Add file2')
             repo.merge([c1])
             (rp / 'file3').write_text(TEST_CONTENT)
             repo.commit('add file3')
             obj_id_hex = repo.repo.refs[b'HEAD'].decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
             self.load(str(rp))
 
         with self.cook_extract_revision_gitfast(obj_id) as (ert, p):
             self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex)
 
     def test_revision_two_double_fork_merge(self):
         #
         #     2---4---6
         #    /   /   /
         #   1---3---5
         #
         repo = TestRepo()
         with repo as rp:
             (rp / 'file1').write_text(TEST_CONTENT)
             c1 = repo.commit('Add file1')
             repo.repo.refs[b'refs/heads/c1'] = c1
 
             (rp / 'file2').write_text(TEST_CONTENT)
             repo.commit('Add file2')
 
             (rp / 'file3').write_text(TEST_CONTENT)
             c3 = repo.commit('Add file3', ref=b'refs/heads/c1')
             repo.repo.refs[b'refs/heads/c3'] = c3
 
             repo.merge([c3])
 
             (rp / 'file5').write_text(TEST_CONTENT)
             c5 = repo.commit('Add file3', ref=b'refs/heads/c3')
 
             repo.merge([c5])
 
             obj_id_hex = repo.repo.refs[b'HEAD'].decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
             self.load(str(rp))
 
         with self.cook_extract_revision_gitfast(obj_id) as (ert, p):
             self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex)
 
     def test_revision_triple_merge(self):
         #
         #       .---.---5
         #      /   /   /
         #     2   3   4
         #    /   /   /
         #   1---.---.
         #
         repo = TestRepo()
         with repo as rp:
             (rp / 'file1').write_text(TEST_CONTENT)
             c1 = repo.commit('Commit 1')
             repo.repo.refs[b'refs/heads/b1'] = c1
             repo.repo.refs[b'refs/heads/b2'] = c1
 
             repo.commit('Commit 2')
             c3 = repo.commit('Commit 3', ref=b'refs/heads/b1')
             c4 = repo.commit('Commit 4', ref=b'refs/heads/b2')
             repo.merge([c3, c4])
 
             obj_id_hex = repo.repo.refs[b'HEAD'].decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
             self.load(str(rp))
 
         with self.cook_extract_revision_gitfast(obj_id) as (ert, p):
             self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex)
 
     def test_revision_filtered_objects(self):
         repo = TestRepo()
         with repo as rp:
             file_1, id_1 = hash_content(b'test1')
             file_2, id_2 = hash_content(b'test2')
             file_3, id_3 = hash_content(b'test3')
 
             (rp / 'file').write_bytes(file_1)
             (rp / 'hidden_file').write_bytes(file_2)
             (rp / 'absent_file').write_bytes(file_3)
 
             repo.commit()
             obj_id_hex = repo.repo.refs[b'HEAD'].decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
             self.load(str(rp))
 
         # FIXME: storage.content_update() should be changed to allow things
         # like that
         cur = self.storage.db._cursor(None)
         cur.execute("""update content set status = 'visible'
                        where sha1 = %s""", (id_1,))
         cur.execute("""update content set status = 'hidden'
                        where sha1 = %s""", (id_2,))
         cur.execute("""update content set status = 'absent'
                        where sha1 = %s""", (id_3,))
         cur.close()
 
         with self.cook_extract_revision_gitfast(obj_id) as (ert, p):
             ert.checkout(b'HEAD')
             self.assertEqual((p / 'file').read_bytes(), b'test1')
             self.assertEqual((p / 'hidden_file').read_bytes(), HIDDEN_MESSAGE)
             self.assertEqual((p / 'absent_file').read_bytes(), SKIPPED_MESSAGE)
 
     def test_revision_bogus_perms(self):
         # Some early git repositories have 664/775 permissions... let's check
         # if all the weird modes are properly normalized in the revision
         # cooker.
         repo = TestRepo()
         with repo as rp:
             (rp / 'file').write_text(TEST_CONTENT)
             (rp / 'file').chmod(0o664)
             (rp / 'executable').write_bytes(TEST_EXECUTABLE)
             (rp / 'executable').chmod(0o775)
             (rp / 'wat').write_text(TEST_CONTENT)
             (rp / 'wat').chmod(0o604)
             repo.commit('initial commit')
             self.load(str(rp))
             obj_id_hex = repo.repo.refs[b'HEAD'].decode()
             obj_id = hashutil.hash_to_bytes(obj_id_hex)
 
         with self.cook_extract_revision_gitfast(obj_id) as (ert, p):
             ert.checkout(b'HEAD')
             self.assertEqual((p / 'file').stat().st_mode, 0o100644)
             self.assertEqual((p / 'executable').stat().st_mode, 0o100755)
             self.assertEqual((p / 'wat').stat().st_mode, 0o100644)
 
     def test_revision_null_fields(self):
         # Our schema doesn't enforce a lot of non-null revision fields. We need
         # to check these cases don't break the cooker.
         repo = TestRepo()
         with repo as rp:
             (rp / 'file').write_text(TEST_CONTENT)
             c = repo.commit('initial commit')
             self.load(str(rp))
             repo.repo.refs[b'HEAD'].decode()
             dir_id_hex = repo.repo[c].tree.decode()
             dir_id = hashutil.hash_to_bytes(dir_id_hex)
 
         test_id = b'56789012345678901234'
         test_revision = {
             'id': test_id,
             'message': None,
             'author': {'name': None, 'email': None, 'fullname': ''},
             'date': None,
             'committer': {'name': None, 'email': None, 'fullname': ''},
             'committer_date': None,
             'parents': [],
             'type': 'git',
             'directory': dir_id,
             'metadata': {},
             'synthetic': True
         }
 
         self.storage.revision_add([test_revision])
 
         with self.cook_extract_revision_gitfast(test_id) as (ert, p):
             ert.checkout(b'HEAD')
             self.assertEqual((p / 'file').stat().st_mode, 0o100644)
diff --git a/swh/vault/to_disk.py b/swh/vault/to_disk.py
new file mode 100644
index 0000000..3f1c9e4
--- /dev/null
+++ b/swh/vault/to_disk.py
@@ -0,0 +1,110 @@
+# Copyright (C) 2016-2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import itertools
+import os
+
+from swh.model.from_disk import mode_to_perms, DentryPerms
+
+SKIPPED_MESSAGE = (b'This content has not been retrieved in the '
+                   b'Software Heritage archive due to its size.')
+
+HIDDEN_MESSAGE = (b'This content is hidden.')
+
+
+def get_filtered_file_content(storage, file_data):
+    """Retrieve the file specified by file_data and apply filters for skipped
+    and missing contents.
+
+    Args:
+        storage: the storage from which to retrieve the object
+        file_data: file entry descriptor as returned by directory_ls()
+
+    Returns:
+        Bytes containing the specified content. The content will be replaced by
+        a specific message to indicate that the content could not be retrieved
+        (either due to privacy policy or because its size was too big for us to
+        archive it).
+    """
+
+    assert file_data['type'] == 'file'
+
+    if file_data['status'] == 'absent':
+        return SKIPPED_MESSAGE
+    elif file_data['status'] == 'hidden':
+        return HIDDEN_MESSAGE
+    else:
+        return list(storage.content_get([file_data['sha1']]))[0]['data']
+
+
+class DirectoryBuilder:
+    """Reconstructs the on-disk representation of a directory in the storage.
+    """
+
+    def __init__(self, storage, root, dir_id):
+        """Initialize the directory builder.
+
+        Args:
+            storage: the storage object
+            root: the path where the directory should be reconstructed
+            dir_id: the identifier of the directory in the storage
+        """
+        self.storage = storage
+        self.root = root
+        self.dir_id = dir_id
+
+    def build(self):
+        """Perform the reconstruction of the directory in the given root."""
+        # Retrieve data from the database.
+        data = self.storage.directory_ls(self.dir_id, recursive=True)
+
+        # Split into files and directory data.
+        # TODO(seirl): also handle revision data.
+        data1, data2 = itertools.tee(data, 2)
+        dir_data = (entry['name'] for entry in data1 if entry['type'] == 'dir')
+        file_data = (entry for entry in data2 if entry['type'] == 'file')
+
+        # Recreate the directory's subtree and then the files into it.
+        self._create_tree(dir_data)
+        self._create_files(file_data)
+
+    def _create_tree(self, directory_paths):
+        """Create a directory tree from the given paths
+
+        The tree is created from `root` and each given path in
+        `directory_paths` will be created.
+
+        """
+        # Directories are sorted by depth so they are created in the
+        # right order
+        bsep = bytes(os.path.sep, 'utf8')
+        dir_names = sorted(
+            directory_paths,
+            key=lambda x: len(x.split(bsep)))
+        for dir_name in dir_names:
+            os.makedirs(os.path.join(self.root, dir_name))
+
+    def _create_files(self, file_datas):
+        """Create the files according to their status."""
+        # Then create the files
+        for file_data in file_datas:
+            path = os.path.join(self.root, file_data['name'])
+            content = get_filtered_file_content(self.storage, file_data)
+            self._create_file(path, content, file_data['perms'])
+
+    def _create_file(self, path, content, mode=0o100644):
+        """Create the given file and fill it with content."""
+        perms = mode_to_perms(mode)
+        if perms == DentryPerms.symlink:
+            os.symlink(content, path)
+        else:
+            with open(path, 'wb') as f:
+                f.write(content)
+            os.chmod(path, perms.value)
+
+    def _get_file_content(self, obj_id):
+        """Get the content of the given file."""
+        content = list(self.storage.content_get([obj_id]))[0]['data']
+        return content