diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
index ad0d042..d4ec64e 100644
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -1,292 +1,352 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
 from datetime import datetime, timezone
 import os
 from typing import Any, Dict, Generator, Optional, Tuple

 import click
 import iso8601
 import yaml

 from swh.core import config
 from swh.core.cli import CONTEXT_SETTINGS
 from swh.core.cli import swh as swh_cli_group
 from swh.model.hashutil import hash_to_bytes, hash_to_hex
 from swh.model.model import Sha1Git

 # All generic config code should reside in swh.core.config
 CONFIG_ENVVAR = "SWH_CONFIG_FILENAME"
 DEFAULT_PATH = os.environ.get(CONFIG_ENVVAR, None)

 DEFAULT_CONFIG: Dict[str, Any] = {
     "provenance": {
         "archive": {
             # Storage API based Archive object
             # "cls": "api",
             # "storage": {
             #     "cls": "remote",
             #     "url": "http://uffizi.internal.softwareheritage.org:5002",
             # }
             # Direct access Archive object
             "cls": "direct",
             "db": {
                 "host": "belvedere.internal.softwareheritage.org",
                 "port": 5432,
                 "dbname": "softwareheritage",
                 "user": "guest",
             },
         },
         "storage": {
             # Local PostgreSQL Storage
             # "cls": "postgresql",
             # "db": {
             #     "host": "localhost",
             #     "user": "postgres",
             #     "password": "postgres",
             #     "dbname": "provenance",
             # },
             # Local MongoDB Storage
             # "cls": "mongodb",
             # "db": {
             #     "dbname": "provenance",
             # },
             # Remote RabbitMQ/PostgreSQL Storage
             "cls": "rabbitmq",
             "url": "amqp://localhost:5672/%2f",
             "storage_config": {
                 "cls": "postgresql",
                 "db": {
                     "host": "localhost",
                     "user": "postgres",
                     "password": "postgres",
                     "dbname": "provenance",
                 },
             },
             "batch_size": 100,
             "prefetch_count": 100,
         },
     }
 }


 CONFIG_FILE_HELP = f"""
 \b
 Configuration can be loaded from a yaml file given either as --config-file
 option or the {CONFIG_ENVVAR} environment variable. If no configuration file
 is specified, use the following default configuration::

 \b
 {yaml.dump(DEFAULT_CONFIG)}"""
 PROVENANCE_HELP = f"""Software Heritage provenance index database tools

 {CONFIG_FILE_HELP}
 """


 @swh_cli_group.group(
     name="provenance", context_settings=CONTEXT_SETTINGS, help=PROVENANCE_HELP
 )
 @click.option(
     "-C",
     "--config-file",
     default=None,
     type=click.Path(exists=True, dir_okay=False, path_type=str),
     help="""YAML configuration file.""",
 )
 @click.option(
     "-P",
     "--profile",
     default=None,
     type=click.Path(exists=False, dir_okay=False, path_type=str),
     help="""Enable profiling to specified file.""",
 )
 @click.pass_context
 def cli(ctx: click.core.Context, config_file: Optional[str], profile: str) -> None:
     if (
         config_file is None
         and DEFAULT_PATH is not None
         and config.config_exists(DEFAULT_PATH)
     ):
         config_file = DEFAULT_PATH

     if config_file is None:
         conf = DEFAULT_CONFIG
     else:
         # read_raw_config does not fail on ENOENT
         if not os.path.exists(config_file):
             raise FileNotFoundError(config_file)
         conf = yaml.safe_load(open(config_file, "rb"))

     ctx.ensure_object(dict)
     ctx.obj["config"] = conf

     if profile:
         import atexit
         import cProfile

         print("Profiling...")
         pr = cProfile.Profile()
         pr.enable()

         def exit() -> None:
             pr.disable()
             pr.dump_stats(profile)

         atexit.register(exit)


 @cli.command(name="iter-frontiers")
 @click.argument("filename")
-@click.option("-l", "--limit", type=int)
-@click.option("-s", "--min-size", default=0, type=int)
+@click.option(
+    "-l",
+    "--limit",
+    type=int,
+    help="""Limit the number of entries (directories) to read from the input file.""",
+)
+@click.option(
+    "-s",
+    "--min-size",
+    default=0,
+    type=int,
+    help="""Set the minimum size (in bytes) of files to be indexed. """
+    """Any smaller file will be ignored.""",
+)
 @click.pass_context
 def iter_frontiers(
     ctx: click.core.Context,
     filename: str,
     limit: Optional[int],
     min_size: int,
 ) -> None:
     """Process a provided list of directories in the isochrone frontier."""
     from . import get_archive, get_provenance
     from .directory import CSVDirectoryIterator, directory_add

     archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
     directories_provider = generate_directory_ids(filename)
     directories = CSVDirectoryIterator(directories_provider, limit=limit)

     with get_provenance(**ctx.obj["config"]["provenance"]["storage"]) as provenance:
         for directory in directories:
             directory_add(
                 provenance,
                 archive,
                 [directory],
                 minsize=min_size,
             )


 def generate_directory_ids(
     filename: str,
 ) -> Generator[Sha1Git, None, None]:
     for line in open(filename, "r"):
         if line.strip():
             yield hash_to_bytes(line.strip())


 @cli.command(name="iter-revisions")
 @click.argument("filename")
-@click.option("-a", "--track-all", default=True, type=bool)
-@click.option("-l", "--limit", type=int)
-@click.option("-m", "--min-depth", default=1, type=int)
-@click.option("-r", "--reuse", default=True, type=bool)
-@click.option("-s", "--min-size", default=0, type=int)
+@click.option(
+    "-a",
+    "--track-all",
+    default=True,
+    type=bool,
+    help="""Index all occurrences of files in the development history.""",
+)
+@click.option(
+    "-f",
+    "--flatten",
+    default=True,
+    type=bool,
+    help="""Create flat models for directories in the isochrone frontier.""",
+)
+@click.option(
+    "-l",
+    "--limit",
+    type=int,
+    help="""Limit the number of entries (revisions) to read from the input file.""",
+)
+@click.option(
+    "-m",
+    "--min-depth",
+    default=1,
+    type=int,
+    help="""Set the minimum depth (in the directory tree) at which an isochrone """
+    """frontier can be defined.""",
+)
+@click.option(
+    "-r",
+    "--reuse",
+    default=True,
+    type=bool,
+    help="""Prioritize the reuse of previously defined isochrone frontiers """
+    """whenever possible.""",
+)
+@click.option(
+    "-s",
+    "--min-size",
+    default=0,
+    type=int,
+    help="""Set the minimum size (in bytes) of files to be indexed. """
+    """Any smaller file will be ignored.""",
+)
 @click.pass_context
 def iter_revisions(
     ctx: click.core.Context,
     filename: str,
     track_all: bool,
+    flatten: bool,
     limit: Optional[int],
     min_depth: int,
     reuse: bool,
     min_size: int,
 ) -> None:
     """Process a provided list of revisions."""
     from . import get_archive, get_provenance
     from .revision import CSVRevisionIterator, revision_add

     archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
     revisions_provider = generate_revision_tuples(filename)
     revisions = CSVRevisionIterator(revisions_provider, limit=limit)

     with get_provenance(**ctx.obj["config"]["provenance"]["storage"]) as provenance:
         for revision in revisions:
             revision_add(
                 provenance,
                 archive,
                 [revision],
                 trackall=track_all,
+                flatten=flatten,
                 lower=reuse,
                 mindepth=min_depth,
                 minsize=min_size,
             )


 def generate_revision_tuples(
     filename: str,
 ) -> Generator[Tuple[Sha1Git, datetime, Sha1Git], None, None]:
     for line in open(filename, "r"):
         if line.strip():
             revision, date, root = line.strip().split(",")
             yield (
                 hash_to_bytes(revision),
                 iso8601.parse_date(date, default_timezone=timezone.utc),
                 hash_to_bytes(root),
             )


 @cli.command(name="iter-origins")
 @click.argument("filename")
-@click.option("-l", "--limit", type=int)
+@click.option(
+    "-l",
+    "--limit",
+    type=int,
+    help="""Limit the number of entries (origins) to read from the input file.""",
+)
 @click.pass_context
 def iter_origins(ctx: click.core.Context, filename: str, limit: Optional[int]) -> None:
     """Process a provided list of origins."""
     from . import get_archive, get_provenance
     from .origin import CSVOriginIterator, origin_add

     archive = get_archive(**ctx.obj["config"]["provenance"]["archive"])
     origins_provider = generate_origin_tuples(filename)
     origins = CSVOriginIterator(origins_provider, limit=limit)

     with get_provenance(**ctx.obj["config"]["provenance"]["storage"]) as provenance:
         for origin in origins:
             origin_add(provenance, archive, [origin])


 def generate_origin_tuples(filename: str) -> Generator[Tuple[str, bytes], None, None]:
     for line in open(filename, "r"):
         if line.strip():
             url, snapshot = line.strip().split(",")
             yield (url, hash_to_bytes(snapshot))


 @cli.command(name="find-first")
 @click.argument("swhid")
 @click.pass_context
 def find_first(ctx: click.core.Context, swhid: str) -> None:
     """Find first occurrence of the requested blob."""
     from . import get_provenance

     with get_provenance(**ctx.obj["config"]["provenance"]["storage"]) as provenance:
         occur = provenance.content_find_first(hash_to_bytes(swhid))
         if occur is not None:
             print(
                 f"swh:1:cnt:{hash_to_hex(occur.content)}, "
                 f"swh:1:rev:{hash_to_hex(occur.revision)}, "
                 f"{occur.date}, "
                 f"{occur.origin}, "
                 f"{os.fsdecode(occur.path)}"
             )
         else:
             print(f"Cannot find a content with the id {swhid}")


 @cli.command(name="find-all")
 @click.argument("swhid")
-@click.option("-l", "--limit", type=int)
+@click.option(
+    "-l", "--limit", type=int, help="""Limit the number of results to be retrieved."""
+)
 @click.pass_context
 def find_all(ctx: click.core.Context, swhid: str, limit: Optional[int]) -> None:
     """Find all occurrences of the requested blob."""
     from . import get_provenance

     with get_provenance(**ctx.obj["config"]["provenance"]["storage"]) as provenance:
         for occur in provenance.content_find_all(hash_to_bytes(swhid), limit=limit):
             print(
                 f"swh:1:cnt:{hash_to_hex(occur.content)}, "
                 f"swh:1:rev:{hash_to_hex(occur.revision)}, "
                 f"{occur.date}, "
                 f"{occur.origin}, "
                 f"{os.fsdecode(occur.path)}"
             )
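Reviewer note (not part of the patch): the new --flatten flag simply forwards to revision_add(). Below is a minimal, hypothetical sketch of the two-phase indexing it enables, using only names introduced in this diff; the config dict is a pared-down stand-in for DEFAULT_CONFIG above, and the CSV path is a placeholder. Phase 1 records the revision without flattening frontier directories; phase 2 flattens them separately with directory_add, the same pattern the updated test below exercises.

    from swh.provenance import get_archive, get_provenance
    from swh.provenance.cli import generate_revision_tuples
    from swh.provenance.directory import directory_add
    from swh.provenance.interface import EntityType
    from swh.provenance.model import DirectoryEntry
    from swh.provenance.revision import CSVRevisionIterator, revision_add

    cfg = {  # assumed local setup; see DEFAULT_CONFIG for the full shape
        "archive": {
            "cls": "direct",
            "db": {"host": "localhost", "dbname": "softwareheritage", "user": "guest"},
        },
        "storage": {
            "cls": "postgresql",
            "db": {
                "host": "localhost",
                "dbname": "provenance",
                "user": "postgres",
                "password": "postgres",
            },
        },
    }

    archive = get_archive(**cfg["archive"])
    with get_provenance(**cfg["storage"]) as provenance:
        for revision in CSVRevisionIterator(generate_revision_tuples("revisions.csv")):
            before = provenance.storage.entity_get_all(EntityType.DIRECTORY)
            # Phase 1: index the revision but leave frontier directories unflattened.
            revision_add(provenance, archive, [revision], flatten=False)
            # Phase 2: flatten only the directories this revision added to the frontier.
            new_dirs = [
                DirectoryEntry(id=sha1)
                for sha1 in provenance.storage.entity_get_all(
                    EntityType.DIRECTORY
                ).difference(before)
            ]
            directory_add(provenance, archive, new_dirs)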
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
index 2b578e1..d77c949 100644
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -1,224 +1,228 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone
 from typing import Generator, Iterable, Iterator, List, Optional, Tuple

 from swh.core.statsd import statsd
 from swh.model.model import Sha1Git

 from .archive import ArchiveInterface
 from .directory import directory_flatten
 from .graph import IsochroneNode, build_isochrone_graph
 from .interface import ProvenanceInterface
 from .model import DirectoryEntry, RevisionEntry

 REVISION_DURATION_METRIC = "swh_provenance_revision_content_layer_duration_seconds"


 class CSVRevisionIterator:
     """Iterator over revisions typically present in the given CSV file.

     The input is an iterator that produces 3 elements per row:

       (id, date, root)

     where:
     - id: is the id (sha1_git) of the revision
     - date: is the author date
     - root: sha1 of the directory
     """

     def __init__(
         self,
         revisions: Iterable[Tuple[Sha1Git, datetime, Sha1Git]],
         limit: Optional[int] = None,
     ) -> None:
         self.revisions: Iterator[Tuple[Sha1Git, datetime, Sha1Git]]
         if limit is not None:
             from itertools import islice

             self.revisions = islice(revisions, limit)
         else:
             self.revisions = iter(revisions)

     def __iter__(self) -> Generator[RevisionEntry, None, None]:
         for id, date, root in self.revisions:
             if date.tzinfo is None:
                 date = date.replace(tzinfo=timezone.utc)
             yield RevisionEntry(id, date=date, root=root)


 @statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "main"})
 def revision_add(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     revisions: List[RevisionEntry],
     trackall: bool = True,
+    flatten: bool = True,
     lower: bool = True,
     mindepth: int = 1,
     minsize: int = 0,
     commit: bool = True,
 ) -> None:
     for revision in revisions:
         assert revision.date is not None
         assert revision.root is not None
         # Process content starting from the revision's root directory.
         date = provenance.revision_get_date(revision)
         if date is None or revision.date < date:
             graph = build_isochrone_graph(
                 provenance,
                 archive,
                 revision,
                 DirectoryEntry(revision.root),
                 minsize=minsize,
             )
             revision_process_content(
                 provenance,
                 archive,
                 revision,
                 graph,
                 trackall=trackall,
+                flatten=flatten,
                 lower=lower,
                 mindepth=mindepth,
                 minsize=minsize,
             )
     if commit:
         provenance.flush()


 @statsd.timed(metric=REVISION_DURATION_METRIC, tags={"method": "process_content"})
 def revision_process_content(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     revision: RevisionEntry,
     graph: IsochroneNode,
     trackall: bool = True,
+    flatten: bool = True,
     lower: bool = True,
     mindepth: int = 1,
     minsize: int = 0,
 ) -> None:
     assert revision.date is not None
     provenance.revision_add(revision)

     stack = [graph]
     while stack:
         current = stack.pop()
         if current.dbdate is not None:
             assert current.dbdate <= revision.date
             if trackall:
                 # Current directory is an outer isochrone frontier for a previously
                 # processed revision. It should be reused as is.
                 provenance.directory_add_to_revision(
                     revision, current.entry, current.path
                 )
         else:
             assert current.maxdate is not None
             # Current directory is not an outer isochrone frontier for any previous
             # revision. It might be eligible for this one.
             if is_new_frontier(
                 current,
                 revision=revision,
                 trackall=trackall,
                 lower=lower,
                 mindepth=mindepth,
             ):
                 # Outer frontier should be moved to current position in the isochrone
                 # graph. This is the first time this directory is found in the
                 # isochrone frontier.
                 provenance.directory_set_date_in_isochrone_frontier(
                     current.entry, current.maxdate
                 )
                 if trackall:
                     provenance.directory_add_to_revision(
                         revision, current.entry, current.path
                     )
-                    directory_flatten(
-                        provenance, archive, current.entry, minsize=minsize
-                    )
+                    if flatten:
+                        directory_flatten(
+                            provenance, archive, current.entry, minsize=minsize
+                        )
             else:
                 # If current node is an invalidated frontier, update its date for
                 # future revisions to get the proper value.
                 if current.invalid:
                     provenance.directory_set_date_in_isochrone_frontier(
                         current.entry, current.maxdate
                     )
                 # No point moving the frontier here. Either there are no files or they
                 # are being seen for the first time here. Add all blobs to current
                 # revision, updating dates if necessary, and recursively analyse
                 # subdirectories as candidates to the outer frontier.
                 for blob in current.entry.files:
                     date = provenance.content_get_early_date(blob)
                     if date is None or revision.date < date:
                         provenance.content_set_early_date(blob, revision.date)
                     provenance.content_add_to_revision(revision, blob, current.path)
                 for child in current.children:
                     stack.append(child)


 def is_new_frontier(
     node: IsochroneNode,
     revision: RevisionEntry,
     trackall: bool = True,
     lower: bool = True,
     mindepth: int = 1,
 ) -> bool:
     assert node.maxdate is not None  # for mypy
     assert revision.date is not None  # idem
     if trackall:
         # The only real condition for a directory to be a frontier is that its content
         # is already known and its maxdate is less than (or equal to) the current
         # revision's date. Checking mindepth is meant to skip root directories (or any
         # arbitrary depth) to improve the result. The option lower tries to maximize
         # the reuse rate of previously defined frontiers by keeping them low in the
         # directory tree.
         return (
             node.known
             and node.maxdate <= revision.date  # all content is earlier than revision
             and node.depth >= mindepth  # current node is deeper than the min allowed depth
             and (has_blobs(node) if lower else True)  # there is at least one blob in it
         )
     else:
         # If we are only tracking first occurrences, we want to ensure that all first
         # occurrences end up in the content_early_in_rev relation. Thus, we force
         # every blob outside a frontier to have a strictly earlier date.
         return (
             node.maxdate < revision.date  # all content is earlier than revision
             and node.depth >= mindepth  # deeper than the min allowed depth
             and (has_blobs(node) if lower else True)  # there is at least one blob
         )


 def has_blobs(node: IsochroneNode) -> bool:
     # We may want to look for files in different ways to decide whether to define a
     # frontier or not:
     # 1. Only files in current node:
     return any(node.entry.files)
     # 2. Files anywhere in the isochrone graph
     # stack = [node]
     # while stack:
     #     current = stack.pop()
     #     if any(
     #         map(lambda child: isinstance(child.entry, FileEntry), current.children)):
     #         return True
     #     else:
     #         # All children are directory entries.
     #         stack.extend(current.children)
     # return False
     # 3. Files in the intermediate directories between current node and any previously
     #    defined frontier:
     # TODO: complete this case!
     # return any(
     #     map(lambda child: isinstance(child.entry, FileEntry), node.children)
     # ) or all(
     #     map(
     #         lambda child: (
     #             not (isinstance(child.entry, DirectoryEntry) and child.date is None)
     #         )
     #         or has_blobs(child),
     #         node.children,
     #     )
     # )
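Reviewer note (not part of the patch): the two branches of is_new_frontier() differ only in the date comparison. The following standalone toy restatement, illustrative only, uses a simplified Node class standing in for IsochroneNode, keeping just the fields the predicate reads:

    from dataclasses import dataclass
    from datetime import datetime, timezone

    @dataclass
    class Node:
        known: bool        # content of this directory was already indexed
        maxdate: datetime  # latest timestamp among the directory's content
        depth: int         # depth in the revision's directory tree
        has_blobs: bool    # directory directly contains at least one file

    def is_new_frontier_toy(
        node: Node,
        revision_date: datetime,
        trackall: bool = True,
        lower: bool = True,
        mindepth: int = 1,
    ) -> bool:
        if trackall:
            # frontier content may share the revision's date
            dates_ok = node.known and node.maxdate <= revision_date
        else:
            # first-occurrences-only mode: strictly earlier, so a blob first seen
            # in this revision is never hidden behind a frontier
            dates_ok = node.maxdate < revision_date
        return dates_ok and node.depth >= mindepth and (node.has_blobs if lower else True)

    # A known directory at depth 2 whose content predates the revision qualifies:
    node = Node(known=True, maxdate=datetime(2020, 1, 1, tzinfo=timezone.utc), depth=2, has_blobs=True)
    assert is_new_frontier_toy(node, datetime(2021, 1, 1, tzinfo=timezone.utc))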
diff --git a/swh/provenance/tests/test_revision_content_layer.py b/swh/provenance/tests/test_revision_content_layer.py
index 6b81a37..01ef380 100644
--- a/swh/provenance/tests/test_revision_content_layer.py
+++ b/swh/provenance/tests/test_revision_content_layer.py
@@ -1,453 +1,481 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import re
 from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple

 import pytest
 from typing_extensions import TypedDict

 from swh.model.hashutil import hash_to_bytes
 from swh.model.model import Sha1Git
 from swh.provenance.archive import ArchiveInterface
+from swh.provenance.directory import directory_add
 from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType
-from swh.provenance.model import RevisionEntry
+from swh.provenance.model import DirectoryEntry, RevisionEntry
 from swh.provenance.revision import revision_add
 from swh.provenance.tests.conftest import (
     fill_storage,
     get_datafile,
     load_repo_data,
     ts2dt,
 )


 class SynthRelation(TypedDict):
     prefix: Optional[str]
     path: str
     src: Sha1Git
     dst: Sha1Git
     rel_ts: float


 class SynthRevision(TypedDict):
     sha1: Sha1Git
     date: float
     msg: str
     R_C: List[SynthRelation]
     R_D: List[SynthRelation]
     D_C: List[SynthRelation]


 def synthetic_revision_content_result(filename: str) -> Iterator[SynthRevision]:
     """Generates dict representations of synthetic revisions found in the synthetic
     file (from the data/ directory) given as argument of the generator.

     Generated SynthRevision (typed dict) with the following elements:

         "sha1": (Sha1Git) sha1 of the revision,
         "date": (float) timestamp of the revision,
         "msg": (str) commit message of the revision,
         "R_C": (list) new R---C relations added by this revision
         "R_D": (list) new R-D relations added by this revision
         "D_C": (list) new D-C relations added by this revision

     Each relation above is a SynthRelation typed dict with:

         "path": (str) location
         "src": (Sha1Git) sha1 of the source of the relation
         "dst": (Sha1Git) sha1 of the destination of the relation
         "rel_ts": (float) timestamp of the target of the relation
             (related to the timestamp of the revision)

     """
     with open(get_datafile(filename), "r") as fobj:
         yield from _parse_synthetic_revision_content_file(fobj)


 def _parse_synthetic_revision_content_file(
     fobj: Iterable[str],
 ) -> Iterator[SynthRevision]:
     """Read a 'synthetic' file and generate a dict representation of the synthetic
     revision for each revision listed in the synthetic file.
     """
     regs = [
         "(?P<revname>R[0-9]{2,4})?",
         "(?P<reltype>[^| ]*)",
         "([+] )?(?P<path>[^| +]*?)[/]?",
         "(?P<type>[RDC]) (?P<sha1>[0-9a-f]{40})",
         "(?P<ts>-?[0-9]+(.[0-9]+)?)",
     ]
     regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$")
     current_rev: List[dict] = []
     for m in (regex.match(line) for line in fobj):
         if m:
             d = m.groupdict()
             if d["revname"]:
                 if current_rev:
                     yield _mk_synth_rev(current_rev)
                 current_rev.clear()
             current_rev.append(d)
     if current_rev:
         yield _mk_synth_rev(current_rev)


 def _mk_synth_rev(synth_rev: List[Dict[str, str]]) -> SynthRevision:
     assert synth_rev[0]["type"] == "R"
     rev = SynthRevision(
         sha1=hash_to_bytes(synth_rev[0]["sha1"]),
         date=float(synth_rev[0]["ts"]),
         msg=synth_rev[0]["revname"],
         R_C=[],
         R_D=[],
         D_C=[],
     )
     current_path = None
     # path of the last R-D relation we parsed, used as prefix for next D-C
     # relations
     for row in synth_rev[1:]:
         if row["reltype"] == "R---C":
             assert row["type"] == "C"
             rev["R_C"].append(
                 SynthRelation(
                     prefix=None,
                     path=row["path"],
                     src=rev["sha1"],
                     dst=hash_to_bytes(row["sha1"]),
                     rel_ts=float(row["ts"]),
                 )
             )
             current_path = None
         elif row["reltype"] == "R-D":
             assert row["type"] == "D"
             rev["R_D"].append(
                 SynthRelation(
                     prefix=None,
                     path=row["path"],
                     src=rev["sha1"],
                     dst=hash_to_bytes(row["sha1"]),
                     rel_ts=float(row["ts"]),
                 )
             )
             current_path = row["path"]
         elif row["reltype"] == "D-C":
             assert row["type"] == "C"
             rev["D_C"].append(
                 SynthRelation(
                     prefix=current_path,
                     path=row["path"],
                     src=rev["R_D"][-1]["dst"],
                     dst=hash_to_bytes(row["sha1"]),
                     rel_ts=float(row["ts"]),
                 )
             )
     return rev


 @pytest.mark.parametrize(
-    "repo, lower, mindepth",
+    "repo, lower, mindepth, flatten",
     (
-        ("cmdbts2", True, 1),
-        ("cmdbts2", False, 1),
-        ("cmdbts2", True, 2),
-        ("cmdbts2", False, 2),
-        ("out-of-order", True, 1),
+        ("cmdbts2", True, 1, True),
+        ("cmdbts2", True, 1, False),
+        ("cmdbts2", False, 1, True),
+        ("cmdbts2", False, 1, False),
+        ("cmdbts2", True, 2, True),
+        ("cmdbts2", True, 2, False),
+        ("cmdbts2", False, 2, True),
+        ("cmdbts2", False, 2, False),
+        ("out-of-order", True, 1, True),
+        ("out-of-order", True, 1, False),
     ),
 )
 def test_revision_content_result(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     repo: str,
     lower: bool,
     mindepth: int,
+    flatten: bool,
 ) -> None:
     # read data/README.md for more details on how these datasets are generated
     data = load_repo_data(repo)
     fill_storage(archive.storage, data)
     syntheticfile = get_datafile(
         f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
     )

     revisions = {rev["id"]: rev for rev in data["revision"]}

     rows: Dict[str, Set[Any]] = {
         "content": set(),
         "content_in_directory": set(),
         "content_in_revision": set(),
         "directory": set(),
         "directory_in_revision": set(),
         "location": set(),
         "revision": set(),
     }

     def maybe_path(path: str) -> Optional[bytes]:
         if provenance.storage.with_path():
             return path.encode("utf-8")
         return None

     for synth_rev in synthetic_revision_content_result(syntheticfile):
         revision = revisions[synth_rev["sha1"]]
         entry = RevisionEntry(
             id=revision["id"],
             date=ts2dt(revision["date"]),
             root=revision["directory"],
         )
-        revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)
+
+        if flatten:
+            revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth)
+        else:
+            prev_directories = provenance.storage.entity_get_all(EntityType.DIRECTORY)
+            revision_add(
+                provenance,
+                archive,
+                [entry],
+                lower=lower,
+                mindepth=mindepth,
+                flatten=False,
+            )
+            directories = [
+                DirectoryEntry(id=sha1)
+                for sha1 in provenance.storage.entity_get_all(
+                    EntityType.DIRECTORY
+                ).difference(prev_directories)
+            ]
+            for directory in directories:
+                assert not provenance.directory_already_flattenned(directory)
+            directory_add(provenance, archive, directories)

         # each "entry" in the synth file is one new revision
         rows["revision"].add(synth_rev["sha1"])
         assert rows["revision"] == provenance.storage.entity_get_all(
             EntityType.REVISION
         ), synth_rev["msg"]
         # check the timestamp of the revision
         rev_ts = synth_rev["date"]
         rev_data = provenance.storage.revision_get([synth_rev["sha1"]])[
             synth_rev["sha1"]
         ]
         assert (
             rev_data.date is not None and rev_ts == rev_data.date.timestamp()
         ), synth_rev["msg"]

         # this revision might have added new content objects
         rows["content"] |= set(x["dst"] for x in synth_rev["R_C"])
         rows["content"] |= set(x["dst"] for x in synth_rev["D_C"])
         assert rows["content"] == provenance.storage.entity_get_all(
             EntityType.CONTENT
         ), synth_rev["msg"]

         # check for R-C (direct) entries
         # these are added directly in the content_early_in_rev table
         rows["content_in_revision"] |= set(
             (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_C"]
         )
         assert rows["content_in_revision"] == {
             (src, rel.dst, rel.path)
             for src, rels in provenance.storage.relation_get_all(
                 RelationType.CNT_EARLY_IN_REV
             ).items()
             for rel in rels
         }, synth_rev["msg"]
         # check timestamps
         for rc in synth_rev["R_C"]:
             assert (
                 rev_ts + rc["rel_ts"]
                 == provenance.storage.content_get([rc["dst"]])[rc["dst"]].timestamp()
             ), synth_rev["msg"]

         # check directories
         # each directory stored in the provenance index is an entry
         # in the "directory" table...
         rows["directory"] |= set(x["dst"] for x in synth_rev["R_D"])
         assert rows["directory"] == provenance.storage.entity_get_all(
             EntityType.DIRECTORY
         ), synth_rev["msg"]

         # ... + a number of rows in the "directory_in_rev" table...
         # check for R-D entries
         rows["directory_in_revision"] |= set(
             (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_D"]
         )
         assert rows["directory_in_revision"] == {
             (src, rel.dst, rel.path)
             for src, rels in provenance.storage.relation_get_all(
                 RelationType.DIR_IN_REV
             ).items()
             for rel in rels
         }, synth_rev["msg"]
         # check timestamps
         for rd in synth_rev["R_D"]:
             dir_data = provenance.storage.directory_get([rd["dst"]])[rd["dst"]]
             assert rev_ts + rd["rel_ts"] == dir_data.date.timestamp(), synth_rev["msg"]
             assert dir_data.flat, synth_rev["msg"]

         # ... + a number of rows in the "content_in_dir" table
         # for content of the directory.
         # check for D-C entries
         rows["content_in_directory"] |= set(
             (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["D_C"]
         )
         assert rows["content_in_directory"] == {
             (src, rel.dst, rel.path)
             for src, rels in provenance.storage.relation_get_all(
                 RelationType.CNT_IN_DIR
             ).items()
             for rel in rels
         }, synth_rev["msg"]
         # check timestamps
         for dc in synth_rev["D_C"]:
             assert (
                 rev_ts + dc["rel_ts"]
                 == provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp()
             ), synth_rev["msg"]

         if provenance.storage.with_path():
             # check for location entries
             rows["location"] |= set(x["path"].encode() for x in synth_rev["R_C"])
             rows["location"] |= set(x["path"].encode() for x in synth_rev["D_C"])
             rows["location"] |= set(x["path"].encode() for x in synth_rev["R_D"])
             assert rows["location"] == provenance.storage.location_get_all(), synth_rev[
                 "msg"
             ]


 @pytest.mark.parametrize(
     "repo, lower, mindepth",
     (
         ("cmdbts2", True, 1),
         ("cmdbts2", False, 1),
         ("cmdbts2", True, 2),
         ("cmdbts2", False, 2),
         ("out-of-order", True, 1),
     ),
 )
 @pytest.mark.parametrize("batch", (True, False))
 def test_provenance_heuristics_content_find_all(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     repo: str,
     lower: bool,
     mindepth: int,
     batch: bool,
 ) -> None:
     # read data/README.md for more details on how these datasets are generated
     data = load_repo_data(repo)
     fill_storage(archive.storage, data)
     revisions = [
         RevisionEntry(
             id=revision["id"],
             date=ts2dt(revision["date"]),
             root=revision["directory"],
         )
         for revision in data["revision"]
     ]

     def maybe_path(path: str) -> str:
         if provenance.storage.with_path():
             return path
         return ""

     if batch:
         revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
     else:
         for revision in revisions:
             revision_add(
                 provenance, archive, [revision], lower=lower, mindepth=mindepth
             )

     syntheticfile = get_datafile(
         f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
     )
     expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {}
     for synth_rev in synthetic_revision_content_result(syntheticfile):
         rev_id = synth_rev["sha1"].hex()
         rev_ts = synth_rev["date"]

         for rc in synth_rev["R_C"]:
             expected_occurrences.setdefault(rc["dst"].hex(), []).append(
                 (rev_id, rev_ts, None, maybe_path(rc["path"]))
             )
         for dc in synth_rev["D_C"]:
             assert dc["prefix"] is not None  # to please mypy
             expected_occurrences.setdefault(dc["dst"].hex(), []).append(
                 (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"]))
             )

     for content_id, results in expected_occurrences.items():
         expected = [(content_id, *result) for result in results]
         db_occurrences = [
             (
                 occur.content.hex(),
                 occur.revision.hex(),
                 occur.date.timestamp(),
                 occur.origin,
                 occur.path.decode(),
             )
             for occur in provenance.content_find_all(hash_to_bytes(content_id))
         ]
         if provenance.storage.with_path():
             # this is not true if the db stores no path, because a same content
             # that appears several times in a given revision may be reported
             # only once by content_find_all()
             assert len(db_occurrences) == len(expected)
         assert set(db_occurrences) == set(expected)


 @pytest.mark.parametrize(
     "repo, lower, mindepth",
     (
         ("cmdbts2", True, 1),
         ("cmdbts2", False, 1),
         ("cmdbts2", True, 2),
         ("cmdbts2", False, 2),
         ("out-of-order", True, 1),
     ),
 )
 @pytest.mark.parametrize("batch", (True, False))
 def test_provenance_heuristics_content_find_first(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     repo: str,
     lower: bool,
     mindepth: int,
     batch: bool,
 ) -> None:
     # read data/README.md for more details on how these datasets are generated
     data = load_repo_data(repo)
     fill_storage(archive.storage, data)
     revisions = [
         RevisionEntry(
             id=revision["id"],
             date=ts2dt(revision["date"]),
             root=revision["directory"],
         )
         for revision in data["revision"]
     ]

     if batch:
         revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth)
     else:
         for revision in revisions:
             revision_add(
                 provenance, archive, [revision], lower=lower, mindepth=mindepth
             )

     syntheticfile = get_datafile(
         f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt"
     )
     expected_first: Dict[str, Tuple[str, float, List[str]]] = {}
     # dict of tuples (blob_id, rev_id, [path, ...]) the third element for path
     # is a list because a content can be added at several places in a single
     # revision, in which case the result of content_find_first() is one of
     # those paths, but we have no guarantee which one it will return.
     for synth_rev in synthetic_revision_content_result(syntheticfile):
         rev_id = synth_rev["sha1"].hex()
         rev_ts = synth_rev["date"]

         for rc in synth_rev["R_C"]:
             sha1 = rc["dst"].hex()
             if sha1 not in expected_first:
                 assert rc["rel_ts"] == 0
                 expected_first[sha1] = (rev_id, rev_ts, [rc["path"]])
             else:
                 if rev_ts == expected_first[sha1][1]:
                     expected_first[sha1][2].append(rc["path"])
                 elif rev_ts < expected_first[sha1][1]:
                     expected_first[sha1] = (rev_id, rev_ts, [rc["path"]])

         for dc in synth_rev["D_C"]:
             sha1 = dc["dst"].hex()
             assert sha1 in expected_first
             # nothing to do there, this content cannot be a "first seen file"

     for content_id, (rev_id, ts, paths) in expected_first.items():
         occur = provenance.content_find_first(hash_to_bytes(content_id))
         assert occur is not None
         assert occur.content.hex() == content_id
         assert occur.revision.hex() == rev_id
         assert occur.date.timestamp() == ts
         assert occur.origin is None
         if provenance.storage.with_path():
             assert occur.path.decode() in paths
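Reviewer note (not part of the patch): for completeness, a hedged sketch of querying the index these tests build, mirroring the find-first and find-all commands in cli.py. Here `provenance` is assumed to be an open ProvenanceInterface as in the fixtures above, and the blob hash is a placeholder:

    import os

    from swh.model.hashutil import hash_to_bytes, hash_to_hex

    blob = hash_to_bytes("aa" * 20)  # hypothetical content sha1_git

    occur = provenance.content_find_first(blob)
    if occur is not None:
        print(
            f"swh:1:cnt:{hash_to_hex(occur.content)} first seen in "
            f"swh:1:rev:{hash_to_hex(occur.revision)} at {occur.date}"
        )

    for occur in provenance.content_find_all(blob, limit=10):
        print(occur.date, os.fsdecode(occur.path))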