diff --git a/setup.py b/setup.py index 9718259..23bfc8b 100755 --- a/setup.py +++ b/setup.py @@ -1,74 +1,74 @@ #!/usr/bin/env python3 # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from setuptools import setup, find_packages - -from os import path from io import open +from os import path + +from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements # Edit this part to match your module. # Full sample: # https://forge.softwareheritage.org/diffusion/DCORE/browse/master/setup.py setup( name="swh.provenance", description="Software Heritage code provenance", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/222/", packages=find_packages(), # packages's modules install_requires=parse_requirements() + parse_requirements("swh"), tests_require=parse_requirements("test"), setup_requires=["setuptools-scm"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, entry_points=""" [swh.cli.subcommands] provenance=swh.provenance.cli """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 3 - Alpha", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-provenance", "Documentation": "https://docs.softwareheritage.org/devel/swh-provenance/", }, ) diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py index 7fe778d..cfb764b 100644 --- a/swh/provenance/__init__.py +++ b/swh/provenance/__init__.py @@ -1,28 +1,28 @@ from .archive import ArchiveInterface -from .provenance import ProvenanceInterface -from .storage.archive import ArchiveStorage from .postgresql.archive import ArchivePostgreSQL from .postgresql.db_utils import connect from .postgresql.provenance import ProvenancePostgreSQL from .postgresql_nopath.provenance import ProvenancePostgreSQLNoPath +from .provenance import ProvenanceInterface +from .storage.archive import ArchiveStorage def get_archive(cls: str, **kwargs) -> ArchiveInterface: if cls == "api": return ArchiveStorage(**kwargs["storage"]) elif cls == "ps": conn = connect(kwargs["db"]) return ArchivePostgreSQL(conn) else: raise NotImplementedError def get_provenance(cls: str, **kwargs) -> ProvenanceInterface: if cls == "ps": conn = connect(kwargs["db"]) return ProvenancePostgreSQL(conn) elif cls == "ps_np": conn = connect(kwargs["db"]) return ProvenancePostgreSQLNoPath(conn) else: raise NotImplementedError diff --git a/swh/provenance/cli.py 
b/swh/provenance/cli.py index daafe4f..acaaa15 100644 --- a/swh/provenance/cli.py +++ b/swh/provenance/cli.py @@ -1,210 +1,210 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control import os from typing import Any, Dict, Optional import click import yaml from swh.core import config from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group from swh.model.hashutil import hash_to_bytes, hash_to_hex # All generic config code should reside in swh.core.config CONFIG_ENVVAR = "SWH_CONFIG_FILE" DEFAULT_CONFIG_PATH = os.path.join(click.get_app_dir("swh"), "global.yml") DEFAULT_PATH = os.environ.get(CONFIG_ENVVAR, DEFAULT_CONFIG_PATH) DEFAULT_CONFIG: Dict[str, Any] = { "archive": { "cls": "api", "storage": { "cls": "remote", "url": "http://uffizi.internal.softwareheritage.org:5002", } # "cls": "ps", # "db": { # "host": "db.internal.softwareheritage.org", # "dbname": "softwareheritage", # "user": "guest" # } }, "provenance": {"cls": "ps", "db": {"host": "localhost", "dbname": "provenance"}}, } CONFIG_FILE_HELP = f"""Configuration file: \b The CLI option or the environment variable will fail if invalid. CLI option is checked first. Then, environment variable {CONFIG_ENVVAR} is checked. Then, if cannot load the default path, a set of default values are used. Default config path is {DEFAULT_CONFIG_PATH}. Default config values are: \b {yaml.dump(DEFAULT_CONFIG)}""" PROVENANCE_HELP = f"""Software Heritage Scanner tools. {CONFIG_FILE_HELP}""" @swh_cli_group.group( name="provenance", context_settings=CONTEXT_SETTINGS, help=PROVENANCE_HELP ) @click.option( "-C", "--config-file", default=None, type=click.Path(exists=False, dir_okay=False, path_type=str), help="""YAML configuration file.""", ) @click.option( "-P", "--profile", default=None, type=click.Path(exists=False, dir_okay=False, path_type=str), help="""Enable profiling to specified file.""", ) @click.pass_context def cli(ctx, config_file: Optional[str], profile: str): if config_file is None and config.config_exists(DEFAULT_PATH): config_file = DEFAULT_PATH if config_file is None: conf = DEFAULT_CONFIG else: # read_raw_config do not fail on ENOENT if not config.config_exists(config_file): raise FileNotFoundError(config_file) conf = config.read_raw_config(config.config_basepath(config_file)) conf = config.merge_configs(DEFAULT_CONFIG, conf) ctx.ensure_object(dict) ctx.obj["config"] = conf if profile: - import cProfile import atexit + import cProfile print("Profiling...") pr = cProfile.Profile() pr.enable() def exit(): pr.disable() pr.dump_stats(profile) atexit.register(exit) @cli.command(name="create") @click.option("--name", default=None) @click.pass_context def create(ctx, name): """Create new provenance database.""" from .postgresql.db_utils import connect # Connect to server without selecting a database conninfo = ctx.obj["config"]["provenance"]["db"] conn = connect(conninfo) if ctx.obj["config"]["provenance"]["cls"] == "ps": from .postgresql.provenance import create_database create_database(conn, conninfo, name) elif ctx.obj["config"]["provenance"]["cls"] == "ps_np": from .postgresql_nopath.provenance import create_database create_database(conn, conninfo, name) else: raise NotImplementedError @cli.command(name="iter-revisions") 
@click.argument("filename") @click.option("-l", "--limit", type=int) @click.pass_context def iter_revisions(ctx, filename, limit): # TODO: add file size filtering """Process a provided list of revisions.""" from . import get_archive, get_provenance - from .revision import FileRevisionIterator from .provenance import revision_add + from .revision import FileRevisionIterator archive = get_archive(**ctx.obj["config"]["archive"]) provenance = get_provenance(**ctx.obj["config"]["provenance"]) revisions = FileRevisionIterator(filename, archive, limit=limit) while True: revision = revisions.next() if revision is None: break revision_add(provenance, archive, revision) @cli.command(name="iter-origins") @click.argument("filename") @click.option("-l", "--limit", type=int) @click.pass_context def iter_origins(ctx, filename, limit): """Process a provided list of origins.""" from . import get_archive, get_provenance from .origin import FileOriginIterator from .provenance import origin_add archive = get_archive(**ctx.obj["config"]["archive"]) provenance = get_provenance(**ctx.obj["config"]["provenance"]) for origin in FileOriginIterator(filename, archive, limit=limit): origin_add(provenance, origin) @cli.command(name="find-first") @click.argument("swhid") @click.pass_context def find_first(ctx, swhid): """Find first occurrence of the requested blob.""" from . import get_provenance provenance = get_provenance(**ctx.obj["config"]["provenance"]) # TODO: return a dictionary with proper keys for each field row = provenance.content_find_first(hash_to_bytes(swhid)) if row is not None: print( "{blob}, {rev}, {date}, {path}".format( blob=hash_to_hex(row[0]), rev=hash_to_hex(row[1]), date=row[2], path=os.fsdecode(row[3]), ) ) else: print(f"Cannot find a content with the id {swhid}") @cli.command(name="find-all") @click.argument("swhid") @click.pass_context def find_all(ctx, swhid): """Find all occurrences of the requested blob.""" from swh.provenance import get_provenance provenance = get_provenance(**ctx.obj["config"]["provenance"]) # TODO: return a dictionary with proper keys for each field for row in provenance.content_find_all(hash_to_bytes(swhid)): print( "{blob}, {rev}, {date}, {path}".format( blob=hash_to_hex(row[0]), rev=hash_to_hex(row[1]), date=row[2], path=os.fsdecode(row[3]), ) ) diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py index d54d79d..660e0f4 100644 --- a/swh/provenance/origin.py +++ b/swh/provenance/origin.py @@ -1,111 +1,111 @@ -from .archive import ArchiveInterface -from .revision import RevisionEntry - from typing import Optional -from swh.model.model import Origin, ObjectType, TargetType +from swh.model.model import ObjectType, Origin, TargetType + +from .archive import ArchiveInterface +from .revision import RevisionEntry class OriginEntry: def __init__(self, url, revisions, id=None): self.id = id self.url = url self.revisions = revisions ################################################################################ ################################################################################ class OriginIterator: """Iterator interface.""" def __iter__(self): pass def __next__(self): pass class FileOriginIterator(OriginIterator): """Iterator over origins present in the given CSV file.""" def __init__( self, filename: str, archive: ArchiveInterface, limit: Optional[int] = None ): self.file = open(filename) self.limit = limit # self.mutex = threading.Lock() self.archive = archive def __iter__(self): yield from iterate_statuses( [Origin(url.strip()) for url in 
self.file], self.archive, self.limit ) class ArchiveOriginIterator: """Iterator over origins present in the given storage.""" def __init__(self, archive: ArchiveInterface, limit: Optional[int] = None): self.limit = limit # self.mutex = threading.Lock() self.archive = archive def __iter__(self): yield from iterate_statuses( self.archive.iter_origins(), self.archive, self.limit ) def iterate_statuses(origins, archive: ArchiveInterface, limit: Optional[int] = None): idx = 0 for origin in origins: for visit in archive.iter_origin_visits(origin.url): for status in archive.iter_origin_visit_statuses(origin.url, visit.visit): # TODO: may filter only those whose status is 'full'?? targets = [] releases = [] snapshot = archive.snapshot_get_all_branches(status.snapshot) if snapshot is not None: for branch in snapshot.branches: if snapshot.branches[branch].target_type == TargetType.REVISION: targets.append(snapshot.branches[branch].target) elif ( snapshot.branches[branch].target_type == TargetType.RELEASE ): releases.append(snapshot.branches[branch].target) # This is done to keep the query in release_get small, hence avoiding # a timeout. limit = 100 for i in range(0, len(releases), limit): for release in archive.release_get(releases[i : i + limit]): if release is not None: if release.target_type == ObjectType.REVISION: targets.append(release.target) # This is done to keep the query in revision_get small, hence avoiding # a timeout. revisions = [] limit = 100 for i in range(0, len(targets), limit): for revision in archive.revision_get(targets[i : i + limit]): if revision is not None: parents = list( map( lambda id: RevisionEntry(archive, id), revision.parents, ) ) revisions.append( RevisionEntry(archive, revision.id, parents=parents) ) yield OriginEntry(status.origin, revisions) idx = idx + 1 if idx == limit: return diff --git a/swh/provenance/postgresql/archive.py b/swh/provenance/postgresql/archive.py index 0ff885a..6fe527c 100644 --- a/swh/provenance/postgresql/archive.py +++ b/swh/provenance/postgresql/archive.py @@ -1,85 +1,79 @@ -import psycopg2 +from typing import Any, Dict, List -# import threading +from methodtools import lru_cache +import psycopg2 from ..archive import ArchiveInterface -# from functools import lru_cache -from methodtools import lru_cache -from typing import Any, Dict, List - class ArchivePostgreSQL(ArchiveInterface): def __init__(self, conn: psycopg2.extensions.connection): self.conn = conn - # self.mutex = threading.Lock() def directory_ls(self, id: bytes) -> List[Dict[str, Any]]: # TODO: only call directory_ls_internal if the id is not being queried by # someone else. Otherwise wait until results get properly cached. 
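        # Hypothetical sketch for the TODO above (not part of this change): a
        # dict of per-id ``threading.Lock`` objects, itself guarded by a global
        # mutex, would let concurrent callers asking for the same ``id`` wait
        # for the first call to populate the lru_cache instead of re-running
        # the same query.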
- # self.mutex.acquire() entries = self.directory_ls_internal(id) - # self.mutex.release() return entries @lru_cache(maxsize=1000000) def directory_ls_internal(self, id: bytes) -> List[Dict[str, Any]]: # TODO: add file size filtering cursor = self.conn.cursor() cursor.execute( """WITH dir AS (SELECT id AS dir_id, dir_entries, file_entries, rev_entries FROM directory WHERE id=%s), ls_d AS (SELECT dir_id, UNNEST(dir_entries) AS entry_id FROM dir), ls_f AS (SELECT dir_id, UNNEST(file_entries) AS entry_id FROM dir), ls_r AS (SELECT dir_id, UNNEST(rev_entries) AS entry_id FROM dir) (SELECT 'dir'::directory_entry_type AS type, e.target, e.name, NULL::sha1_git FROM ls_d LEFT JOIN directory_entry_dir e ON ls_d.entry_id=e.id) UNION (WITH known_contents AS (SELECT 'file'::directory_entry_type AS type, e.target, e.name, c.sha1_git FROM ls_f LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id INNER JOIN content c ON e.target=c.sha1_git) SELECT * FROM known_contents UNION (SELECT 'file'::directory_entry_type AS type, e.target, e.name, c.sha1_git FROM ls_f LEFT JOIN directory_entry_file e ON ls_f.entry_id=e.id LEFT JOIN skipped_content c ON e.target=c.sha1_git WHERE NOT EXISTS ( SELECT 1 FROM known_contents WHERE known_contents.sha1_git=e.target ) ) ) ORDER BY name """, (id,), ) return [ {"type": row[0], "target": row[1], "name": row[2]} for row in cursor.fetchall() ] def iter_origins(self): raise NotImplementedError def iter_origin_visits(self, origin: str): raise NotImplementedError def iter_origin_visit_statuses(self, origin: str, visit: int): raise NotImplementedError def release_get(self, ids: List[bytes]): raise NotImplementedError def revision_get(self, ids: List[bytes]): raise NotImplementedError def snapshot_get_all_branches(self, snapshot: bytes): raise NotImplementedError diff --git a/swh/provenance/postgresql/db_utils.py b/swh/provenance/postgresql/db_utils.py index 8e406f3..cea4887 100644 --- a/swh/provenance/postgresql/db_utils.py +++ b/swh/provenance/postgresql/db_utils.py @@ -1,61 +1,61 @@ +from configparser import ConfigParser import io -import psycopg2 -from configparser import ConfigParser +import psycopg2 def config(filename: str, section: str): # create a parser parser = ConfigParser() # read config file parser.read(filename) # get section, default to postgresql db = {} if parser.has_section(section): params = parser.items(section) for param in params: db[param[0]] = param[1] else: raise Exception(f"Section {section} not found in the {filename} file") return db def typecast_bytea(value, cur): if value is not None: data = psycopg2.BINARY(value, cur) return data.tobytes() def adapt_conn(conn): """Makes psycopg2 use 'bytes' to decode bytea instead of 'memoryview', for this connection.""" t_bytes = psycopg2.extensions.new_type((17,), "bytea", typecast_bytea) psycopg2.extensions.register_type(t_bytes, conn) t_bytes_array = psycopg2.extensions.new_array_type((1001,), "bytea[]", t_bytes) psycopg2.extensions.register_type(t_bytes_array, conn) def connect(params: dict): """ Connect to the PostgreSQL database server """ conn = None try: # connect to the PostgreSQL server conn = psycopg2.connect(**params) adapt_conn(conn) except (Exception, psycopg2.DatabaseError) as error: print(error) return conn def execute_sql(conn: psycopg2.extensions.connection, filename: str): with io.open(filename) as file: cur = conn.cursor() cur.execute(file.read()) cur.close() conn.commit() diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py index b3455d4..1e19773 100644 
--- a/swh/provenance/postgresql/provenance.py +++ b/swh/provenance/postgresql/provenance.py @@ -1,505 +1,505 @@ import itertools import logging import operator import os +from datetime import datetime +from typing import Any, Dict, Generator, List, Optional, Tuple + import psycopg2 import psycopg2.extras from ..model import DirectoryEntry, FileEntry from ..origin import OriginEntry -from .db_utils import connect, execute_sql from ..provenance import ProvenanceInterface from ..revision import RevisionEntry - -from datetime import datetime -from typing import Any, Dict, Generator, List, Optional, Tuple +from .db_utils import connect, execute_sql def normalize(path: bytes) -> bytes: return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) # Normalize dbname to avoid issues when reconnecting below name = name.casefold() # Create new database dropping previous one if exists cursor = conn.cursor() cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") cursor.execute(f"""CREATE DATABASE {name}""") conn.close() # Reconnect to server selecting newly created database to add tables conninfo["dbname"] = name conn = connect(conninfo) sqldir = os.path.dirname(os.path.realpath(__file__)) execute_sql(conn, os.path.join(sqldir, "provenance.sql")) ######################################################################################## ######################################################################################## ######################################################################################## class ProvenancePostgreSQL(ProvenanceInterface): def __init__(self, conn: psycopg2.extensions.connection): # TODO: consider adding a mutex for thread safety conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) self.conn = conn self.cursor = self.conn.cursor() self.insert_cache: Dict[str, Any] = {} self.remove_cache: Dict[str, Any] = {} self.select_cache: Dict[str, Any] = {} self.clear_caches() def clear_caches(self): self.insert_cache = { "content": dict(), "content_early_in_rev": list(), "content_in_dir": list(), "directory": dict(), "directory_in_rev": list(), "revision": dict(), "revision_before_rev": list(), "revision_in_org": list(), } self.remove_cache = {"directory": dict()} self.select_cache = {"content": dict(), "directory": dict(), "revision": dict()} def commit(self): result = False try: self.insert_all() self.clear_caches() result = True except Exception as error: # Unexpected error occurred, rollback all changes and log message logging.error(f"Unexpected error: {error}") return result def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_in_dir"].append( (blob.id, directory.id, normalize(os.path.join(prefix, blob.name))) ) def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_early_in_rev"].append( (blob.id, revision.id, normalize(os.path.join(prefix, blob.name))) ) def content_find_first( self, blobid: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: self.cursor.execute( """SELECT content_location.sha1 AS blob, revision.sha1 AS rev, revision.date AS date, content_location.path AS path FROM (SELECT content_hex.sha1, content_hex.rev, location.path FROM (SELECT content.sha1, content_early_in_rev.rev, content_early_in_rev.loc FROM content_early_in_rev 
JOIN content ON content.id=content_early_in_rev.blob WHERE content.sha1=%s ) AS content_hex JOIN location ON location.id=content_hex.loc ) AS content_location JOIN revision ON revision.id=content_location.rev ORDER BY date, rev, path ASC LIMIT 1""", (blobid,), ) return self.cursor.fetchone() def content_find_all( self, blobid: bytes ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: self.cursor.execute( """(SELECT content_location.sha1 AS blob, revision.sha1 AS rev, revision.date AS date, content_location.path AS path FROM (SELECT content_hex.sha1, content_hex.rev, location.path FROM (SELECT content.sha1, content_early_in_rev.rev, content_early_in_rev.loc FROM content_early_in_rev JOIN content ON content.id=content_early_in_rev.blob WHERE content.sha1=%s ) AS content_hex JOIN location ON location.id=content_hex.loc ) AS content_location JOIN revision ON revision.id=content_location.rev ) UNION (SELECT content_prefix.sha1 AS blob, revision.sha1 AS rev, revision.date AS date, content_prefix.path AS path FROM (SELECT content_in_rev.sha1, content_in_rev.rev, CASE location.path WHEN '' THEN content_in_rev.suffix WHEN '.' THEN content_in_rev.suffix ELSE (location.path || '/' || content_in_rev.suffix)::unix_path END AS path FROM (SELECT content_suffix.sha1, directory_in_rev.rev, directory_in_rev.loc, content_suffix.path AS suffix FROM (SELECT content_hex.sha1, content_hex.dir, location.path FROM (SELECT content.sha1, content_in_dir.dir, content_in_dir.loc FROM content_in_dir JOIN content ON content_in_dir.blob=content.id WHERE content.sha1=%s ) AS content_hex JOIN location ON location.id=content_hex.loc ) AS content_suffix JOIN directory_in_rev ON directory_in_rev.dir=content_suffix.dir ) AS content_in_rev JOIN location ON location.id=content_in_rev.loc ) AS content_prefix JOIN revision ON revision.id=content_prefix.rev ) ORDER BY date, rev, path""", (blobid, blobid), ) # TODO: use POSTGRESQL EXPLAIN looking for query optimizations. yield from self.cursor.fetchall() def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: # First check if the date is being modified by current transection. date = self.insert_cache["content"].get(blob.id, None) if date is None: # If not, check whether it's been query before date = self.select_cache["content"].get(blob.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM content WHERE sha1=%s""", (blob.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["content"][blob.id] = date return date def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: dates = {} pending = [] for blob in blobs: # First check if the date is being modified by current transection. 
date = self.insert_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: # If not, check whether it's been query before date = self.select_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: pending.append(blob.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT sha1, date FROM content WHERE sha1 IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["content"][row[0]] = row[1] return dates def content_set_early_date(self, blob: FileEntry, date: datetime): self.insert_cache["content"][blob.id] = date def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ): self.insert_cache["directory_in_rev"].append( (directory.id, revision.id, normalize(path)) ) def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: # First check if the date is being modified by current transection. date = self.insert_cache["directory"].get(directory.id, None) if date is None and directory.id not in self.remove_cache["directory"]: # If not, check whether it's been query before date = self.select_cache["directory"].get(directory.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM directory WHERE sha1=%s""", (directory.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["directory"][directory.id] = date return date def directory_get_dates_in_isochrone_frontier( self, dirs: List[DirectoryEntry] ) -> Dict[bytes, datetime]: dates = {} pending = [] for directory in dirs: # First check if the date is being modified by current transection. 
date = self.insert_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date elif directory.id not in self.remove_cache["directory"]: # If not, check whether it's been query before date = self.select_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date else: pending.append(directory.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT sha1, date FROM directory WHERE sha1 IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["directory"][row[0]] = row[1] return dates def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): self.remove_cache["directory"][directory.id] = None self.insert_cache["directory"].pop(directory.id, None) def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ): self.insert_cache["directory"][directory.id] = date self.remove_cache["directory"].pop(directory.id, None) def insert_all(self): # Performe insertions with cached information if self.insert_cache["content"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY content; INSERT INTO content(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,content.date)""", self.insert_cache["content"].items(), ) self.insert_cache["content"].clear() if self.insert_cache["directory"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY directory; INSERT INTO directory(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,directory.date)""", self.insert_cache["directory"].items(), ) self.insert_cache["directory"].clear() if self.insert_cache["revision"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY revision; INSERT INTO revision(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,revision.date)""", self.insert_cache["revision"].items(), ) self.insert_cache["revision"].clear() # Relations should come after ids for elements were resolved if self.insert_cache["content_early_in_rev"]: self.insert_location("content", "revision", "content_early_in_rev") if self.insert_cache["content_in_dir"]: self.insert_location("content", "directory", "content_in_dir") if self.insert_cache["directory_in_rev"]: self.insert_location("directory", "revision", "directory_in_rev") # if self.insert_cache["revision_before_rev"]: # psycopg2.extras.execute_values( # self.cursor, # """INSERT INTO revision_before_rev VALUES %s # ON CONFLICT DO NOTHING""", # self.insert_cache["revision_before_rev"], # ) # self.insert_cache["revision_before_rev"].clear() # if self.insert_cache["revision_in_org"]: # psycopg2.extras.execute_values( # self.cursor, # """INSERT INTO revision_in_org VALUES %s # ON CONFLICT DO NOTHING""", # self.insert_cache["revision_in_org"], # ) # self.insert_cache["revision_in_org"].clear() def insert_location(self, src0_table, src1_table, dst_table): # Resolve src0 ids src0_values = dict().fromkeys( map(operator.itemgetter(0), self.insert_cache[dst_table]) ) values = ", ".join(itertools.repeat("%s", len(src0_values))) self.cursor.execute( f"""SELECT sha1, id FROM {src0_table} WHERE sha1 IN ({values})""", tuple(src0_values), ) src0_values = dict(self.cursor.fetchall()) # Resolve src1 ids src1_values = dict().fromkeys( map(operator.itemgetter(1), self.insert_cache[dst_table]) ) values = ", ".join(itertools.repeat("%s", len(src1_values))) 
self.cursor.execute( f"""SELECT sha1, id FROM {src1_table} WHERE sha1 IN ({values})""", tuple(src1_values), ) src1_values = dict(self.cursor.fetchall()) # Resolve location ids location = dict().fromkeys( map(operator.itemgetter(2), self.insert_cache[dst_table]) ) location = dict( psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY location; INSERT INTO location(path) VALUES %s ON CONFLICT (path) DO UPDATE SET path=EXCLUDED.path RETURNING path, id""", map(lambda path: (path,), location.keys()), fetch=True, ) ) # Insert values in dst_table rows = map( lambda row: (src0_values[row[0]], src1_values[row[1]], location[row[2]]), self.insert_cache[dst_table], ) psycopg2.extras.execute_values( self.cursor, f"""INSERT INTO {dst_table} VALUES %s ON CONFLICT DO NOTHING""", rows, ) self.insert_cache[dst_table].clear() def origin_get_id(self, origin: OriginEntry) -> int: if origin.id is None: # Insert origin in the DB and return the assigned id self.cursor.execute( """INSERT INTO origin (url) VALUES (%s) ON CONFLICT DO NOTHING RETURNING id""", (origin.url,), ) return self.cursor.fetchone()[0] else: return origin.id def revision_add(self, revision: RevisionEntry): # Add current revision to the compact DB self.insert_cache["revision"][revision.id] = revision.date def revision_add_before_revision( self, relative: RevisionEntry, revision: RevisionEntry ): self.insert_cache["revision_before_rev"].append((revision.id, relative.id)) def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): self.insert_cache["revision_in_org"].append((revision.id, origin.id)) def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: date = self.insert_cache["revision"].get(revision.id, None) if date is None: # If not, check whether it's been query before date = self.select_cache["revision"].get(revision.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM revision WHERE sha1=%s""", (revision.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["revision"][revision.id] = date return date def revision_get_preferred_origin(self, revision: RevisionEntry) -> int: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT COALESCE(org,0) FROM revision WHERE sha1=%s""", (revision.id,) ) row = self.cursor.fetchone() # None means revision is not in database; # 0 means revision has no preferred origin return row[0] if row is not None and row[0] != 0 else None def revision_in_history(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_before_rev JOIN revision ON revision.id=revision_before_rev.prev WHERE revision.sha1=%s""", (revision.id,), ) return self.cursor.fetchone() is not None def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ): # TODO: adapt this method to consider cached values self.cursor.execute( """UPDATE revision SET org=%s WHERE sha1=%s""", (origin.id, revision.id) ) def revision_visited(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_in_org JOIN revision ON revision.id=revision_in_org.rev WHERE revision.sha1=%s""", (revision.id,), ) return self.cursor.fetchone() is not None diff --git a/swh/provenance/postgresql/provenance.sql b/swh/provenance/postgresql/provenance.sql index dda567a..7d37528 100644 --- 
a/swh/provenance/postgresql/provenance.sql +++ b/swh/provenance/postgresql/provenance.sql @@ -1,150 +1,150 @@ -- a Git object ID, i.e., a Git-style salted SHA1 checksum drop domain if exists sha1_git cascade; create domain sha1_git as bytea check (length(value) = 20); -- UNIX path (absolute, relative, individual path component, etc.) drop domain if exists unix_path cascade; create domain unix_path as bytea; drop table if exists content; create table content ( id bigserial primary key, -- internal identifier of the content blob sha1 sha1_git unique not null, -- intrinsic identifier of the content blob date timestamptz not null -- timestamp of the revision where the blob appears early ); comment on column content.id is 'Content internal identifier'; comment on column content.sha1 is 'Content intrinsic identifier'; comment on column content.date is 'Earliest timestamp for the content (first seen time)'; drop table if exists content_early_in_rev; create table content_early_in_rev ( blob bigint not null, -- internal identifier of the content blob rev bigint not null, -- internal identifier of the revision where the blob appears for the first time loc bigint not null, -- location of the content relative to the revision root directory primary key (blob, rev, loc) -- foreign key (blob) references content (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) ); comment on column content_early_in_rev.blob is 'Content internal identifier'; comment on column content_early_in_rev.rev is 'Revision internal identifier'; comment on column content_early_in_rev.loc is 'Location of content in revision'; drop table if exists content_in_dir; create table content_in_dir ( blob bigint not null, -- internal identifier of the content blob - dir bigint not null, -- internal identifier of the directory contaning the blob + dir bigint not null, -- internal identifier of the directory containing the blob loc bigint not null, -- location of the content relative to its parent directory in the isochrone frontier primary key (blob, dir, loc) -- foreign key (blob) references content (id), -- foreign key (dir) references directory (id), -- foreign key (loc) references location (id) ); comment on column content_in_dir.blob is 'Content internal identifier'; comment on column content_in_dir.dir is 'Directory internal identifier'; comment on column content_in_dir.loc is 'Location of content in directory'; drop table if exists directory; create table directory ( id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier sha1 sha1_git unique not null, -- intrinsic identifier of the directory date timestamptz not null -- max timestamp among those of the directory children's ); comment on column directory.id is 'Directory internal identifier'; comment on column directory.sha1 is 'Directory intrinsic identifier'; comment on column directory.date is 'Latest timestamp for the content in the directory'; drop table if exists directory_in_rev; create table directory_in_rev ( dir bigint not null, -- internal identifier of the directory appearing in the revision rev bigint not null, -- internal identifier of the revision containing the directory loc bigint not null, -- location of the directory relative to the revision root directory primary key (dir, rev, loc) -- foreign key (dir) references directory (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) ); comment on column directory_in_rev.dir is 
'Directory internal identifier'; comment on column directory_in_rev.rev is 'Revision internal identifier'; comment on column directory_in_rev.loc is 'Location of directory in revision'; drop table if exists location; create table location ( id bigserial primary key, -- internal identifier of the location path unix_path unique not null -- path to the location ); comment on column location.id is 'Location internal identifier'; comment on column location.path is 'Path to the location'; drop table if exists origin; create table origin ( id bigserial primary key, -- internal identifier of the origin url unix_path unique not null -- url of the origin ); comment on column origin.id is 'Origin internal identifier'; comment on column origin.url is 'URL of the origin'; drop table if exists revision; create table revision ( id bigserial primary key, -- internal identifier of the revision sha1 sha1_git unique not null, -- intrinsic identifier of the revision date timestamptz not null, -- timestamp of the revision org bigint -- id of the preferred origin -- foreign key (org) references origin (id) ); comment on column revision.id is 'Revision internal identifier'; comment on column revision.sha1 is 'Revision intrinsic identifier'; comment on column revision.date is 'Revision timestamp'; comment on column revision.org is 'preferred origin for the revision'; drop table if exists revision_before_rev; create table revision_before_rev ( prev bigserial not null, -- internal identifier of the source revision next bigserial not null, -- internal identifier of the destination revision primary key (prev, next) -- foreign key (prev) references revision (id), -- foreign key (next) references revision (id) ); comment on column revision_before_rev.prev is 'Source revision internal identifier'; comment on column revision_before_rev.next is 'Destination revision internal identifier'; drop table if exists revision_in_org; create table revision_in_org ( rev bigint not null, -- internal identifier of the revision poined by the origin org bigint not null, -- internal identifier of the origin that points to the revision primary key (rev, org) -- foreign key (rev) references revision (id), -- foreign key (org) references origin (id) ); comment on column revision_in_org.rev is 'Revision internal identifier'; comment on column revision_in_org.org is 'Origin internal identifier'; diff --git a/swh/provenance/postgresql_nopath/provenance.py b/swh/provenance/postgresql_nopath/provenance.py index 391770f..8170da0 100644 --- a/swh/provenance/postgresql_nopath/provenance.py +++ b/swh/provenance/postgresql_nopath/provenance.py @@ -1,442 +1,442 @@ import itertools import logging import operator import os +from datetime import datetime +from typing import Any, Dict, Generator, List, Optional, Tuple + import psycopg2 import psycopg2.extras from ..model import DirectoryEntry, FileEntry from ..origin import OriginEntry from ..postgresql.db_utils import connect, execute_sql from ..provenance import ProvenanceInterface from ..revision import RevisionEntry -from datetime import datetime -from typing import Any, Dict, Generator, List, Optional, Tuple - def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) # Normalize dbname to avoid issues when reconnecting below name = name.casefold() # Create new database dropping previous one if exists cursor = conn.cursor() cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") cursor.execute(f"""CREATE 
DATABASE {name}""") conn.close() # Reconnect to server selecting newly created database to add tables conninfo["dbname"] = name conn = connect(conninfo) sqldir = os.path.dirname(os.path.realpath(__file__)) execute_sql(conn, os.path.join(sqldir, "provenance.sql")) ######################################################################################## ######################################################################################## ######################################################################################## class ProvenancePostgreSQLNoPath(ProvenanceInterface): def __init__(self, conn: psycopg2.extensions.connection): # TODO: consider adding a mutex for thread safety conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) self.conn = conn self.cursor = self.conn.cursor() self.insert_cache: Dict[str, Any] = {} self.remove_cache: Dict[str, Any] = {} self.select_cache: Dict[str, Any] = {} self.clear_caches() def clear_caches(self): self.insert_cache = { "content": dict(), "content_early_in_rev": set(), "content_in_dir": set(), "directory": dict(), "directory_in_rev": set(), "revision": dict(), "revision_before_rev": list(), "revision_in_org": list(), } self.remove_cache = {"directory": dict()} self.select_cache = {"content": dict(), "directory": dict(), "revision": dict()} def commit(self): result = False try: self.insert_all() self.clear_caches() result = True except Exception as error: # Unexpected error occurred, rollback all changes and log message logging.error(f"Unexpected error: {error}") return result def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_in_dir"].add((blob.id, directory.id)) def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_early_in_rev"].add((blob.id, revision.id)) def content_find_first( self, blobid: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: self.cursor.execute( """SELECT revision.sha1 AS rev, revision.date AS date FROM (SELECT content_early_in_rev.rev FROM content_early_in_rev JOIN content ON content.id=content_early_in_rev.blob WHERE content.sha1=%s ) AS content_in_rev JOIN revision ON revision.id=content_in_rev.rev ORDER BY date, rev ASC LIMIT 1""", (blobid,), ) row = self.cursor.fetchone() if row is not None: # TODO: query revision from the archive and look for blobid into a # recursive directory_ls of the revision's root. return blobid, row[0], row[1], b"" return None def content_find_all( self, blobid: bytes ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: self.cursor.execute( """(SELECT revision.sha1 AS rev, revision.date AS date FROM (SELECT content_early_in_rev.rev FROM content_early_in_rev JOIN content ON content.id=content_early_in_rev.blob WHERE content.sha1=%s ) AS content_in_rev JOIN revision ON revision.id=content_in_rev.rev ) UNION (SELECT revision.sha1 AS rev, revision.date AS date FROM (SELECT directory_in_rev.rev FROM (SELECT content_in_dir.dir FROM content_in_dir JOIN content ON content_in_dir.blob=content.id WHERE content.sha1=%s ) AS content_dir JOIN directory_in_rev ON directory_in_rev.dir=content_dir.dir ) AS content_in_rev JOIN revision ON revision.id=content_in_rev.rev ) ORDER BY date, rev""", (blobid, blobid), ) # TODO: use POSTGRESQL EXPLAIN looking for query optimizations. for row in self.cursor.fetchall(): # TODO: query revision from the archive and look for blobid into a # recursive directory_ls of the revision's root. 
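            # One possible approach for the TODO above (a sketch, not
            # implemented here): fetch the revision's root directory from the
            # archive, walk it with a recursive directory_ls, and yield the
            # path of every entry whose target equals ``blobid`` instead of
            # the empty-path placeholder below.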
yield blobid, row[0], row[1], b"" def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: # First check if the date is being modified by current transection. date = self.insert_cache["content"].get(blob.id, None) if date is None: # If not, check whether it's been query before date = self.select_cache["content"].get(blob.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM content WHERE sha1=%s""", (blob.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["content"][blob.id] = date return date def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: dates = {} pending = [] for blob in blobs: # First check if the date is being modified by current transection. date = self.insert_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: # If not, check whether it's been query before date = self.select_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: pending.append(blob.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT sha1, date FROM content WHERE sha1 IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["content"][row[0]] = row[1] return dates def content_set_early_date(self, blob: FileEntry, date: datetime): self.insert_cache["content"][blob.id] = date def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ): self.insert_cache["directory_in_rev"].add((directory.id, revision.id)) def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: # First check if the date is being modified by current transection. date = self.insert_cache["directory"].get(directory.id, None) if date is None and directory.id not in self.remove_cache["directory"]: # If not, check whether it's been query before date = self.select_cache["directory"].get(directory.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM directory WHERE sha1=%s""", (directory.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["directory"][directory.id] = date return date def directory_get_dates_in_isochrone_frontier( self, dirs: List[DirectoryEntry] ) -> Dict[bytes, datetime]: dates = {} pending = [] for directory in dirs: # First check if the date is being modified by current transection. 
date = self.insert_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date elif directory.id not in self.remove_cache["directory"]: # If not, check whether it's been query before date = self.select_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date else: pending.append(directory.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT sha1, date FROM directory WHERE sha1 IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["directory"][row[0]] = row[1] return dates def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): self.remove_cache["directory"][directory.id] = None self.insert_cache["directory"].pop(directory.id, None) def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ): self.insert_cache["directory"][directory.id] = date self.remove_cache["directory"].pop(directory.id, None) def insert_all(self): # Performe insertions with cached information if self.insert_cache["content"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY content; INSERT INTO content(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,content.date)""", self.insert_cache["content"].items(), ) self.insert_cache["content"].clear() if self.insert_cache["directory"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY directory; INSERT INTO directory(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,directory.date)""", self.insert_cache["directory"].items(), ) self.insert_cache["directory"].clear() if self.insert_cache["revision"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY revision; INSERT INTO revision(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,revision.date)""", self.insert_cache["revision"].items(), ) self.insert_cache["revision"].clear() # Relations should come after ids for elements were resolved if self.insert_cache["content_early_in_rev"]: self.insert_location("content", "revision", "content_early_in_rev") if self.insert_cache["content_in_dir"]: self.insert_location("content", "directory", "content_in_dir") if self.insert_cache["directory_in_rev"]: self.insert_location("directory", "revision", "directory_in_rev") # if self.insert_cache["revision_before_rev"]: # psycopg2.extras.execute_values( # self.cursor, # """INSERT INTO revision_before_rev VALUES %s # ON CONFLICT DO NOTHING""", # self.insert_cache["revision_before_rev"], # ) # self.insert_cache["revision_before_rev"].clear() # if self.insert_cache["revision_in_org"]: # psycopg2.extras.execute_values( # self.cursor, # """INSERT INTO revision_in_org VALUES %s # ON CONFLICT DO NOTHING""", # self.insert_cache["revision_in_org"], # ) # self.insert_cache["revision_in_org"].clear() def insert_location(self, src0_table, src1_table, dst_table): # Resolve src0 ids src0_values = dict().fromkeys( map(operator.itemgetter(0), self.insert_cache[dst_table]) ) values = ", ".join(itertools.repeat("%s", len(src0_values))) self.cursor.execute( f"""SELECT sha1, id FROM {src0_table} WHERE sha1 IN ({values})""", tuple(src0_values), ) src0_values = dict(self.cursor.fetchall()) # Resolve src1 ids src1_values = dict().fromkeys( map(operator.itemgetter(1), self.insert_cache[dst_table]) ) values = ", ".join(itertools.repeat("%s", len(src1_values))) 
self.cursor.execute( f"""SELECT sha1, id FROM {src1_table} WHERE sha1 IN ({values})""", tuple(src1_values), ) src1_values = dict(self.cursor.fetchall()) # Insert values in dst_table rows = map( lambda row: (src0_values[row[0]], src1_values[row[1]]), self.insert_cache[dst_table], ) psycopg2.extras.execute_values( self.cursor, f"""INSERT INTO {dst_table} VALUES %s ON CONFLICT DO NOTHING""", rows, ) self.insert_cache[dst_table].clear() def origin_get_id(self, origin: OriginEntry) -> int: if origin.id is None: # Insert origin in the DB and return the assigned id self.cursor.execute( """INSERT INTO origin (url) VALUES (%s) ON CONFLICT DO NOTHING RETURNING id""", (origin.url,), ) return self.cursor.fetchone()[0] else: return origin.id def revision_add(self, revision: RevisionEntry): # Add current revision to the compact DB self.insert_cache["revision"][revision.id] = revision.date def revision_add_before_revision( self, relative: RevisionEntry, revision: RevisionEntry ): self.insert_cache["revision_before_rev"].append((revision.id, relative.id)) def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): self.insert_cache["revision_in_org"].append((revision.id, origin.id)) def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: date = self.insert_cache["revision"].get(revision.id, None) if date is None: # If not, check whether it's been query before date = self.select_cache["revision"].get(revision.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM revision WHERE sha1=%s""", (revision.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["revision"][revision.id] = date return date def revision_get_preferred_origin(self, revision: RevisionEntry) -> int: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT COALESCE(org,0) FROM revision WHERE sha1=%s""", (revision.id,) ) row = self.cursor.fetchone() # None means revision is not in database; # 0 means revision has no preferred origin return row[0] if row is not None and row[0] != 0 else None def revision_in_history(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_before_rev JOIN revision ON revision.id=revision_before_rev.prev WHERE revision.sha1=%s""", (revision.id,), ) return self.cursor.fetchone() is not None def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ): # TODO: adapt this method to consider cached values self.cursor.execute( """UPDATE revision SET org=%s WHERE sha1=%s""", (origin.id, revision.id) ) def revision_visited(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_in_org JOIN revision ON revision.id=revision_in_org.rev WHERE revision.sha1=%s""", (revision.id,), ) return self.cursor.fetchone() is not None diff --git a/swh/provenance/postgresql_nopath/provenance.sql b/swh/provenance/postgresql_nopath/provenance.sql index 15d3644..568d4be 100644 --- a/swh/provenance/postgresql_nopath/provenance.sql +++ b/swh/provenance/postgresql_nopath/provenance.sql @@ -1,130 +1,130 @@ -- a Git object ID, i.e., a Git-style salted SHA1 checksum drop domain if exists sha1_git cascade; create domain sha1_git as bytea check (length(value) = 20); -- UNIX path (absolute, relative, individual path component, etc.) 
drop domain if exists unix_path cascade; create domain unix_path as bytea; drop table if exists content; create table content ( id bigserial primary key, -- internal identifier of the content blob sha1 sha1_git unique not null, -- intrinsic identifier of the content blob date timestamptz not null -- timestamp of the revision where the blob appears early ); comment on column content.id is 'Content internal identifier'; comment on column content.sha1 is 'Content intrinsic identifier'; comment on column content.date is 'Earliest timestamp for the content (first seen time)'; drop table if exists content_early_in_rev; create table content_early_in_rev ( blob bigint not null, -- internal identifier of the content blob rev bigint not null, -- internal identifier of the revision where the blob appears for the first time primary key (blob, rev) -- foreign key (blob) references content (id), -- foreign key (rev) references revision (id) ); comment on column content_early_in_rev.blob is 'Content internal identifier'; comment on column content_early_in_rev.rev is 'Revision internal identifier'; drop table if exists content_in_dir; create table content_in_dir ( blob bigint not null, -- internal identifier of the content blob - dir bigint not null, -- internal identifier of the directory contaning the blob + dir bigint not null, -- internal identifier of the directory containing the blob primary key (blob, dir) -- foreign key (blob) references content (id), -- foreign key (dir) references directory (id) ); comment on column content_in_dir.blob is 'Content internal identifier'; comment on column content_in_dir.dir is 'Directory internal identifier'; drop table if exists directory; create table directory ( id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier sha1 sha1_git unique not null, -- intrinsic identifier of the directory date timestamptz not null -- max timestamp among those of the directory children's ); comment on column directory.id is 'Directory internal identifier'; comment on column directory.sha1 is 'Directory intrinsic identifier'; comment on column directory.date is 'Latest timestamp for the content in the directory'; drop table if exists directory_in_rev; create table directory_in_rev ( dir bigint not null, -- internal identifier of the directory appearing in the revision rev bigint not null, -- internal identifier of the revision containing the directory primary key (dir, rev) -- foreign key (dir) references directory (id), -- foreign key (rev) references revision (id) ); comment on column directory_in_rev.dir is 'Directory internal identifier'; comment on column directory_in_rev.rev is 'Revision internal identifier'; drop table if exists origin; create table origin ( id bigserial primary key, -- internal identifier of the origin url unix_path unique not null -- url of the origin ); comment on column origin.id is 'Origin internal identifier'; comment on column origin.url is 'URL of the origin'; drop table if exists revision; create table revision ( id bigserial primary key, -- internal identifier of the revision sha1 sha1_git unique not null, -- intrinsic identifier of the revision date timestamptz not null, -- timestamp of the revision org bigint -- id of the preferred origin -- foreign key (org) references origin (id) ); comment on column revision.id is 'Revision internal identifier'; comment on column revision.sha1 is 'Revision intrinsic identifier'; comment on column revision.date is 'Revision timestamp'; comment on column 
revision.org is 'preferred origin for the revision'; drop table if exists revision_before_rev; create table revision_before_rev ( prev bigserial not null, -- internal identifier of the source revision next bigserial not null, -- internal identifier of the destination revision primary key (prev, next) -- foreign key (prev) references revision (id), -- foreign key (next) references revision (id) ); comment on column revision_before_rev.prev is 'Source revision internal identifier'; comment on column revision_before_rev.next is 'Destination revision internal identifier'; drop table if exists revision_in_org; create table revision_in_org ( rev bigint not null, -- internal identifier of the revision poined by the origin org bigint not null, -- internal identifier of the origin that points to the revision primary key (rev, org) -- foreign key (rev) references revision (id), -- foreign key (org) references origin (id) ); comment on column revision_in_org.rev is 'Revision internal identifier'; comment on column revision_in_org.org is 'Origin internal identifier'; diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py index 03db259..6af9730 100644 --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -1,365 +1,362 @@ import os +from datetime import datetime +from typing import Dict, Generator, List, Optional, Tuple from .archive import ArchiveInterface from .model import DirectoryEntry, FileEntry, TreeEntry from .origin import OriginEntry from .revision import RevisionEntry -from datetime import datetime -from typing import Dict, Generator, List, Optional, Tuple - # TODO: consider moving to path utils file together with normalize. def is_child(path: bytes, prefix: bytes) -> bool: return path != prefix and os.path.dirname(path) == prefix class ProvenanceInterface: def __init__(self, **kwargs): raise NotImplementedError def commit(self): raise NotImplementedError def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ): raise NotImplementedError def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ): raise NotImplementedError def content_find_first( self, blobid: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: raise NotImplementedError def content_find_all( self, blobid: bytes ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: raise NotImplementedError def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: raise NotImplementedError def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: raise NotImplementedError def content_set_early_date(self, blob: FileEntry, date: datetime): raise NotImplementedError def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ): raise NotImplementedError def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: raise NotImplementedError def directory_get_dates_in_isochrone_frontier( self, dirs: List[DirectoryEntry] ) -> Dict[bytes, datetime]: raise NotImplementedError def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): raise NotImplementedError def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ): raise NotImplementedError def origin_get_id(self, origin: OriginEntry) -> int: raise NotImplementedError def revision_add(self, revision: RevisionEntry): raise NotImplementedError def revision_add_before_revision( self, relative: 
RevisionEntry, revision: RevisionEntry ): raise NotImplementedError def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): raise NotImplementedError def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: raise NotImplementedError def revision_get_preferred_origin(self, revision: RevisionEntry) -> int: raise NotImplementedError def revision_in_history(self, revision: RevisionEntry) -> bool: raise NotImplementedError def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ): raise NotImplementedError def revision_visited(self, revision: RevisionEntry) -> bool: raise NotImplementedError def directory_process_content( provenance: ProvenanceInterface, directory: DirectoryEntry, relative: DirectoryEntry ): stack = [(directory, b"")] while stack: current, prefix = stack.pop() for child in iter(current): if isinstance(child, FileEntry): # Add content to the relative directory with the computed prefix. provenance.content_add_to_directory(relative, child, prefix) else: # Recursively walk the child directory. stack.append((child, os.path.join(prefix, child.name))) def origin_add(provenance: ProvenanceInterface, origin: OriginEntry): # TODO: refactor to iterate over origin visit statuses and commit only once # per status. origin.id = provenance.origin_get_id(origin) for revision in origin.revisions: origin_add_revision(provenance, origin, revision) # Commit after each revision provenance.commit() # TODO: verify this! def origin_add_revision( provenance: ProvenanceInterface, origin: OriginEntry, revision: RevisionEntry ): stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)] while stack: relative, current = stack.pop() # Check if the current revision has no preferred origin and update if necessary. preferred = provenance.revision_get_preferred_origin(current) if preferred is None: provenance.revision_set_preferred_origin(origin, current) ######################################################################## if relative is None: # This revision is pointed to directly by the origin. visited = provenance.revision_visited(current) provenance.revision_add_to_origin(origin, current) if not visited: stack.append((current, current)) else: # This revision is a parent of another one in the history of the # relative revision. for parent in iter(current): visited = provenance.revision_visited(parent) if not visited: # The parent revision has never been seen before pointing # directly to an origin. known = provenance.revision_in_history(parent) if known: # The parent revision is already known in some other # revision's history. We should point it directly to # the origin and (eventually) walk its history. stack.append((None, parent)) else: # The parent revision was never seen before. We should # walk its history and associate it with the same # relative revision. provenance.revision_add_before_revision(relative, parent) stack.append((relative, parent)) else: # The parent revision already points to an origin, so its # history was properly processed before. We just need to # make sure it points to the current origin as well. provenance.revision_add_to_origin(origin, parent)
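ProvenanceInterface above is only an abstract surface. For experimentation or unit tests, a minimal in-memory stand-in can be enough to drive the revision-processing code below. The sketch is illustrative only (it is not one of the project's PostgreSQL backends) and implements just the methods that the revision path exercises, keyed on the entries' id attribute.

from datetime import datetime
from typing import Dict

class InMemoryProvenance(ProvenanceInterface):
    """Illustrative in-memory stand-in for ProvenanceInterface, e.g. for tests."""

    def __init__(self):  # deliberately does not call super().__init__ (it raises)
        self.content_dates: Dict[bytes, datetime] = {}
        self.directory_dates: Dict[bytes, datetime] = {}
        self.revision_dates: Dict[bytes, datetime] = {}
        self.content_in_revision = []    # (blob id, revision id, path)
        self.content_in_directory = []   # (blob id, directory id, path)
        self.directory_in_revision = []  # (directory id, revision id, path)

    def commit(self):
        return True  # nothing to flush in memory

    def content_add_to_directory(self, directory, blob, prefix):
        self.content_in_directory.append((blob.id, directory.id, prefix))

    def content_add_to_revision(self, revision, blob, prefix):
        self.content_in_revision.append((blob.id, revision.id, prefix))

    def content_get_early_date(self, blob):
        return self.content_dates.get(blob.id)

    def content_get_early_dates(self, blobs):
        return {b.id: self.content_dates[b.id] for b in blobs if b.id in self.content_dates}

    def content_set_early_date(self, blob, date):
        self.content_dates[blob.id] = date

    def directory_add_to_revision(self, revision, directory, path):
        self.directory_in_revision.append((directory.id, revision.id, path))

    def directory_get_date_in_isochrone_frontier(self, directory):
        return self.directory_dates.get(directory.id)

    def directory_get_dates_in_isochrone_frontier(self, dirs):
        return {d.id: self.directory_dates[d.id] for d in dirs if d.id in self.directory_dates}

    def directory_invalidate_in_isochrone_frontier(self, directory):
        self.directory_dates.pop(directory.id, None)

    def directory_set_date_in_isochrone_frontier(self, directory, date):
        self.directory_dates[directory.id] = date

    def revision_add(self, revision):
        self.revision_dates[revision.id] = revision.date

    def revision_get_early_date(self, revision):
        return self.revision_dates.get(revision.id)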
def revision_add( provenance: ProvenanceInterface, archive: ArchiveInterface, revision: RevisionEntry ): assert revision.date is not None assert revision.root is not None # Process content starting from the revision's root directory. date = provenance.revision_get_early_date(revision) if date is None or revision.date < date: provenance.revision_add(revision) # TODO: add file size filtering revision_process_content( provenance, revision, DirectoryEntry(archive, revision.root, b"") ) # TODO: improve this! Maybe using a max attempt counter? # Ideally Provenance class should guarantee that a commit never fails. while not provenance.commit(): continue class IsochroneNode: def __init__(self, entry: TreeEntry, dates: Dict[bytes, datetime] = {}): self.entry = entry self.date = dates.get(self.entry.id, None) self.children: List[IsochroneNode] = [] self.maxdate: Optional[datetime] = None def add_child( self, child: TreeEntry, dates: Dict[bytes, datetime] = {} ) -> "IsochroneNode": assert isinstance(self.entry, DirectoryEntry) and self.date is None node = IsochroneNode(child, dates=dates) self.children.append(node) return node def build_isochrone_graph( provenance: ProvenanceInterface, revision: RevisionEntry, directory: DirectoryEntry ): assert revision.date is not None # Build the nodes structure root = IsochroneNode(directory) root.date = provenance.directory_get_date_in_isochrone_frontier(directory) stack = [root] while stack: current = stack.pop() assert isinstance(current.entry, DirectoryEntry) if current.date is None or current.date >= revision.date: # If the current directory has an associated date in the isochrone frontier that # is greater than or equal to the current revision's, it should be ignored as # the revision is being processed out of order. if current.date is not None and current.date >= revision.date: provenance.directory_invalidate_in_isochrone_frontier(current.entry) current.date = None # Pre-query all known dates for content/directories in the current directory # for the provenance object to have them cached and (potentially) improve # performance. ddates = provenance.directory_get_dates_in_isochrone_frontier( [child for child in current.entry if isinstance(child, DirectoryEntry)] ) fdates = provenance.content_get_early_dates( [child for child in current.entry if isinstance(child, FileEntry)] ) for child in current.entry: # Recursively analyse directory nodes. if isinstance(child, DirectoryEntry): node = current.add_child(child, dates=ddates) stack.append(node) else: current.add_child(child, dates=fdates) # Precalculate max known date for each node in the graph. stack = [root] while stack: current = stack.pop() if current.date is None: if any(map(lambda child: child.maxdate is None, current.children)): # Current node needs to be analysed again after its children. stack.append(current) for child in current.children: if isinstance(child.entry, FileEntry): if child.date is not None: # File node that has been seen before, just use its known # date. child.maxdate = child.date else: # File node that has never been seen before, use current # revision date. child.maxdate = revision.date else: # Recursively analyse directory nodes. stack.append(child) else: maxdates = [] for child in current.children: assert child.maxdate is not None maxdates.append(child.maxdate) current.maxdate = max(maxdates) if maxdates else revision.date else: # Directory node in the frontier, just use its known date. current.maxdate = current.date return root
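The maxdate bookkeeping in build_isochrone_graph is easier to see on a toy example. The following sketch is illustrative only and does not use the repository's IsochroneNode or TreeEntry classes: it replays the same bottom-up rule on plain dicts (known blobs keep their date, unseen blobs default to the revision date, directories already in the frontier keep their date, other directories take the max over their children).

from datetime import datetime

revision_date = datetime(2021, 1, 10)

# Toy tree: one blob seen in 2020, one blob never seen, and a subdirectory
# whose only blob was seen in 2019.
tree = {
    "type": "dir", "date": None, "maxdate": None,
    "children": [
        {"type": "file", "date": datetime(2020, 5, 1)},  # blob seen before
        {"type": "file", "date": None},                  # blob never seen before
        {"type": "dir", "date": None, "maxdate": None,
         "children": [{"type": "file", "date": datetime(2019, 3, 2)}]},
    ],
}

def compute_maxdate(node):
    if node["type"] == "file":
        # Known blobs keep their date; unseen blobs get the revision date.
        return node["date"] or revision_date
    if node["date"] is not None:
        # Directory already in the isochrone frontier: reuse its known date.
        node["maxdate"] = node["date"]
    else:
        node["maxdate"] = max(compute_maxdate(child) for child in node["children"])
    return node["maxdate"]

compute_maxdate(tree)
print(tree["maxdate"])  # 2021-01-10: the unseen blob pulls maxdate up to the revision date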
def revision_process_content( provenance: ProvenanceInterface, revision: RevisionEntry, root: DirectoryEntry ): assert revision.date is not None stack = [(build_isochrone_graph(provenance, revision, root), root.name)] while stack: current, path = stack.pop() if current.date is not None: assert current.date < revision.date # Current directory is an outer isochrone frontier for a previously # processed revision. It should be reused as is. provenance.directory_add_to_revision(revision, current.entry, path) else: # Current directory is not an outer isochrone frontier for any previous # revision. It might be eligible for this one. if is_new_frontier(current, revision): assert current.maxdate is not None # Outer frontier should be moved to current position in the isochrone # graph. This is the first time this directory is found in the isochrone # frontier. provenance.directory_set_date_in_isochrone_frontier( current.entry, current.maxdate ) provenance.directory_add_to_revision(revision, current.entry, path) directory_process_content( - provenance, - directory=current.entry, - relative=current.entry, + provenance, directory=current.entry, relative=current.entry, ) else: # No point moving the frontier here. Either there are no files or they # are being seen for the first time here. Add all blobs to the current # revision, updating dates if necessary, and recursively analyse # subdirectories as candidates for the outer frontier. for child in current.children: if isinstance(child.entry, FileEntry): blob = child.entry if child.date is None or revision.date < child.date: provenance.content_set_early_date(blob, revision.date) provenance.content_add_to_revision(revision, blob, path) else: stack.append((child, os.path.join(path, child.entry.name))) def is_new_frontier(node: IsochroneNode, revision: RevisionEntry) -> bool: assert node.maxdate is not None and revision.date is not None # Using the following condition we should get an algorithm equivalent to the old # version where frontiers are pushed up in the tree whenever possible. return node.maxdate < revision.date # return node.maxdate < revision.date and has_blobs(node) def has_blobs(node: IsochroneNode) -> bool: # We may want to look for files in different ways to decide whether to define a # frontier or not: # 1. Only files in current node: # return any(map(lambda child: isinstance(child.entry, FileEntry), node.children)) # 2. Files anywhere in the isochrone graph # stack = [node] # while stack: # current = stack.pop() # if any( # map(lambda child: isinstance(child.entry, FileEntry), current.children)): # return True # else: # # All children are directory entries. # stack.extend(current.children) # return False # 3.
Files in the intermediate directories between current node and any previously # defined frontier: return ( any(map(lambda child: isinstance(child.entry, FileEntry), node.children)) or all( map( lambda child: (not (isinstance(child.entry, DirectoryEntry) and child.date is None)) or has_blobs(child), node.children ) ) ) diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py index 6df3f0b..3868a6c 100644 --- a/swh/provenance/revision.py +++ b/swh/provenance/revision.py @@ -1,184 +1,183 @@ -import threading - -from .archive import ArchiveInterface - from datetime import datetime +import threading from typing import Optional from swh.model.hashutil import hash_to_bytes +from .archive import ArchiveInterface + class RevisionEntry: def __init__( self, archive: ArchiveInterface, id: bytes, date: Optional[datetime] = None, root: Optional[bytes] = None, parents: Optional[list] = None, ): self.archive = archive self.id = id self.date = date self.parents = parents self.root = root def __iter__(self): if self.parents is None: self.parents = [] for parent in self.archive.revision_get([self.id]): if parent is not None: self.parents.append( RevisionEntry( self.archive, parent.id, parents=[ RevisionEntry(self.archive, id) for id in parent.parents ], ) ) return iter(self.parents) ######################################################################################## ######################################################################################## class RevisionIterator: """Iterator interface.""" def __iter__(self): pass def __next__(self): pass class FileRevisionIterator(RevisionIterator): """Iterator over revisions present in the given CSV file.""" def __init__( self, filename: str, archive: ArchiveInterface, limit: Optional[int] = None ): self.file = open(filename) self.idx = 0 self.limit = limit self.mutex = threading.Lock() self.archive = archive def next(self): self.mutex.acquire() line = self.file.readline().strip() if line and (self.limit is None or self.idx < self.limit): self.idx = self.idx + 1 id, date, root = line.strip().split(",") self.mutex.release() return RevisionEntry( self.archive, hash_to_bytes(id), date=datetime.fromisoformat(date), root=hash_to_bytes(root), ) else: self.mutex.release() return None # class ArchiveRevisionIterator(RevisionIterator): # """Iterator over revisions present in the given database.""" # # def __init__(self, conn, limit=None, chunksize=100): # self.cur = conn.cursor() # self.chunksize = chunksize # self.records = [] # if limit is None: # self.cur.execute('''SELECT id, date, committer_date, directory # FROM revision''') # else: # self.cur.execute('''SELECT id, date, committer_date, directory # FROM revision # LIMIT %s''', (limit,)) # for row in self.cur.fetchmany(self.chunksize): # record = self.make_record(row) # if record is not None: # self.records.append(record) # self.mutex = threading.Lock() # # def __del__(self): # self.cur.close() # # def next(self): # self.mutex.acquire() # if not self.records: # self.records.clear() # for row in self.cur.fetchmany(self.chunksize): # record = self.make_record(row) # if record is not None: # self.records.append(record) # # if self.records: # revision, *self.records = self.records # self.mutex.release() # return revision # else: # self.mutex.release() # return None # # def make_record(self, row): # # Only revision with author or committer date are considered # if row[1] is not None: # # If the revision has author date, it takes precedence # return RevisionEntry(row[0], row[1], row[3]) # elif row[2] is 
not None: # # If not, we use the committer date # return RevisionEntry(row[0], row[2], row[3]) ######################################################################################## ######################################################################################## # class RevisionWorker(threading.Thread): # def __init__( # self, # id: int, # conninfo: dict, # archive: ArchiveInterface, # revisions: RevisionIterator # ): # from .provenance import get_provenance # # super().__init__() # self.archive = archive # self.id = id # self.provenance = get_provenance(conninfo) # self.revisions = revisions # # # def run(self): # from .provenance import revision_add # # # while True: # revision = self.revisions.next() # if revision is None: break # # processed = False # while not processed: # logging.info( # f'Thread {( # self.id # )} - Processing revision {( # hash_to_hex(revision.id) # )} (timestamp: {revision.date})' # ) # processed = revision_add(self.provenance, self.archive, revision) # if not processed: # logging.warning( # f'Thread {( # self.id # )} - Failed to process revision {( # hash_to_hex(revision.id) # )} (timestamp: {revision.date})' # ) diff --git a/swh/provenance/storage/archive.py b/swh/provenance/storage/archive.py index b1d9186..003412f 100644 --- a/swh/provenance/storage/archive.py +++ b/swh/provenance/storage/archive.py @@ -1,47 +1,49 @@ -from ..archive import ArchiveInterface +from typing import List # from functools import lru_cache from methodtools import lru_cache -from typing import List + from swh.storage import get_storage +from ..archive import ArchiveInterface + class ArchiveStorage(ArchiveInterface): def __init__(self, cls: str, **kwargs): self.storage = get_storage(cls, **kwargs) @lru_cache(maxsize=1000000) def directory_ls(self, id: bytes): # TODO: filter unused fields return [entry for entry in self.storage.directory_ls(id)] def iter_origins(self): from swh.storage.algos.origin import iter_origins yield from iter_origins(self.storage) def iter_origin_visits(self, origin: str): from swh.storage.algos.origin import iter_origin_visits # TODO: filter unused fields yield from iter_origin_visits(self.storage, origin) def iter_origin_visit_statuses(self, origin: str, visit: int): from swh.storage.algos.origin import iter_origin_visit_statuses # TODO: filter unused fields yield from iter_origin_visit_statuses(self.storage, origin, visit) def release_get(self, ids: List[bytes]): # TODO: filter unused fields yield from self.storage.release_get(ids) def revision_get(self, ids: List[bytes]): # TODO: filter unused fields yield from self.storage.revision_get(ids) def snapshot_get_all_branches(self, snapshot: bytes): from swh.storage.algos.snapshot import snapshot_get_all_branches # TODO: filter unused fields return snapshot_get_all_branches(self.storage, snapshot)
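Taken together, these pieces fit roughly as follows. The sketch below is a minimal, standalone illustration (not code from this diff): the CSV path and storage URL are assumed example values, and the provenance argument stands for any concrete ProvenanceInterface implementation obtained elsewhere.

from swh.provenance.provenance import ProvenanceInterface, revision_add
from swh.provenance.revision import FileRevisionIterator
from swh.provenance.storage.archive import ArchiveStorage

def ingest_revisions(provenance: ProvenanceInterface, csv_path: str) -> None:
    # ArchiveStorage wraps swh.storage; the remote URL here is an assumed example.
    archive = ArchiveStorage("remote", url="http://localhost:5002")
    # FileRevisionIterator expects CSV lines of the form: <rev sha1>,<iso date>,<root sha1>
    revisions = FileRevisionIterator(csv_path, archive)
    while True:
        revision = revisions.next()
        if revision is None:
            break
        # Compute the isochrone frontier for this revision and record its provenance.
        revision_add(provenance, archive, revision)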