diff --git a/.gitignore b/.gitignore
index d3a8e3a..d01f7be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,15 +1,16 @@
 *.csv
 *.egg*
 *.log
 *.prof
 *.zip
+.tox
 .vscode
 __pycache__
 compact.egg.info
 content-revision
 build
 data
 debian
 dumps
 PKG-INFO
 revision-origin
diff --git a/mypy.ini b/mypy.ini
index 46f8db8..5bf1e3b 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,15 +1,18 @@
 [mypy]
 namespace_packages = True
 warn_unused_ignores = True
 
 
 # 3rd party libraries without stubs (yet)
 
 [mypy-pkg_resources.*]
 ignore_missing_imports = True
 
 [mypy-pytest.*]
 ignore_missing_imports = True
 
-# [mypy-add_your_lib_here.*]
-# ignore_missing_imports = True
+[mypy-psycopg2.*]
+ignore_missing_imports = True
+
+[mypy-methodtools.*]
+ignore_missing_imports = True
diff --git a/swh/provenance/origin.py b/swh/provenance/origin.py
index 3bad168..624ac8c 100644
--- a/swh/provenance/origin.py
+++ b/swh/provenance/origin.py
@@ -1,98 +1,100 @@
 from .archive import ArchiveInterface
 from .revision import RevisionEntry
 
+from typing import Optional
+
 from swh.model.model import Origin, ObjectType, TargetType
 
 
 class OriginEntry:
     def __init__(self, url, revisions, id=None):
         self.id = id
         self.url = url
         self.revisions = revisions
 
 
 ################################################################################
 ################################################################################
 
 class OriginIterator:
     """Iterator interface."""
 
     def __iter__(self):
         pass
 
     def __next__(self):
         pass
 
 
 class FileOriginIterator(OriginIterator):
     """Iterator over origins present in the given CSV file."""
 
-    def __init__(self, filename: str, archive: ArchiveInterface, limit: int=None):
+    def __init__(self, filename: str, archive: ArchiveInterface, limit: Optional[int]=None):
         self.file = open(filename)
         self.limit = limit
         # self.mutex = threading.Lock()
         self.archive = archive
 
     def __iter__(self):
         yield from iterate_statuses(
             [Origin(url.strip()) for url in self.file],
             self.archive,
             self.limit
         )
 
 
 class ArchiveOriginIterator:
     """Iterator over origins present in the given storage."""
 
-    def __init__(self, archive: ArchiveInterface, limit: int=None):
+    def __init__(self, archive: ArchiveInterface, limit: Optional[int]=None):
         self.limit = limit
         # self.mutex = threading.Lock()
         self.archive = archive
 
     def __iter__(self):
         yield from iterate_statuses(
             self.archive.iter_origins(),
             self.archive,
             self.limit
         )
 
 
-def iterate_statuses(origins, archive: ArchiveInterface, limit: int=None):
+def iterate_statuses(origins, archive: ArchiveInterface, limit: Optional[int]=None):
     idx = 0
     for origin in origins:
         for visit in archive.iter_origin_visits(origin.url):
             for status in archive.iter_origin_visit_statuses(origin.url, visit.visit):
                 # TODO: may filter only those whose status is 'full'??
                 targets = []
                 releases = []
 
                 snapshot = archive.snapshot_get_all_branches(status.snapshot)
                 if snapshot is not None:
                     for branch in snapshot.branches:
                         if snapshot.branches[branch].target_type == TargetType.REVISION:
                             targets.append(snapshot.branches[branch].target)
 
                         elif snapshot.branches[branch].target_type == TargetType.RELEASE:
                             releases.append(snapshot.branches[branch].target)
 
                 # This is done to keep the query in release_get small, hence avoiding a timeout.
                 limit = 100
                 for i in range(0, len(releases), limit):
                     for release in archive.release_get(releases[i:i+limit]):
-                        if revision is not None:
+                        if release is not None:
                             if release.target_type == ObjectType.REVISION:
                                 targets.append(release.target)
 
                 # This is done to keep the query in revision_get small, hence avoiding a timeout.
                 revisions = []
                 limit = 100
                 for i in range(0, len(targets), limit):
                     for revision in archive.revision_get(targets[i:i+limit]):
                         if revision is not None:
                             parents = list(map(lambda id: RevisionEntry(archive, id), revision.parents))
                             revisions.append(RevisionEntry(archive, revision.id, parents=parents))
 
                 yield OriginEntry(status.origin, revisions)
 
                 idx = idx + 1
                 if idx == limit: return
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
index 0005fa8..525821b 100644
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -1,435 +1,435 @@
 import itertools
 import logging
 import os
 import psycopg2
 import psycopg2.extras
 
 from ..model import DirectoryEntry, FileEntry
 from ..origin import OriginEntry
 from .db_utils import connect, execute_sql
 from ..provenance import ProvenanceInterface
 from ..revision import RevisionEntry
 
 from datetime import datetime
 from pathlib import PosixPath
-from typing import Dict, List
+from typing import Any, Dict, List
 
 from swh.model.hashutil import hash_to_hex
 
 
 def normalize(path: PosixPath) -> PosixPath:
     spath = str(path)
     if spath.startswith('./'):
         return PosixPath(spath[2:])
     return path
 
 
 def create_database(
     conn: psycopg2.extensions.connection,
     conninfo: dict,
     name: str
 ):
     conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
 
     # Normalize dbname to avoid issues when reconnecting below
     name = name.casefold()
 
     # Create new database dropping previous one if exists
     cursor = conn.cursor()
     cursor.execute(f'''DROP DATABASE IF EXISTS {name}''')
     cursor.execute(f'''CREATE DATABASE {name}''')
     conn.close()
 
     # Reconnect to server selecting newly created database to add tables
     conninfo['dbname'] = name
     conn = connect(conninfo)
 
     sqldir = os.path.dirname(os.path.realpath(__file__))
-    execute_sql(conn, os.path.join(sqldir, 'provenance.sql'))
+    execute_sql(conn, PosixPath(os.path.join(sqldir, 'provenance.sql')))
 
 
 ################################################################################
 ################################################################################
 ################################################################################
 
 class ProvenancePostgreSQL(ProvenanceInterface):
     def __init__(self, conn: psycopg2.extensions.connection):
         # TODO: consider adding a mutex for thread safety
         conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
         self.conn = conn
         self.cursor = self.conn.cursor()
-        self.insert_cache = None
-        self.select_cache = None
+        self.insert_cache: Dict[str, Any] = {}
+        self.select_cache: Dict[str, Any] = {}
         self.clear_caches()
 
 
     def clear_caches(self):
         self.insert_cache = {
             "content": dict(),
             "content_early_in_rev": list(),
             "content_in_dir": list(),
             "directory": dict(),
             "directory_in_rev": list(),
             "revision": dict(),
             "revision_before_rev": list(),
             "revision_in_org": list()
         }
         self.select_cache = {
             "content": dict(),
             "directory": dict(),
             "revision": dict()
         }
 
 
     def commit(self):
         result = False
         try:
             self.insert_all()
             # self.conn.commit()
             result = True
 
         # except psycopg2.DatabaseError:
         #     # Database error occurred, rollback all changes
         #     self.conn.rollback()
         #     # TODO: maybe serialize and auto-merge transations.
         #     # The only conflicts are on:
         #     #   - content: we keep the earliest date
         #     #   - directory: we keep the earliest date
         #     #   - content_in_dir: there should be just duplicated entries.
 
         except Exception as error:
             # Unexpected error occurred, rollback all changes and log message
             logging.warning(f'Unexpected error: {error}')
             # self.conn.rollback()
 
         finally:
             self.clear_caches()
 
         return result
 
 
     def content_add_to_directory(
         self,
         directory: DirectoryEntry,
         blob: FileEntry,
         prefix: PosixPath
     ):
         # logging.debug(f'NEW occurrence of content {hash_to_hex(blob.id)} in directory {hash_to_hex(directory.id)} (path: {prefix / blob.name})')
         # self.cursor.execute('''INSERT INTO content_in_dir VALUES (%s,%s,%s)''',
         #                        (blob.id, directory.id, bytes(normalize(prefix / blob.name))))
         self.insert_cache['content_in_dir'].append(
             (blob.id, directory.id, bytes(normalize(prefix / blob.name)))
         )
 
 
     def content_add_to_revision(
         self,
         revision: RevisionEntry,
         blob: FileEntry,
         prefix: PosixPath
     ):
         # logging.debug(f'EARLY occurrence of blob {hash_to_hex(blob.id)} in revision {hash_to_hex(revision.id)} (path: {prefix / blob.name})')
         # self.cursor.execute('''INSERT INTO content_early_in_rev VALUES (%s,%s,%s)''',
         #                        (blob.id, revision.id, bytes(normalize(prefix / blob.name))))
         self.insert_cache['content_early_in_rev'].append(
             (blob.id, revision.id, bytes(normalize(prefix / blob.name)))
         )
 
 
     def content_find_first(self, blobid: str):
         logging.info(f'Retrieving first occurrence of content {hash_to_hex(blobid)}')
         self.cursor.execute('''SELECT blob, rev, date, path
                           FROM content_early_in_rev JOIN revision ON revision.id=content_early_in_rev.rev
                           WHERE content_early_in_rev.blob=%s ORDER BY date, rev, path ASC LIMIT 1''', (blobid,))
         return self.cursor.fetchone()
 
 
     def content_find_all(self, blobid: str):
         logging.info(f'Retrieving all occurrences of content {hash_to_hex(blobid)}')
         self.cursor.execute('''(SELECT blob, rev, date, path
                           FROM content_early_in_rev JOIN revision ON revision.id=content_early_in_rev.rev
                           WHERE content_early_in_rev.blob=%s)
                           UNION
                           (SELECT content_in_rev.blob, content_in_rev.rev, revision.date, content_in_rev.path
                           FROM (SELECT content_in_dir.blob, directory_in_rev.rev,
                                     CASE directory_in_rev.path
                                         WHEN '.' THEN content_in_dir.path
                                         ELSE (directory_in_rev.path || '/' || content_in_dir.path)::unix_path
                                     END AS path
                                 FROM content_in_dir
                                 JOIN directory_in_rev ON content_in_dir.dir=directory_in_rev.dir
                                 WHERE content_in_dir.blob=%s) AS content_in_rev
                           JOIN revision ON revision.id=content_in_rev.rev)
                           ORDER BY date, rev, path''', (blobid, blobid))
                           # POSTGRESQL EXPLAIN
         yield from self.cursor.fetchall()
 
 
     def content_get_early_date(self, blob: FileEntry) -> datetime:
         logging.debug(f'Getting content {hash_to_hex(blob.id)} early date')
         # First check if the date is being modified by current transection.
         date = self.insert_cache['content'].get(blob.id, None)
         if date is None:
             # If not, check whether it's been query before
             date = self.select_cache['content'].get(blob.id, None)
             if date is None:
                 # Otherwise, query the database and cache the value
                 self.cursor.execute('''SELECT date FROM content WHERE id=%s''',
                                        (blob.id,))
                 row = self.cursor.fetchone()
                 date = row[0] if row is not None else None
                 self.select_cache['content'][blob.id] = date
         return date
 
 
     def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]:
         dates = {}
         pending = []
         for blob in blobs:
             # First check if the date is being modified by current transection.
             date = self.insert_cache['content'].get(blob.id, None)
             if date is not None:
                 dates[blob.id] = date
             else:
                 # If not, check whether it's been query before
                 date = self.select_cache['content'].get(blob.id, None)
                 if date is not None:
                     dates[blob.id] = date
                 else:
                     pending.append(blob.id)
         if pending:
             # Otherwise, query the database and cache the values
             values = ', '.join(itertools.repeat('%s', len(pending)))
             self.cursor.execute(f'''SELECT id, date FROM content WHERE id IN ({values})''',
                                     tuple(pending))
             for row in self.cursor.fetchall():
                 dates[row[0]] = row[1]
                 self.select_cache['content'][row[0]] = row[1]
         return dates
 
 
     def content_set_early_date(self, blob: FileEntry, date: datetime):
         # logging.debug(f'EARLY occurrence of blob {hash_to_hex(blob.id)} (timestamp: {date})')
         # self.cursor.execute('''INSERT INTO content VALUES (%s,%s)
         #                        ON CONFLICT (id) DO UPDATE SET date=%s''',
         #                        (blob.id, date, date))
         self.insert_cache['content'][blob.id] = date
 
 
     def directory_add_to_revision(
         self,
         revision: RevisionEntry,
         directory: DirectoryEntry,
         path: PosixPath
     ):
         # logging.debug(f'NEW occurrence of directory {hash_to_hex(directory.id)} on the ISOCHRONE FRONTIER of revision {hash_to_hex(revision.id)} (path: {path})')
         # self.cursor.execute('''INSERT INTO directory_in_rev VALUES (%s,%s,%s)''',
         #                        (directory.id, revision.id, bytes(normalize(path))))
         self.insert_cache['directory_in_rev'].append(
             (directory.id, revision.id, bytes(normalize(path)))
         )
 
 
     def directory_get_date_in_isochrone_frontier(self, directory: DirectoryEntry) -> datetime:
         # logging.debug(f'Getting directory {hash_to_hex(directory.id)} early date')
         # First check if the date is being modified by current transection.
         date = self.insert_cache['directory'].get(directory.id, None)
         if date is None:
             # If not, check whether it's been query before
             date = self.select_cache['directory'].get(directory.id, None)
             if date is None:
                 # Otherwise, query the database and cache the value
                 self.cursor.execute('''SELECT date FROM directory WHERE id=%s''',
                                        (directory.id,))
                 row = self.cursor.fetchone()
                 date = row[0] if row is not None else None
                 self.select_cache['directory'][directory.id] = date
         return date
 
 
     def directory_get_early_dates(self, dirs: List[DirectoryEntry]) -> Dict[bytes, datetime]:
         dates = {}
         pending = []
         for dir in dirs:
             # First check if the date is being modified by current transection.
             date = self.insert_cache['directory'].get(dir.id, None)
             if date is not None:
                 dates[dir.id] = date
             else:
                 # If not, check whether it's been query before
                 date = self.select_cache['directory'].get(dir.id, None)
                 if date is not None:
                     dates[dir.id] = date
                 else:
                     pending.append(dir.id)
         if pending:
             # Otherwise, query the database and cache the values
             values = ', '.join(itertools.repeat('%s', len(pending)))
             self.cursor.execute(f'''SELECT id, date FROM directory WHERE id IN ({values})''',
                                     tuple(pending))
             for row in self.cursor.fetchall():
                 dates[row[0]] = row[1]
                 self.select_cache['directory'][row[0]] = row[1]
         return dates
 
 
     def directory_set_date_in_isochrone_frontier(self, directory: DirectoryEntry, date: datetime):
         # logging.debug(f'EARLY occurrence of directory {hash_to_hex(directory.id)} on the ISOCHRONE FRONTIER (timestamp: {date})')
         # self.cursor.execute('''INSERT INTO directory VALUES (%s,%s)
         #                        ON CONFLICT (id) DO UPDATE SET date=%s''',
         #                        (directory.id, date, date))
         self.insert_cache['directory'][directory.id] = date
 
 
     def insert_all(self):
         # Performe insertions with cached information
         if self.insert_cache['content']:
             psycopg2.extras.execute_values(
                 self.cursor,
                 '''INSERT INTO content(id, date) VALUES %s
                    ON CONFLICT (id) DO
                        UPDATE SET date=LEAST(EXCLUDED.date,content.date)''',
                 self.insert_cache['content'].items()
             )
 
         if self.insert_cache['content_early_in_rev']:
             psycopg2.extras.execute_values(
                 self.cursor,
                 '''INSERT INTO content_early_in_rev VALUES %s
                    ON CONFLICT DO NOTHING''',
                 self.insert_cache['content_early_in_rev']
             )
 
         if self.insert_cache['content_in_dir']:
             psycopg2.extras.execute_values(
                 self.cursor,
                 '''INSERT INTO content_in_dir VALUES %s
                    ON CONFLICT DO NOTHING''',
                 self.insert_cache['content_in_dir']
             )
 
         if self.insert_cache['directory']:
             psycopg2.extras.execute_values(
                 self.cursor,
                 '''INSERT INTO directory(id, date) VALUES %s
                    ON CONFLICT (id) DO
                        UPDATE SET date=LEAST(EXCLUDED.date,directory.date)''',
                 self.insert_cache['directory'].items()
             )
 
         if self.insert_cache['directory_in_rev']:
             psycopg2.extras.execute_values(
                 self.cursor,
                 '''INSERT INTO directory_in_rev VALUES %s
                    ON CONFLICT DO NOTHING''',
                 self.insert_cache['directory_in_rev']
             )
 
         if self.insert_cache['revision']:
             psycopg2.extras.execute_values(
                 self.cursor,
                 '''INSERT INTO revision(id, date) VALUES %s
                    ON CONFLICT (id) DO
                        UPDATE SET date=LEAST(EXCLUDED.date,revision.date)''',
                 self.insert_cache['revision'].items()
             )
 
         if self.insert_cache['revision_before_rev']:
             psycopg2.extras.execute_values(
                 self.cursor,
                 '''INSERT INTO revision_before_rev VALUES %s
                    ON CONFLICT DO NOTHING''',
                 self.insert_cache['revision_before_rev']
             )
 
         if self.insert_cache['revision_in_org']:
             psycopg2.extras.execute_values(
                 self.cursor,
                 '''INSERT INTO revision_in_org VALUES %s
                    ON CONFLICT DO NOTHING''',
                 self.insert_cache['revision_in_org']
             )
 
 
     def origin_get_id(self, origin: OriginEntry) -> int:
         if origin.id is None:
             # Check if current origin is already known and retrieve its internal id.
             self.cursor.execute('''SELECT id FROM origin WHERE url=%s''', (origin.url,))
             row = self.cursor.fetchone()
 
             if row is None:
                 # If the origin is seen for the first time, current revision is
                 # the prefered one.
                 self.cursor.execute('''INSERT INTO origin (url) VALUES (%s) RETURNING id''',
                                        (origin.url,))
                 return self.cursor.fetchone()[0]
             else:
                 return row[0]
         else:
             return origin.id
 
 
     def revision_add(self, revision: RevisionEntry):
         # Add current revision to the compact DB
         self.insert_cache['revision'][revision.id] = revision.date
 
 
     def revision_add_before_revision(self, relative: RevisionEntry, revision: RevisionEntry):
         # self.cursor.execute('''INSERT INTO revision_before_rev VALUES (%s,%s)
         #                        ON CONFLICT DO NOTHING''',
         #                        (revision.id, relative.id))
         self.insert_cache['revision_before_rev'].append((revision.id, relative.id))
 
 
     def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry):
         # self.cursor.execute('''INSERT INTO revision_in_org VALUES (%s,%s)
         #                        ON CONFLICT DO NOTHING''',
         #                        (revision.id, origin.id))
         self.insert_cache['revision_in_org'].append((revision.id, origin.id))
 
 
     def revision_get_early_date(self, revision: RevisionEntry) -> datetime:
         # logging.debug(f'Getting revision {hash_to_hex(revision.id)} early date')
         # First check if the date is being modified by current transection.
         date = self.insert_cache['revision'].get(revision.id, None)
         if date is None:
             # If not, check whether it's been query before
             date = self.select_cache['revision'].get(revision.id, None)
             if date is None:
                 # Otherwise, query the database and cache the value
                 self.cursor.execute('''SELECT date FROM revision WHERE id=%s''',
                                        (revision.id,))
                 row = self.cursor.fetchone()
                 date = row[0] if row is not None else None
                 self.select_cache['revision'][revision.id] = date
         return date
 
 
     def revision_get_prefered_origin(self, revision: RevisionEntry) -> int:
         # TODO: adapt this method to consider cached values
         self.cursor.execute('''SELECT COALESCE(org,0) FROM revision WHERE id=%s''',
                                (revision.id,))
         row = self.cursor.fetchone()
         # None means revision is not in database
         # 0 means revision has no prefered origin
         return row[0] if row is not None and row[0] != 0 else None
 
 
     def revision_in_history(self, revision: RevisionEntry) -> bool:
         # TODO: adapt this method to consider cached values
         self.cursor.execute('''SELECT 1 FROM revision_before_rev WHERE prev=%s''',
                                (revision.id,))
         return self.cursor.fetchone() is not None
 
 
     def revision_set_prefered_origin(self, origin: OriginEntry, revision: RevisionEntry):
         # TODO: adapt this method to consider cached values
         self.cursor.execute('''UPDATE revision SET org=%s WHERE id=%s''',
                                (origin.id, revision.id))
 
 
     def revision_visited(self, revision: RevisionEntry) -> bool:
         # TODO: adapt this method to consider cached values
         self.cursor.execute('''SELECT 1 FROM revision_in_org WHERE rev=%s''',
                                (revision.id,))
         return self.cursor.fetchone() is not None
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py
index 2252024..dd4e72e 100644
--- a/swh/provenance/provenance.py
+++ b/swh/provenance/provenance.py
@@ -1,310 +1,346 @@
 import logging
 
 from .archive import ArchiveInterface
 from .model import DirectoryEntry, FileEntry
 from .origin import OriginEntry
 from .revision import RevisionEntry
 
 from datetime import datetime
 from pathlib import PosixPath
-from typing import Dict, List
+from typing import Dict, List, Optional, Tuple
 
 from swh.model.hashutil import hash_to_hex
 
 
 class ProvenanceInterface:
     def __init__(self, **kwargs):
         raise NotImplementedError
 
 
     def commit(self):
         raise NotImplementedError
 
 
     def content_add_to_directory(self, directory: DirectoryEntry, blob: FileEntry, prefix: PosixPath):
         raise NotImplementedError
 
 
     def content_add_to_revision(self, revision: RevisionEntry, blob: FileEntry, prefix: PosixPath):
         raise NotImplementedError
 
 
     def content_find_first(self, blobid: str):
         raise NotImplementedError
 
 
     def content_find_all(self, blobid: str):
         raise NotImplementedError
 
 
     def content_get_early_date(self, blob: FileEntry) -> datetime:
         raise NotImplementedError
 
 
     def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]:
         raise NotImplementedError
 
 
     def content_set_early_date(self, blob: FileEntry, date: datetime):
         raise NotImplementedError
 
 
     def directory_add_to_revision(self, revision: RevisionEntry, directory: DirectoryEntry, path: PosixPath):
         raise NotImplementedError
 
 
     def directory_get_date_in_isochrone_frontier(self, directory: DirectoryEntry) -> datetime:
         raise NotImplementedError
 
 
     def directory_get_early_dates(self, dirs: List[DirectoryEntry]) -> Dict[bytes, datetime]:
         raise NotImplementedError
 
 
     def directory_set_date_in_isochrone_frontier(self, directory: DirectoryEntry, date: datetime):
         raise NotImplementedError
 
 
     def origin_get_id(self, origin: OriginEntry) -> int:
         raise NotImplementedError
 
 
     def revision_add(self, revision: RevisionEntry):
         raise NotImplementedError
 
 
     def revision_add_before_revision(self, relative: RevisionEntry, revision: RevisionEntry):
         raise NotImplementedError
 
 
     def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry):
         raise NotImplementedError
 
 
     def revision_get_early_date(self, revision: RevisionEntry) -> datetime:
         raise NotImplementedError
 
 
     def revision_get_prefered_origin(self, revision: RevisionEntry) -> int:
         raise NotImplementedError
 
 
     def revision_in_history(self, revision: RevisionEntry) -> bool:
         raise NotImplementedError
 
 
     def revision_set_prefered_origin(self, origin: OriginEntry, revision: RevisionEntry):
         raise NotImplementedError
 
 
     def revision_visited(self, revision: RevisionEntry) -> bool:
         raise NotImplementedError
 
 
 def directory_process_content(
     provenance: ProvenanceInterface,
     directory: DirectoryEntry,
     relative: DirectoryEntry,
     prefix: PosixPath
 ):
     stack = [(directory, prefix)]
 
     while stack:
-        dir, path = stack.pop()
+        current, path = stack.pop()
 
-        for child in iter(dir):
+        for child in iter(current):
             if isinstance(child, FileEntry):
                 # Add content to the relative directory with the computed path.
                 provenance.content_add_to_directory(relative, child, path)
             else:
                 # Recursively walk the child directory.
                 stack.append((child, path / child.name))
 
 
+def directory_update_content(
+    stack: List[Tuple[DirectoryEntry, PosixPath]],
+    provenance: ProvenanceInterface,
+    revision: RevisionEntry,
+    directory: DirectoryEntry,
+    path: PosixPath,
+    subdirs: Optional[List[DirectoryEntry]]=None,
+    blobs: Optional[List[FileEntry]]=None,
+    blobdates: Optional[Dict[bytes, datetime]]=None
+):
+    assert revision.date is not None
+
+    # Init optional parameters if not provided.
+    if subdirs is None:
+        subdirs = [child for child in iter(directory) if isinstance(child, DirectoryEntry)]
+
+    if blobs is None:
+        blobs = [child for child in iter(directory) if isinstance(child, FileEntry)]
+
+    if blobdates is None:
+        blobdates = provenance.content_get_early_dates(blobs)
+
+    # Iterate over blobs updating their date if necessary.
+    for blob in blobs:
+        date = blobdates.get(blob.id, None)
+        if date is None or revision.date < date:
+            provenance.content_set_early_date(blob, revision.date)
+        provenance.content_add_to_revision(revision, blob, path)
+
+    # Push all subdirectories with its corresponding path to analyze them
+    # recursively.
+    for subdir in subdirs:
+        stack.append((subdir, path / subdir.name))
+
+
 def origin_add(
     provenance: ProvenanceInterface,
     origin: OriginEntry
 ):
+    # TODO: refactor to iterate over origin visit statuses and commit only once
+    # per status.
     origin.id = provenance.origin_get_id(origin)
 
     for revision in origin.revisions:
         # logging.info(f'Processing revision {hash_to_hex(revision.id)} from origin {origin.url}')
         origin_add_revision(provenance, origin, revision)
 
         # Commit after each revision
         provenance.commit()      # TODO: verify this!
 
 
 def origin_add_revision(
     provenance: ProvenanceInterface,
     origin: OriginEntry,
     revision: RevisionEntry
 ):
-    stack = [(None, revision)]
+    stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)]
 
     while stack:
-        relative, rev = stack.pop()
+        relative, current = stack.pop()
 
         # Check if current revision has no prefered origin and update if necessary.
-        prefered = provenance.revision_get_prefered_origin(rev)
+        prefered = provenance.revision_get_prefered_origin(current)
         # logging.debug(f'Prefered origin for revision {hash_to_hex(rev.id)}: {prefered}')
 
         if prefered is None:
-            provenance.revision_set_prefered_origin(origin, rev)
+            provenance.revision_set_prefered_origin(origin, current)
         ########################################################################
 
         if relative is None:
             # This revision is pointed directly by the origin.
-            visited = provenance.revision_visited(rev)
-            logging.debug(f'Revision {hash_to_hex(rev.id)} in origin {origin.id}: {visited}')
+            visited = provenance.revision_visited(current)
+            logging.debug(f'Revision {hash_to_hex(current.id)} in origin {origin.id}: {visited}')
 
-            logging.debug(f'Adding revision {hash_to_hex(rev.id)} to origin {origin.id}')
-            provenance.revision_add_to_origin(origin, rev)
+            logging.debug(f'Adding revision {hash_to_hex(current.id)} to origin {origin.id}')
+            provenance.revision_add_to_origin(origin, current)
 
             if not visited:
-                stack.append((rev, rev))
+                stack.append((current, current))
 
         else:
             # This revision is a parent of another one in the history of the
             # relative revision.
-            for parent in iter(rev):
+            for parent in iter(current):
                 visited = provenance.revision_visited(parent)
                 logging.debug(f'Parent {hash_to_hex(parent.id)} in some origin: {visited}')
 
                 if not visited:
                     # The parent revision has never been seen before pointing
                     # directly to an origin.
                     known = provenance.revision_in_history(parent)
                     logging.debug(f'Revision {hash_to_hex(parent.id)} before revision: {known}')
 
                     if known:
                         # The parent revision is already known in some other
                         # revision's history. We should point it directly to
                         # the origin and (eventually) walk its history.
                         logging.debug(f'Adding revision {hash_to_hex(parent.id)} directly to origin {origin.id}')
                         stack.append((None, parent))
                     else:
                         # The parent revision was never seen before. We should
                         # walk its history and associate it with the same
                         # relative revision.
                         logging.debug(f'Adding parent revision {hash_to_hex(parent.id)} to revision {hash_to_hex(relative.id)}')
                         provenance.revision_add_before_revision(relative, parent)
                         stack.append((relative, parent))
                 else:
                     # The parent revision already points to an origin, so its
                     # history was properly processed before. We just need to
                     # make sure it points to the current origin as well.
                     logging.debug(f'Adding parent revision {hash_to_hex(parent.id)} to origin {origin.id}')
                     provenance.revision_add_to_origin(origin, parent)
 
 
 def revision_add(
     provenance: ProvenanceInterface,
     archive: ArchiveInterface,
     revision: RevisionEntry
 ):
+    assert revision.date is not None
+    assert revision.root is not None
+
     # Processed content starting from the revision's root directory
     date = provenance.revision_get_early_date(revision)
     if date is None or revision.date < date:
         provenance.revision_add(revision)
         revision_process_content(
             provenance,
             revision,
             DirectoryEntry(archive, revision.root, PosixPath('.'))
         )
+
     return provenance.commit()
 
 
 def revision_process_content(
     provenance: ProvenanceInterface,
     revision: RevisionEntry,
     directory: DirectoryEntry
 ):
-    date = provenance.directory_get_date_in_isochrone_frontier(directory)
-    stack = [(directory, date, directory.name)]
-    # stack = [(directory, directory.name)]
+    assert revision.date is not None
+
+    stack = [(directory, directory.name)]
 
     while stack:
-        dir, date, path = stack.pop()
-        # dir, path = stack.pop()
-        # date = provenance.directory_get_date_in_isochrone_frontier(dir)
+        # Get next directory to process and query its date right before
+        # processing to be sure we get the most recently updated value.
+        current, path = stack.pop()
+        date = provenance.directory_get_date_in_isochrone_frontier(current)
 
         if date is None:
             # The directory has never been seen on the isochrone graph of a
             # revision. Its children should be checked.
-            blobs = [child for child in iter(dir) if isinstance(child, FileEntry)]
-            dirs = [child for child in iter(dir) if isinstance(child, DirectoryEntry)]
+            blobs = [child for child in iter(current) if isinstance(child, FileEntry)]
+            dirs = [child for child in iter(current) if isinstance(child, DirectoryEntry)]
 
             blobdates = provenance.content_get_early_dates(blobs)
-            # TODO: this will only return timestamps for diretories that were
-            # seen in an isochrone frontier. But a directory may only cointain a
-            # subdirectory whose contents are already known. Which one should be
-            # added to the frontier then (the root or the sub directory)?
             dirdates = provenance.directory_get_early_dates(dirs)
 
+            # Get the list of ids with no duplicates to ensure we have
+            # available dates for all the elements. This prevents takign a
+            # wrong decision when a blob occurres more than once in the same
+            # directory.
             ids = list(dict.fromkeys([child.id for child in blobs + dirs]))
             if ids:
                 dates = list(blobdates.values()) + list(dirdates.values())
 
-                if len(dates) == len(ids) and max(dates) <= revision.date:
-                    # The directory belongs to the isochrone frontier of the
-                    # current revision, and this is the first time it appears
-                    # as such.
-                    provenance.directory_set_date_in_isochrone_frontier(dir, max(dates))
-                    provenance.directory_add_to_revision(revision, dir, path)
-                    directory_process_content(
-                        provenance,
-                        directory=dir,
-                        relative=dir,
-                        prefix=PosixPath('.')
-                    )
+                if len(dates) == len(ids):
+                    # All child nodes of current directory are already known.
+                    maxdate = max(dates) < revision.date
+
+                    if maxdate < revision.date:
+                        # The directory belongs to the outer isochrone frontier
+                        # of the current revision, and this is the first time
+                        # it appears as such.
+                        # TODO: maybe the parent directory is the one that has
+                        # to be added to the frontier instead!
+                        provenance.directory_set_date_in_isochrone_frontier(current, maxdate)
+                        provenance.directory_add_to_revision(revision, current, path)
+                        directory_process_content(
+                            provenance,
+                            directory=current,
+                            relative=current,
+                            prefix=PosixPath('.')
+                        )
+
+                    else:
+                        # This revision is either in the inner frontiern or out
+                        # of order. All the children from the current directory
+                        # should be analyzed (and timestamps eventually
+                        # updated) yet current.
+                        directory_update_content(
+                            stack, provenance, revision, current, path,
+                            subdirs=dirs, blobs=blobs, blobdates=blobdates)
 
                 else:
-                    # The directory is not on the isochrone frontier of the
-                    # current revision. Its child nodes should be analyzed.
-                    ############################################################
-                    for child in blobs:
-                        date = blobdates.get(child.id, None)
-                        # date = provenance.content_get_early_date(child)
-                        if date is None or revision.date < date:
-                            provenance.content_set_early_date(child, revision.date)
-                        provenance.content_add_to_revision(revision, child, path)
-
-                    for child in dirs:
-                        date = dirdates.get(child.id, None)
-                        # date = provenance.directory_get_date_in_isochrone_frontier(child)
-                        stack.append((child, date, path / child.name))
-                        # stack.append((child, path / child.name))
-                    ############################################################
+                    # Al least one child node is known, ie. the directory is
+                    # not on the outer isochrone frontier of the current
+                    # revision. Its child nodes should be analyzed and current
+                    # directory updated before them.
+                    # FIXME: I believe the only different between this branche
+                    # and the two 'else' cases above is this push to the stack.
+                    # If so, we might refactor this to avoid so many branches.
+                    # stack.append((current, path))
+                    # TODO: to uncomment the above line is it necessary to
+                    # postpone the adding of dictories to the isochrone
+                    # frontier from the branch above (maxdate < revision.date).
+                    directory_update_content(
+                        stack, provenance, revision, current, path,
+                        subdirs=dirs, blobs=blobs, blobdates=blobdates)
 
         elif revision.date < date:
             # The directory has already been seen on the isochrone frontier of
             # a revision, but current revision is earlier. Its children should
             # be updated.
-            blobs = [child for child in iter(dir) if isinstance(child, FileEntry)]
-            dirs = [child for child in iter(dir) if isinstance(child, DirectoryEntry)]
-
-            blobdates = provenance.content_get_early_dates(blobs)
-            dirdates = provenance.directory_get_early_dates(dirs)
-
-            ####################################################################
-            for child in blobs:
-                # date = blobdates.get(child.id, None)
-                date = provenance.content_get_early_date(child)
-                if date is None or revision.date < date:
-                    provenance.content_set_early_date(child, revision.date)
-                provenance.content_add_to_revision(revision, child, path)
-
-            for child in dirs:
-                # date = dirdates.get(child.id, None)
-                date = provenance.directory_get_date_in_isochrone_frontier(child)
-                stack.append((child, date, path / child.name))
-                # stack.append((child, path / child.name))
-            ####################################################################
-
-            provenance.directory_set_date_in_isochrone_frontier(dir, revision.date)
+            directory_update_content(stack, provenance, revision, current, path)
+            provenance.directory_set_date_in_isochrone_frontier(current, revision.date)
 
         else:
-            # The directory has already been seen on the isochrone frontier of
-            # an earlier revision. Just add it to the current revision.
-            provenance.directory_add_to_revision(revision, dir, path)
+            # The directory has already been seen on the outer isochrone
+            # frontier of an earlier revision. Just add it to the current
+            # revision.
+            provenance.directory_add_to_revision(revision, current, path)
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py
index d32a25d..4701bfe 100644
--- a/swh/provenance/revision.py
+++ b/swh/provenance/revision.py
@@ -1,167 +1,168 @@
 import logging
 import threading
 
 from .archive import ArchiveInterface
 
 from datetime import datetime
+from typing import Optional
 
 from swh.model.hashutil import hash_to_bytes, hash_to_hex
 
 
 class RevisionEntry:
     def __init__(
         self,
         archive: ArchiveInterface,
         id: bytes,
-        date: datetime=None,
-        root: bytes=None,
-        parents: list=None
+        date: Optional[datetime]=None,
+        root: Optional[bytes]=None,
+        parents: Optional[list]=None
     ):
         self.archive = archive
         self.id = id
         self.date = date
         self.parents = parents
         self.root = root
 
     def __iter__(self):
         if self.parents is None:
             self.parents = []
             for parent in self.archive.revision_get([self.id]):
                 if parent is not None:
                     self.parents.append(
                         RevisionEntry(
                             self.archive,
                             parent.id,
                             parents=[RevisionEntry(self.archive, id) for id in parent.parents]
                         )
                     )
 
         return iter(self.parents)
 
 
 ################################################################################
 ################################################################################
 
 class RevisionIterator:
     """Iterator interface."""
 
     def __iter__(self):
         pass
 
     def __next__(self):
         pass
 
 
 class FileRevisionIterator(RevisionIterator):
     """Iterator over revisions present in the given CSV file."""
 
-    def __init__(self, filename: str, archive: ArchiveInterface, limit: int=None):
+    def __init__(self, filename: str, archive: ArchiveInterface, limit: Optional[int]=None):
         self.file = open(filename)
         self.idx = 0
         self.limit = limit
         self.mutex = threading.Lock()
         self.archive = archive
 
     def next(self):
         self.mutex.acquire()
         line = self.file.readline().strip()
         if line and (self.limit is None or self.idx < self.limit):
             self.idx = self.idx + 1
             id, date, root = line.strip().split(',')
             self.mutex.release()
 
             return RevisionEntry(
                 self.archive,
                 hash_to_bytes(id),
                 date=datetime.fromisoformat(date),
                 root=hash_to_bytes(root)
             )
         else:
             self.mutex.release()
             return None
 
 
 # class ArchiveRevisionIterator(RevisionIterator):
 #     """Iterator over revisions present in the given database."""
 #
 #     def __init__(self, conn, limit=None, chunksize=100):
 #         self.cur = conn.cursor()
 #         self.chunksize = chunksize
 #         self.records = []
 #         if limit is None:
 #             self.cur.execute('''SELECT id, date, committer_date, directory
 #                             FROM revision''')
 #         else:
 #             self.cur.execute('''SELECT id, date, committer_date, directory
 #                             FROM revision
 #                             LIMIT %s''', (limit,))
 #         for row in self.cur.fetchmany(self.chunksize):
 #             record = self.make_record(row)
 #             if record is not None:
 #                 self.records.append(record)
 #         self.mutex = threading.Lock()
 #
 #     def __del__(self):
 #         self.cur.close()
 #
 #     def next(self):
 #         self.mutex.acquire()
 #         if not self.records:
 #             self.records.clear()
 #             for row in self.cur.fetchmany(self.chunksize):
 #                 record = self.make_record(row)
 #                 if record is not None:
 #                     self.records.append(record)
 #
 #         if self.records:
 #             revision, *self.records = self.records
 #             self.mutex.release()
 #             return revision
 #         else:
 #             self.mutex.release()
 #             return None
 #
 #     def make_record(self, row):
 #         # Only revision with author or commiter date are considered
 #         if row[1] is not None:
 #             # If the revision has author date, it takes precedence
 #             return RevisionEntry(row[0], row[1], row[3])
 #         elif row[2] is not None:
 #             # If not, we use the commiter date
 #             return RevisionEntry(row[0], row[2], row[3])
 
 
 ################################################################################
 ################################################################################
 
 # class RevisionWorker(threading.Thread):
 #     def __init__(
 #         self,
 #         id: int,
 #         conninfo: dict,
 #         archive: ArchiveInterface,
 #         revisions: RevisionIterator
 #     ):
 #         from .provenance import get_provenance
 #
 #         super().__init__()
 #         self.archive = archive
 #         self.id = id
 #         self.provenance = get_provenance(conninfo)
 #         self.revisions = revisions
 #
 #
 #     def run(self):
 #         from .provenance import revision_add
 #
 #
 #         while True:
 #             revision = self.revisions.next()
 #             if revision is None: break
 #
 #             processed = False
 #             while not processed:
 #                 logging.info(f'Thread {self.id} - Processing revision {hash_to_hex(revision.id)} (timestamp: {revision.date})')
 #                 processed = revision_add(self.provenance, self.archive, revision)
 #                 if not processed:
 #                     logging.warning(f'Thread {self.id} - Failed to process revision {hash_to_hex(revision.id)} (timestamp: {revision.date})')