diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py index 267ba6c..b3455d4 100644 --- a/swh/provenance/postgresql/provenance.py +++ b/swh/provenance/postgresql/provenance.py @@ -1,505 +1,505 @@ import itertools import logging import operator import os import psycopg2 import psycopg2.extras from ..model import DirectoryEntry, FileEntry from ..origin import OriginEntry from .db_utils import connect, execute_sql from ..provenance import ProvenanceInterface from ..revision import RevisionEntry from datetime import datetime from typing import Any, Dict, Generator, List, Optional, Tuple def normalize(path: bytes) -> bytes: return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) # Normalize dbname to avoid issues when reconnecting below name = name.casefold() # Create a new database, dropping the previous one if it exists cursor = conn.cursor() cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") cursor.execute(f"""CREATE DATABASE {name}""") conn.close() # Reconnect to the server, selecting the newly created database, to add tables conninfo["dbname"] = name conn = connect(conninfo) sqldir = os.path.dirname(os.path.realpath(__file__)) execute_sql(conn, os.path.join(sqldir, "provenance.sql")) ######################################################################################## ######################################################################################## ######################################################################################## class ProvenancePostgreSQL(ProvenanceInterface): def __init__(self, conn: psycopg2.extensions.connection): # TODO: consider adding a mutex for thread safety conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) self.conn = conn self.cursor = self.conn.cursor() self.insert_cache: Dict[str, Any] = {} self.remove_cache: Dict[str, Any] = {} self.select_cache: Dict[str, Any] = {} self.clear_caches() def clear_caches(self): self.insert_cache = { "content": dict(), "content_early_in_rev": list(), "content_in_dir": list(), "directory": dict(), "directory_in_rev": list(), "revision": dict(), "revision_before_rev": list(), "revision_in_org": list(), } self.remove_cache = {"directory": dict()} self.select_cache = {"content": dict(), "directory": dict(), "revision": dict()} def commit(self): result = False try: self.insert_all() self.clear_caches() result = True except Exception as error: # An unexpected error occurred; roll back all changes and log a message logging.error(f"Unexpected error: {error}") return result def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_in_dir"].append( (blob.id, directory.id, normalize(os.path.join(prefix, blob.name))) ) def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_early_in_rev"].append( (blob.id, revision.id, normalize(os.path.join(prefix, blob.name))) ) def content_find_first( self, blobid: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: self.cursor.execute( """SELECT content_location.sha1 AS blob, revision.sha1 AS rev, revision.date AS date, content_location.path AS path FROM (SELECT content_hex.sha1, content_hex.rev, location.path FROM (SELECT content.sha1, content_early_in_rev.rev, content_early_in_rev.loc FROM content_early_in_rev JOIN content ON
content.id=content_early_in_rev.blob WHERE content.sha1=%s ) AS content_hex JOIN location ON location.id=content_hex.loc ) AS content_location JOIN revision ON revision.id=content_location.rev ORDER BY date, rev, path ASC LIMIT 1""", (blobid,), ) return self.cursor.fetchone() def content_find_all( self, blobid: bytes ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: self.cursor.execute( """(SELECT content_location.sha1 AS blob, revision.sha1 AS rev, revision.date AS date, content_location.path AS path FROM (SELECT content_hex.sha1, content_hex.rev, location.path FROM (SELECT content.sha1, content_early_in_rev.rev, content_early_in_rev.loc FROM content_early_in_rev JOIN content ON content.id=content_early_in_rev.blob WHERE content.sha1=%s ) AS content_hex JOIN location ON location.id=content_hex.loc ) AS content_location JOIN revision ON revision.id=content_location.rev ) UNION (SELECT content_prefix.sha1 AS blob, revision.sha1 AS rev, revision.date AS date, content_prefix.path AS path FROM (SELECT content_in_rev.sha1, content_in_rev.rev, CASE location.path WHEN '' THEN content_in_rev.suffix WHEN '.' THEN content_in_rev.suffix ELSE (location.path || '/' || content_in_rev.suffix)::unix_path END AS path FROM (SELECT content_suffix.sha1, directory_in_rev.rev, directory_in_rev.loc, content_suffix.path AS suffix FROM (SELECT content_hex.sha1, content_hex.dir, location.path FROM (SELECT content.sha1, content_in_dir.dir, content_in_dir.loc FROM content_in_dir JOIN content ON content_in_dir.blob=content.id WHERE content.sha1=%s ) AS content_hex JOIN location ON location.id=content_hex.loc ) AS content_suffix JOIN directory_in_rev ON directory_in_rev.dir=content_suffix.dir ) AS content_in_rev JOIN location ON location.id=content_in_rev.loc ) AS content_prefix JOIN revision ON revision.id=content_prefix.rev ) ORDER BY date, rev, path""", (blobid, blobid), ) # TODO: use PostgreSQL's EXPLAIN to look for query optimizations. yield from self.cursor.fetchall() def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: # First check if the date is being modified by the current transaction. date = self.insert_cache["content"].get(blob.id, None) if date is None: # If not, check whether it's been queried before date = self.select_cache["content"].get(blob.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM content WHERE sha1=%s""", (blob.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["content"][blob.id] = date return date def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: dates = {} pending = [] for blob in blobs: # First check if the date is being modified by the current transaction.
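# Lookup order, same as in content_get_early_date above:
#   1. insert_cache: dates set by the current transaction but not yet flushed;
#   2. select_cache: dates previously read back from the database;
#   3. the database itself, queried below in a single batch for the pending ids.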
date = self.insert_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: # If not, check whether it's been queried before date = self.select_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: pending.append(blob.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT sha1, date FROM content WHERE sha1 IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["content"][row[0]] = row[1] return dates def content_set_early_date(self, blob: FileEntry, date: datetime): self.insert_cache["content"][blob.id] = date def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ): self.insert_cache["directory_in_rev"].append( (directory.id, revision.id, normalize(path)) ) def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: # First check if the date is being modified by the current transaction. date = self.insert_cache["directory"].get(directory.id, None) if date is None and directory.id not in self.remove_cache["directory"]: # If not, check whether it's been queried before date = self.select_cache["directory"].get(directory.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM directory WHERE sha1=%s""", (directory.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["directory"][directory.id] = date return date def directory_get_dates_in_isochrone_frontier( self, dirs: List[DirectoryEntry] ) -> Dict[bytes, datetime]: dates = {} pending = [] for directory in dirs: # First check if the date is being modified by the current transaction.
date = self.insert_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date elif directory.id not in self.remove_cache["directory"]: # If not, check whether it's been queried before date = self.select_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date else: pending.append(directory.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT sha1, date FROM directory WHERE sha1 IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["directory"][row[0]] = row[1] return dates def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): self.remove_cache["directory"][directory.id] = None self.insert_cache["directory"].pop(directory.id, None) def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ): self.insert_cache["directory"][directory.id] = date self.remove_cache["directory"].pop(directory.id, None) def insert_all(self): # Perform insertions with cached information if self.insert_cache["content"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY content; INSERT INTO content(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,content.date)""", self.insert_cache["content"].items(), ) self.insert_cache["content"].clear() if self.insert_cache["directory"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY directory; INSERT INTO directory(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,directory.date)""", self.insert_cache["directory"].items(), ) self.insert_cache["directory"].clear() if self.insert_cache["revision"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY revision; INSERT INTO revision(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,revision.date)""", self.insert_cache["revision"].items(), ) self.insert_cache["revision"].clear() # Relations should come after the ids of their elements have been resolved if self.insert_cache["content_early_in_rev"]: self.insert_location("content", "revision", "content_early_in_rev") if self.insert_cache["content_in_dir"]: self.insert_location("content", "directory", "content_in_dir") if self.insert_cache["directory_in_rev"]: self.insert_location("directory", "revision", "directory_in_rev") # if self.insert_cache["revision_before_rev"]: # psycopg2.extras.execute_values( # self.cursor, # """INSERT INTO revision_before_rev VALUES %s # ON CONFLICT DO NOTHING""", # self.insert_cache["revision_before_rev"], # ) # self.insert_cache["revision_before_rev"].clear() # if self.insert_cache["revision_in_org"]: # psycopg2.extras.execute_values( # self.cursor, # """INSERT INTO revision_in_org VALUES %s # ON CONFLICT DO NOTHING""", # self.insert_cache["revision_in_org"], # ) # self.insert_cache["revision_in_org"].clear() def insert_location(self, src0_table, src1_table, dst_table): # Resolve src0 ids src0_values = dict().fromkeys( map(operator.itemgetter(0), self.insert_cache[dst_table]) ) values = ", ".join(itertools.repeat("%s", len(src0_values))) self.cursor.execute( f"""SELECT sha1, id FROM {src0_table} WHERE sha1 IN ({values})""", tuple(src0_values), ) src0_values = dict(self.cursor.fetchall()) # Resolve src1 ids src1_values = dict().fromkeys( map(operator.itemgetter(1), self.insert_cache[dst_table]) ) values = ", ".join(itertools.repeat("%s", len(src1_values)))
self.cursor.execute( f"""SELECT sha1, id FROM {src1_table} WHERE sha1 IN ({values})""", tuple(src1_values), ) src1_values = dict(self.cursor.fetchall()) # Resolve location ids location = dict().fromkeys( map(operator.itemgetter(2), self.insert_cache[dst_table]) ) location = dict( psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY location; INSERT INTO location(path) VALUES %s ON CONFLICT (path) DO UPDATE SET path=EXCLUDED.path RETURNING path, id""", map(lambda path: (path,), location.keys()), fetch=True, ) ) # Insert values in dst_table rows = map( lambda row: (src0_values[row[0]], src1_values[row[1]], location[row[2]]), self.insert_cache[dst_table], ) psycopg2.extras.execute_values( self.cursor, f"""INSERT INTO {dst_table} VALUES %s ON CONFLICT DO NOTHING""", rows, ) self.insert_cache[dst_table].clear() def origin_get_id(self, origin: OriginEntry) -> int: if origin.id is None: # Insert origin in the DB and return the assigned id self.cursor.execute( """INSERT INTO origin (url) VALUES (%s) ON CONFLICT DO NOTHING RETURNING id""", (origin.url,), ) return self.cursor.fetchone()[0] else: return origin.id def revision_add(self, revision: RevisionEntry): # Add current revision to the compact DB self.insert_cache["revision"][revision.id] = revision.date def revision_add_before_revision( self, relative: RevisionEntry, revision: RevisionEntry ): self.insert_cache["revision_before_rev"].append((revision.id, relative.id)) def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): self.insert_cache["revision_in_org"].append((revision.id, origin.id)) def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: date = self.insert_cache["revision"].get(revision.id, None) if date is None: # If not, check whether it's been queried before date = self.select_cache["revision"].get(revision.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM revision WHERE sha1=%s""", (revision.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["revision"][revision.id] = date return date - def revision_get_prefered_origin(self, revision: RevisionEntry) -> int: + def revision_get_preferred_origin(self, revision: RevisionEntry) -> int: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT COALESCE(org,0) FROM revision WHERE sha1=%s""", (revision.id,) ) row = self.cursor.fetchone() # None means revision is not in database; - # 0 means revision has no prefered origin + # 0 means revision has no preferred origin return row[0] if row is not None and row[0] != 0 else None def revision_in_history(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_before_rev JOIN revision ON revision.id=revision_before_rev.prev WHERE revision.sha1=%s""", (revision.id,), ) return self.cursor.fetchone() is not None - def revision_set_prefered_origin( + def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ): # TODO: adapt this method to consider cached values self.cursor.execute( """UPDATE revision SET org=%s WHERE sha1=%s""", (origin.id, revision.id) ) def revision_visited(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_in_org JOIN revision ON revision.id=revision_in_org.rev WHERE revision.sha1=%s""", (revision.id,), ) return self.cursor.fetchone() is not None
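Note: content_get_early_dates and insert_location above both build their IN (...) clauses by repeating the %s placeholder once per value and letting psycopg2 bind the arguments. A minimal, self-contained sketch of the same pattern (the cursor is assumed to be an open psycopg2 cursor; this is an illustration, not code from the diff):

    import itertools

    def get_dates(cursor, sha1s):
        # One "%s" per key, joined into a single IN (...) clause; psycopg2
        # then binds each bytes value safely, with no manual escaping.
        values = ", ".join(itertools.repeat("%s", len(sha1s)))
        cursor.execute(
            f"SELECT sha1, date FROM content WHERE sha1 IN ({values})",
            tuple(sha1s),
        )
        return dict(cursor.fetchall())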
diff --git a/swh/provenance/postgresql/provenance.sql b/swh/provenance/postgresql/provenance.sql index 80e2271..dda567a 100644 --- a/swh/provenance/postgresql/provenance.sql +++ b/swh/provenance/postgresql/provenance.sql @@ -1,150 +1,150 @@ -- a Git object ID, i.e., a Git-style salted SHA1 checksum drop domain if exists sha1_git cascade; create domain sha1_git as bytea check (length(value) = 20); -- UNIX path (absolute, relative, individual path component, etc.) drop domain if exists unix_path cascade; create domain unix_path as bytea; drop table if exists content; create table content ( id bigserial primary key, -- internal identifier of the content blob sha1 sha1_git unique not null, -- intrinsic identifier of the content blob date timestamptz not null -- timestamp of the revision where the blob first appears ); comment on column content.id is 'Content internal identifier'; comment on column content.sha1 is 'Content intrinsic identifier'; comment on column content.date is 'Earliest timestamp for the content (first seen time)'; drop table if exists content_early_in_rev; create table content_early_in_rev ( blob bigint not null, -- internal identifier of the content blob rev bigint not null, -- internal identifier of the revision where the blob appears for the first time loc bigint not null, -- location of the content relative to the revision root directory primary key (blob, rev, loc) -- foreign key (blob) references content (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) ); comment on column content_early_in_rev.blob is 'Content internal identifier'; comment on column content_early_in_rev.rev is 'Revision internal identifier'; comment on column content_early_in_rev.loc is 'Location of content in revision'; drop table if exists content_in_dir; create table content_in_dir ( blob bigint not null, -- internal identifier of the content blob dir bigint not null, -- internal identifier of the directory containing the blob loc bigint not null, -- location of the content relative to its parent directory in the isochrone frontier primary key (blob, dir, loc) -- foreign key (blob) references content (id), -- foreign key (dir) references directory (id), -- foreign key (loc) references location (id) ); comment on column content_in_dir.blob is 'Content internal identifier'; comment on column content_in_dir.dir is 'Directory internal identifier'; comment on column content_in_dir.loc is 'Location of content in directory'; drop table if exists directory; create table directory ( id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier sha1 sha1_git unique not null, -- intrinsic identifier of the directory date timestamptz not null -- max timestamp among those of the directory's children ); comment on column directory.id is 'Directory internal identifier'; comment on column directory.sha1 is 'Directory intrinsic identifier'; comment on column directory.date is 'Latest timestamp for the content in the directory'; drop table if exists directory_in_rev; create table directory_in_rev ( dir bigint not null, -- internal identifier of the directory appearing in the revision rev bigint not null, -- internal identifier of the revision containing the directory loc bigint not null, -- location of the directory relative to the revision root directory primary key (dir, rev, loc) -- foreign key (dir) references directory (id), -- foreign key (rev) references revision (id), -- foreign key (loc) references location (id) ); comment on column directory_in_rev.dir is 'Directory internal identifier'; comment on column directory_in_rev.rev is 'Revision internal identifier'; comment on column directory_in_rev.loc is 'Location of directory in revision'; drop table if exists location; create table location ( id bigserial primary key, -- internal identifier of the location path unix_path unique not null -- path to the location ); comment on column location.id is 'Location internal identifier'; comment on column location.path is 'Path to the location'; drop table if exists origin; create table origin ( id bigserial primary key, -- internal identifier of the origin url unix_path unique not null -- url of the origin ); comment on column origin.id is 'Origin internal identifier'; comment on column origin.url is 'URL of the origin'; drop table if exists revision; create table revision ( id bigserial primary key, -- internal identifier of the revision sha1 sha1_git unique not null, -- intrinsic identifier of the revision date timestamptz not null, -- timestamp of the revision - org bigint -- id of the prefered origin + org bigint -- id of the preferred origin -- foreign key (org) references origin (id) ); comment on column revision.id is 'Revision internal identifier'; comment on column revision.sha1 is 'Revision intrinsic identifier'; comment on column revision.date is 'Revision timestamp'; -comment on column revision.org is 'Prefered origin for the revision'; +comment on column revision.org is 'Preferred origin for the revision'; drop table if exists revision_before_rev; create table revision_before_rev ( prev bigserial not null, -- internal identifier of the source revision next bigserial not null, -- internal identifier of the destination revision primary key (prev, next) -- foreign key (prev) references revision (id), -- foreign key (next) references revision (id) ); comment on column revision_before_rev.prev is 'Source revision internal identifier'; comment on column revision_before_rev.next is 'Destination revision internal identifier'; drop table if exists revision_in_org; create table revision_in_org ( rev bigint not null, -- internal identifier of the revision pointed to by the origin org bigint not null, -- internal identifier of the origin that points to the revision primary key (rev, org) -- foreign key (rev) references revision (id), -- foreign key (org) references origin (id) ); comment on column revision_in_org.rev is 'Revision internal identifier'; comment on column revision_in_org.org is 'Origin internal identifier';
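Note: the sha1_git domain above rejects any value that is not exactly 20 bytes long. A small demonstration through psycopg2 (the connection parameters are hypothetical, and psycopg2 >= 2.8 is assumed for the errors module):

    import psycopg2

    conn = psycopg2.connect(dbname="provenance")  # hypothetical conninfo
    cursor = conn.cursor()
    # A 20-byte value satisfies the domain and is accepted.
    cursor.execute("INSERT INTO content(sha1, date) VALUES (%s, now())", (b"\x00" * 20,))
    try:
        # A 19-byte value violates check (length(value) = 20) and is rejected.
        cursor.execute("INSERT INTO content(sha1, date) VALUES (%s, now())", (b"\x01" * 19,))
    except psycopg2.errors.CheckViolation as error:
        conn.rollback()
        print(error)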
diff --git a/swh/provenance/postgresql_nopath/provenance.py b/swh/provenance/postgresql_nopath/provenance.py index 7728c8d..391770f 100644 --- a/swh/provenance/postgresql_nopath/provenance.py +++ b/swh/provenance/postgresql_nopath/provenance.py @@ -1,442 +1,442 @@ import itertools import logging import operator import os import psycopg2 import psycopg2.extras from ..model import DirectoryEntry, FileEntry from ..origin import OriginEntry from ..postgresql.db_utils import connect, execute_sql from ..provenance import ProvenanceInterface from ..revision import RevisionEntry from datetime import datetime from typing import Any, Dict, Generator, List, Optional, Tuple def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) # Normalize dbname to avoid issues when reconnecting below name = name.casefold() # Create a new database, dropping the previous one if it exists cursor = conn.cursor()
cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") cursor.execute(f"""CREATE DATABASE {name}""") conn.close() # Reconnect to the server, selecting the newly created database, to add tables conninfo["dbname"] = name conn = connect(conninfo) sqldir = os.path.dirname(os.path.realpath(__file__)) execute_sql(conn, os.path.join(sqldir, "provenance.sql")) ######################################################################################## ######################################################################################## ######################################################################################## class ProvenancePostgreSQLNoPath(ProvenanceInterface): def __init__(self, conn: psycopg2.extensions.connection): # TODO: consider adding a mutex for thread safety conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) self.conn = conn self.cursor = self.conn.cursor() self.insert_cache: Dict[str, Any] = {} self.remove_cache: Dict[str, Any] = {} self.select_cache: Dict[str, Any] = {} self.clear_caches() def clear_caches(self): self.insert_cache = { "content": dict(), "content_early_in_rev": set(), "content_in_dir": set(), "directory": dict(), "directory_in_rev": set(), "revision": dict(), "revision_before_rev": list(), "revision_in_org": list(), } self.remove_cache = {"directory": dict()} self.select_cache = {"content": dict(), "directory": dict(), "revision": dict()} def commit(self): result = False try: self.insert_all() self.clear_caches() result = True except Exception as error: # An unexpected error occurred; roll back all changes and log a message logging.error(f"Unexpected error: {error}") return result def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_in_dir"].add((blob.id, directory.id)) def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ): self.insert_cache["content_early_in_rev"].add((blob.id, revision.id)) def content_find_first( self, blobid: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: self.cursor.execute( """SELECT revision.sha1 AS rev, revision.date AS date FROM (SELECT content_early_in_rev.rev FROM content_early_in_rev JOIN content ON content.id=content_early_in_rev.blob WHERE content.sha1=%s ) AS content_in_rev JOIN revision ON revision.id=content_in_rev.rev ORDER BY date, rev ASC LIMIT 1""", (blobid,), ) row = self.cursor.fetchone() if row is not None: # TODO: query revision from the archive and look for blobid in a # recursive directory_ls of the revision's root. return blobid, row[0], row[1], b"" return None def content_find_all( self, blobid: bytes ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: self.cursor.execute( """(SELECT revision.sha1 AS rev, revision.date AS date FROM (SELECT content_early_in_rev.rev FROM content_early_in_rev JOIN content ON content.id=content_early_in_rev.blob WHERE content.sha1=%s ) AS content_in_rev JOIN revision ON revision.id=content_in_rev.rev ) UNION (SELECT revision.sha1 AS rev, revision.date AS date FROM (SELECT directory_in_rev.rev FROM (SELECT content_in_dir.dir FROM content_in_dir JOIN content ON content_in_dir.blob=content.id WHERE content.sha1=%s ) AS content_dir JOIN directory_in_rev ON directory_in_rev.dir=content_dir.dir ) AS content_in_rev JOIN revision ON revision.id=content_in_rev.rev ) ORDER BY date, rev""", (blobid, blobid), ) # TODO: use PostgreSQL's EXPLAIN to look for query optimizations.
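        # A hypothetical debugging aid, not part of this change: run the same
        # statement under EXPLAIN ANALYZE and log the resulting plan, e.g.
        #   self.cursor.execute("EXPLAIN ANALYZE (...)", (blobid, blobid))
        #   for (line,) in self.cursor.fetchall():
        #       logging.debug(line)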
for row in self.cursor.fetchall(): # TODO: query revision from the archive and look for blobid in a # recursive directory_ls of the revision's root. yield blobid, row[0], row[1], b"" def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: # First check if the date is being modified by the current transaction. date = self.insert_cache["content"].get(blob.id, None) if date is None: # If not, check whether it's been queried before date = self.select_cache["content"].get(blob.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM content WHERE sha1=%s""", (blob.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["content"][blob.id] = date return date def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: dates = {} pending = [] for blob in blobs: # First check if the date is being modified by the current transaction. date = self.insert_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: # If not, check whether it's been queried before date = self.select_cache["content"].get(blob.id, None) if date is not None: dates[blob.id] = date else: pending.append(blob.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT sha1, date FROM content WHERE sha1 IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["content"][row[0]] = row[1] return dates def content_set_early_date(self, blob: FileEntry, date: datetime): self.insert_cache["content"][blob.id] = date def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ): self.insert_cache["directory_in_rev"].add((directory.id, revision.id)) def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: # First check if the date is being modified by the current transaction. date = self.insert_cache["directory"].get(directory.id, None) if date is None and directory.id not in self.remove_cache["directory"]: # If not, check whether it's been queried before date = self.select_cache["directory"].get(directory.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM directory WHERE sha1=%s""", (directory.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["directory"][directory.id] = date return date def directory_get_dates_in_isochrone_frontier( self, dirs: List[DirectoryEntry] ) -> Dict[bytes, datetime]: dates = {} pending = [] for directory in dirs: # First check if the date is being modified by the current transaction.
date = self.insert_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date elif directory.id not in self.remove_cache["directory"]: # If not, check whether it's been queried before date = self.select_cache["directory"].get(directory.id, None) if date is not None: dates[directory.id] = date else: pending.append(directory.id) if pending: # Otherwise, query the database and cache the values values = ", ".join(itertools.repeat("%s", len(pending))) self.cursor.execute( f"""SELECT sha1, date FROM directory WHERE sha1 IN ({values})""", tuple(pending), ) for row in self.cursor.fetchall(): dates[row[0]] = row[1] self.select_cache["directory"][row[0]] = row[1] return dates def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): self.remove_cache["directory"][directory.id] = None self.insert_cache["directory"].pop(directory.id, None) def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ): self.insert_cache["directory"][directory.id] = date self.remove_cache["directory"].pop(directory.id, None) def insert_all(self): # Perform insertions with cached information if self.insert_cache["content"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY content; INSERT INTO content(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,content.date)""", self.insert_cache["content"].items(), ) self.insert_cache["content"].clear() if self.insert_cache["directory"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY directory; INSERT INTO directory(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,directory.date)""", self.insert_cache["directory"].items(), ) self.insert_cache["directory"].clear() if self.insert_cache["revision"]: psycopg2.extras.execute_values( self.cursor, """LOCK TABLE ONLY revision; INSERT INTO revision(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,revision.date)""", self.insert_cache["revision"].items(), ) self.insert_cache["revision"].clear() # Relations should come after the ids of their elements have been resolved if self.insert_cache["content_early_in_rev"]: self.insert_location("content", "revision", "content_early_in_rev") if self.insert_cache["content_in_dir"]: self.insert_location("content", "directory", "content_in_dir") if self.insert_cache["directory_in_rev"]: self.insert_location("directory", "revision", "directory_in_rev") # if self.insert_cache["revision_before_rev"]: # psycopg2.extras.execute_values( # self.cursor, # """INSERT INTO revision_before_rev VALUES %s # ON CONFLICT DO NOTHING""", # self.insert_cache["revision_before_rev"], # ) # self.insert_cache["revision_before_rev"].clear() # if self.insert_cache["revision_in_org"]: # psycopg2.extras.execute_values( # self.cursor, # """INSERT INTO revision_in_org VALUES %s # ON CONFLICT DO NOTHING""", # self.insert_cache["revision_in_org"], # ) # self.insert_cache["revision_in_org"].clear() def insert_location(self, src0_table, src1_table, dst_table): # Resolve src0 ids src0_values = dict().fromkeys( map(operator.itemgetter(0), self.insert_cache[dst_table]) ) values = ", ".join(itertools.repeat("%s", len(src0_values))) self.cursor.execute( f"""SELECT sha1, id FROM {src0_table} WHERE sha1 IN ({values})""", tuple(src0_values), ) src0_values = dict(self.cursor.fetchall()) # Resolve src1 ids src1_values = dict().fromkeys( map(operator.itemgetter(1), self.insert_cache[dst_table]) ) values = ", ".join(itertools.repeat("%s", len(src1_values)))
self.cursor.execute( f"""SELECT sha1, id FROM {src1_table} WHERE sha1 IN ({values})""", tuple(src1_values), ) src1_values = dict(self.cursor.fetchall()) # Insert values in dst_table rows = map( lambda row: (src0_values[row[0]], src1_values[row[1]]), self.insert_cache[dst_table], ) psycopg2.extras.execute_values( self.cursor, f"""INSERT INTO {dst_table} VALUES %s ON CONFLICT DO NOTHING""", rows, ) self.insert_cache[dst_table].clear() def origin_get_id(self, origin: OriginEntry) -> int: if origin.id is None: # Insert origin in the DB and return the assigned id self.cursor.execute( """INSERT INTO origin (url) VALUES (%s) ON CONFLICT DO NOTHING RETURNING id""", (origin.url,), ) return self.cursor.fetchone()[0] else: return origin.id def revision_add(self, revision: RevisionEntry): # Add current revision to the compact DB self.insert_cache["revision"][revision.id] = revision.date def revision_add_before_revision( self, relative: RevisionEntry, revision: RevisionEntry ): self.insert_cache["revision_before_rev"].append((revision.id, relative.id)) def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): self.insert_cache["revision_in_org"].append((revision.id, origin.id)) def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: date = self.insert_cache["revision"].get(revision.id, None) if date is None: # If not, check whether it's been queried before date = self.select_cache["revision"].get(revision.id, None) if date is None: # Otherwise, query the database and cache the value self.cursor.execute( """SELECT date FROM revision WHERE sha1=%s""", (revision.id,) ) row = self.cursor.fetchone() date = row[0] if row is not None else None self.select_cache["revision"][revision.id] = date return date - def revision_get_prefered_origin(self, revision: RevisionEntry) -> int: + def revision_get_preferred_origin(self, revision: RevisionEntry) -> int: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT COALESCE(org,0) FROM revision WHERE sha1=%s""", (revision.id,) ) row = self.cursor.fetchone() # None means revision is not in database; - # 0 means revision has no prefered origin + # 0 means revision has no preferred origin return row[0] if row is not None and row[0] != 0 else None def revision_in_history(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_before_rev JOIN revision ON revision.id=revision_before_rev.prev WHERE revision.sha1=%s""", (revision.id,), ) return self.cursor.fetchone() is not None - def revision_set_prefered_origin( + def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ): # TODO: adapt this method to consider cached values self.cursor.execute( """UPDATE revision SET org=%s WHERE sha1=%s""", (origin.id, revision.id) ) def revision_visited(self, revision: RevisionEntry) -> bool: # TODO: adapt this method to consider cached values self.cursor.execute( """SELECT 1 FROM revision_in_org JOIN revision ON revision.id=revision_in_org.rev WHERE revision.sha1=%s""", (revision.id,), ) return self.cursor.fetchone() is not None
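Note: compared with the path-tracking class, the structural change in the caches is that content_early_in_rev, content_in_dir and directory_in_rev hold (entity, container) pairs in a set rather than (entity, container, path) triples in a list, so duplicate pairs collapse before insert_all flushes them. A toy illustration (the ids are made up):

    # The same blob seen at two different paths of the same directory:
    with_path = [(b"blob0", b"dir0", b"a/x"), (b"blob0", b"dir0", b"b/x")]  # 2 rows
    no_path = {(b"blob0", b"dir0"), (b"blob0", b"dir0")}  # collapses to 1 row
    assert len(with_path) == 2 and len(no_path) == 1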
diff --git a/swh/provenance/postgresql_nopath/provenance.sql b/swh/provenance/postgresql_nopath/provenance.sql index 7d7f4ae..15d3644 100644 --- a/swh/provenance/postgresql_nopath/provenance.sql +++ b/swh/provenance/postgresql_nopath/provenance.sql @@ -1,130 +1,130 @@ -- a Git object ID, i.e., a Git-style salted SHA1 checksum drop domain if exists sha1_git cascade; create domain sha1_git as bytea check (length(value) = 20); -- UNIX path (absolute, relative, individual path component, etc.) drop domain if exists unix_path cascade; create domain unix_path as bytea; drop table if exists content; create table content ( id bigserial primary key, -- internal identifier of the content blob sha1 sha1_git unique not null, -- intrinsic identifier of the content blob date timestamptz not null -- timestamp of the revision where the blob first appears ); comment on column content.id is 'Content internal identifier'; comment on column content.sha1 is 'Content intrinsic identifier'; comment on column content.date is 'Earliest timestamp for the content (first seen time)'; drop table if exists content_early_in_rev; create table content_early_in_rev ( blob bigint not null, -- internal identifier of the content blob rev bigint not null, -- internal identifier of the revision where the blob appears for the first time primary key (blob, rev) -- foreign key (blob) references content (id), -- foreign key (rev) references revision (id) ); comment on column content_early_in_rev.blob is 'Content internal identifier'; comment on column content_early_in_rev.rev is 'Revision internal identifier'; drop table if exists content_in_dir; create table content_in_dir ( blob bigint not null, -- internal identifier of the content blob dir bigint not null, -- internal identifier of the directory containing the blob primary key (blob, dir) -- foreign key (blob) references content (id), -- foreign key (dir) references directory (id) ); comment on column content_in_dir.blob is 'Content internal identifier'; comment on column content_in_dir.dir is 'Directory internal identifier'; drop table if exists directory; create table directory ( id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier sha1 sha1_git unique not null, -- intrinsic identifier of the directory date timestamptz not null -- max timestamp among those of the directory's children ); comment on column directory.id is 'Directory internal identifier'; comment on column directory.sha1 is 'Directory intrinsic identifier'; comment on column directory.date is 'Latest timestamp for the content in the directory'; drop table if exists directory_in_rev; create table directory_in_rev ( dir bigint not null, -- internal identifier of the directory appearing in the revision rev bigint not null, -- internal identifier of the revision containing the directory primary key (dir, rev) -- foreign key (dir) references directory (id), -- foreign key (rev) references revision (id) ); comment on column directory_in_rev.dir is 'Directory internal identifier'; comment on column directory_in_rev.rev is 'Revision internal identifier'; drop table if exists origin; create table origin ( id bigserial primary key, -- internal identifier of the origin url unix_path unique not null -- url of the origin ); comment on column origin.id is 'Origin internal identifier'; comment on column origin.url is 'URL of the origin'; drop table if exists revision; create table revision ( id bigserial primary key, -- internal identifier of the revision sha1 sha1_git unique not null, -- intrinsic identifier of the revision date timestamptz not null, -- timestamp of the revision - org bigint -- id of the prefered origin + org bigint -- id of the preferred origin -- foreign key (org) references origin (id) ); comment on column revision.id is 'Revision internal identifier'; comment on column revision.sha1 is 'Revision intrinsic identifier'; comment
on column revision.date is 'Revision timestamp'; -comment on column revision.org is 'Prefered origin for the revision'; +comment on column revision.org is 'Preferred origin for the revision'; drop table if exists revision_before_rev; create table revision_before_rev ( prev bigserial not null, -- internal identifier of the source revision next bigserial not null, -- internal identifier of the destination revision primary key (prev, next) -- foreign key (prev) references revision (id), -- foreign key (next) references revision (id) ); comment on column revision_before_rev.prev is 'Source revision internal identifier'; comment on column revision_before_rev.next is 'Destination revision internal identifier'; drop table if exists revision_in_org; create table revision_in_org ( rev bigint not null, -- internal identifier of the revision pointed to by the origin org bigint not null, -- internal identifier of the origin that points to the revision primary key (rev, org) -- foreign key (rev) references revision (id), -- foreign key (org) references origin (id) ); comment on column revision_in_org.rev is 'Revision internal identifier'; comment on column revision_in_org.org is 'Origin internal identifier';
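Note: in both schema files, revision_before_rev declares prev and next as bigserial even though they only ever store references to revision.id; declaring them bigint (as revision_in_org does for rev and org) would avoid creating two sequences that are never used. A hedged sketch of the alternative definition, assuming the schema is created fresh as in create_database (connection parameters hypothetical):

    import psycopg2

    conn = psycopg2.connect(dbname="provenance")  # hypothetical conninfo
    cursor = conn.cursor()
    cursor.execute(
        """
        drop table if exists revision_before_rev;
        create table revision_before_rev (
            prev bigint not null,  -- internal identifier of the source revision
            next bigint not null,  -- internal identifier of the destination revision
            primary key (prev, next)
        )
        """
    )
    conn.commit()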
diff --git a/swh/provenance/provenance.py b/swh/provenance/provenance.py index 1fd68cd..03db259 100644 --- a/swh/provenance/provenance.py +++ b/swh/provenance/provenance.py @@ -1,348 +1,365 @@ import os from .archive import ArchiveInterface from .model import DirectoryEntry, FileEntry, TreeEntry from .origin import OriginEntry from .revision import RevisionEntry from datetime import datetime from typing import Dict, Generator, List, Optional, Tuple # TODO: consider moving to path utils file together with normalize. def is_child(path: bytes, prefix: bytes) -> bool: return path != prefix and os.path.dirname(path) == prefix class ProvenanceInterface: def __init__(self, **kwargs): raise NotImplementedError def commit(self): raise NotImplementedError def content_add_to_directory( self, directory: DirectoryEntry, blob: FileEntry, prefix: bytes ): raise NotImplementedError def content_add_to_revision( self, revision: RevisionEntry, blob: FileEntry, prefix: bytes ): raise NotImplementedError def content_find_first( self, blobid: bytes ) -> Optional[Tuple[bytes, bytes, datetime, bytes]]: raise NotImplementedError def content_find_all( self, blobid: bytes ) -> Generator[Tuple[bytes, bytes, datetime, bytes], None, None]: raise NotImplementedError def content_get_early_date(self, blob: FileEntry) -> Optional[datetime]: raise NotImplementedError def content_get_early_dates(self, blobs: List[FileEntry]) -> Dict[bytes, datetime]: raise NotImplementedError def content_set_early_date(self, blob: FileEntry, date: datetime): raise NotImplementedError def directory_add_to_revision( self, revision: RevisionEntry, directory: DirectoryEntry, path: bytes ): raise NotImplementedError def directory_get_date_in_isochrone_frontier( self, directory: DirectoryEntry ) -> Optional[datetime]: raise NotImplementedError def directory_get_dates_in_isochrone_frontier( self, dirs: List[DirectoryEntry] ) -> Dict[bytes, datetime]: raise NotImplementedError def directory_invalidate_in_isochrone_frontier(self, directory: DirectoryEntry): raise NotImplementedError def directory_set_date_in_isochrone_frontier( self, directory: DirectoryEntry, date: datetime ): raise NotImplementedError def origin_get_id(self, origin: OriginEntry) -> int: raise NotImplementedError def revision_add(self, revision: RevisionEntry): raise NotImplementedError def revision_add_before_revision( self, relative: RevisionEntry, revision: RevisionEntry ): raise NotImplementedError def revision_add_to_origin(self, origin: OriginEntry, revision: RevisionEntry): raise NotImplementedError def revision_get_early_date(self, revision: RevisionEntry) -> Optional[datetime]: raise NotImplementedError - def revision_get_prefered_origin(self, revision: RevisionEntry) -> int: + def revision_get_preferred_origin(self, revision: RevisionEntry) -> int: raise NotImplementedError def revision_in_history(self, revision: RevisionEntry) -> bool: raise NotImplementedError - def revision_set_prefered_origin( + def revision_set_preferred_origin( self, origin: OriginEntry, revision: RevisionEntry ): raise NotImplementedError def revision_visited(self, revision: RevisionEntry) -> bool: raise NotImplementedError def directory_process_content( provenance: ProvenanceInterface, directory: DirectoryEntry, relative: DirectoryEntry ): stack = [(directory, b"")] while stack: current, prefix = stack.pop() for child in iter(current): if isinstance(child, FileEntry): # Add content to the relative directory with the computed prefix. provenance.content_add_to_directory(relative, child, prefix) else: # Recursively walk the child directory. stack.append((child, os.path.join(prefix, child.name))) def origin_add(provenance: ProvenanceInterface, origin: OriginEntry): # TODO: refactor to iterate over origin visit statuses and commit only once # per status. origin.id = provenance.origin_get_id(origin) for revision in origin.revisions: origin_add_revision(provenance, origin, revision) # Commit after each revision provenance.commit() # TODO: verify this! def origin_add_revision( provenance: ProvenanceInterface, origin: OriginEntry, revision: RevisionEntry ): stack: List[Tuple[Optional[RevisionEntry], RevisionEntry]] = [(None, revision)] while stack: relative, current = stack.pop() - # Check if current revision has no prefered origin and update if necessary. - prefered = provenance.revision_get_prefered_origin(current) + # Check if current revision has no preferred origin and update if necessary. + preferred = provenance.revision_get_preferred_origin(current) - if prefered is None: - provenance.revision_set_prefered_origin(origin, current) + if preferred is None: + provenance.revision_set_preferred_origin(origin, current) ######################################################################## if relative is None: # This revision is pointed to directly by the origin. visited = provenance.revision_visited(current) provenance.revision_add_to_origin(origin, current) if not visited: stack.append((current, current)) else: # This revision is a parent of another one in the history of the # relative revision. for parent in iter(current): visited = provenance.revision_visited(parent) if not visited: # The parent revision has never before been seen pointing # directly to an origin. known = provenance.revision_in_history(parent) if known: # The parent revision is already known in some other # revision's history. We should point it directly to # the origin and (eventually) walk its history. stack.append((None, parent)) else: # The parent revision was never seen before. We should # walk its history and associate it with the same # relative revision. provenance.revision_add_before_revision(relative, parent) stack.append((relative, parent)) else: # The parent revision already points to an origin, so its # history was properly processed before. We just need to # make sure it points to the current origin as well.
provenance.revision_add_to_origin(origin, parent) def revision_add( provenance: ProvenanceInterface, archive: ArchiveInterface, revision: RevisionEntry ): assert revision.date is not None assert revision.root is not None # Process content starting from the revision's root directory. date = provenance.revision_get_early_date(revision) if date is None or revision.date < date: provenance.revision_add(revision) # TODO: add file size filtering revision_process_content( provenance, revision, DirectoryEntry(archive, revision.root, b"") ) # TODO: improve this! Maybe using a max attempt counter? - # Idealy Provenance class should guarante that a commit never fails. + # Ideally Provenance class should guarantee that a commit never fails. while not provenance.commit(): continue class IsochroneNode: def __init__(self, entry: TreeEntry, dates: Dict[bytes, datetime] = {}): self.entry = entry self.date = dates.get(self.entry.id, None) self.children: List[IsochroneNode] = [] self.maxdate: Optional[datetime] = None def add_child( self, child: TreeEntry, dates: Dict[bytes, datetime] = {} ) -> "IsochroneNode": assert isinstance(self.entry, DirectoryEntry) and self.date is None node = IsochroneNode(child, dates=dates) self.children.append(node) return node def build_isochrone_graph( provenance: ProvenanceInterface, revision: RevisionEntry, directory: DirectoryEntry ): assert revision.date is not None # Build the nodes structure root = IsochroneNode(directory) root.date = provenance.directory_get_date_in_isochrone_frontier(directory) stack = [root] while stack: current = stack.pop() assert isinstance(current.entry, DirectoryEntry) if current.date is None or current.date >= revision.date: # If the current directory has an associated date in the isochrone frontier that # is greater than or equal to the current revision's date, it should be ignored as # the revision is being processed out of order. if current.date is not None and current.date >= revision.date: provenance.directory_invalidate_in_isochrone_frontier(current.entry) current.date = None # Pre-query all known dates for content/directories in the current directory # for the provenance object to have them cached and (potentially) improve # performance. ddates = provenance.directory_get_dates_in_isochrone_frontier( [child for child in current.entry if isinstance(child, DirectoryEntry)] ) fdates = provenance.content_get_early_dates( [child for child in current.entry if isinstance(child, FileEntry)] ) for child in current.entry: # Recursively analyse directory nodes. if isinstance(child, DirectoryEntry): node = current.add_child(child, dates=ddates) stack.append(node) else: current.add_child(child, dates=fdates) # Precalculate max known date for each node in the graph. stack = [root] while stack: current = stack.pop() if current.date is None: if any(map(lambda child: child.maxdate is None, current.children)): # Current node needs to be analysed again after its children. stack.append(current) for child in current.children: if isinstance(child.entry, FileEntry): if child.date is not None: # File node that has been seen before, just use its known # date. child.maxdate = child.date else: # File node that has never been seen before, use current # revision date. child.maxdate = revision.date else: # Recursively analyse directory nodes.
stack.append(child) else: maxdates = [] for child in current.children: assert child.maxdate is not None maxdates.append(child.maxdate) current.maxdate = max(maxdates) if maxdates else revision.date else: # Directory node in the frontier, just use its known date. current.maxdate = current.date return root def revision_process_content( provenance: ProvenanceInterface, revision: RevisionEntry, root: DirectoryEntry ): assert revision.date is not None stack = [(build_isochrone_graph(provenance, revision, root), root.name)] while stack: current, path = stack.pop() if current.date is not None: assert current.date < revision.date # Current directory is an outer isochrone frontier for a previously # processed revision. It should be reused as is. provenance.directory_add_to_revision(revision, current.entry, path) else: # Current directory is not an outer isochrone frontier for any previous # revision. It might be eligible for this one. if is_new_frontier(current, revision): assert current.maxdate is not None # Outer frontier should be moved to current position in the isochrone # graph. This is the first time this directory is found in the isochrone # frontier. provenance.directory_set_date_in_isochrone_frontier( current.entry, current.maxdate ) provenance.directory_add_to_revision(revision, current.entry, path) directory_process_content( provenance, directory=current.entry, relative=current.entry, ) else: # No point moving the frontier here. Either there are no files or they # are being seen for the first time here. Add all blobs to current # revision updating date if necessary, and recursively analyse - # subdirectories as canditates to the outer frontier. + # subdirectories as candidates to the outer frontier. for child in current.children: if isinstance(child.entry, FileEntry): blob = child.entry if child.date is None or revision.date < child.date: provenance.content_set_early_date(blob, revision.date) provenance.content_add_to_revision(revision, blob, path) else: stack.append((child, os.path.join(path, child.entry.name))) def is_new_frontier(node: IsochroneNode, revision: RevisionEntry) -> bool: assert node.maxdate is not None and revision.date is not None # Using the following condition we should get an algorithm equivalent to the old # version where frontiers are pushed up in the tree whenever possible. return node.maxdate < revision.date # return node.maxdate < revision.date and has_blobs(node) def has_blobs(node: IsochroneNode) -> bool: - stack = [node] - while stack: - current = stack.pop() - if any(map(lambda child: isinstance(child.entry, FileEntry), current.children)): - return True - else: - # All children are directory entries. - stack.extend(current.children) - return False + # We may want to look for files in different ways to decide whether to define a + # frontier or not: + # 1. Only files in current node: + # return any(map(lambda child: isinstance(child.entry, FileEntry), node.children)) + # 2. Files anywhere in the isochrone graph + # stack = [node] + # while stack: + # current = stack.pop() + # if any( + # map(lambda child: isinstance(child.entry, FileEntry), current.children)): + # return True + # else: + # # All children are directory entries. + # stack.extend(current.children) + # return False + # 3.
Files in the intermediate directories between current node and any previously + defined frontier: + return ( + any(map(lambda child: isinstance(child.entry, FileEntry), node.children)) or + all( + map( + lambda child: (not (isinstance(child.entry, DirectoryEntry) and child.date is None)) or has_blobs(child), + node.children + ) + ) + )
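Note: strategy 3 above recurses only through subdirectories that are not already known frontiers (child.date is None). A runnable toy model of conditions 1 and 3, with made-up stand-in classes instead of IsochroneNode and the swh entries:

    from dataclasses import dataclass, field
    from datetime import datetime
    from typing import List, Optional

    @dataclass
    class ToyFile:  # stands in for a FileEntry child
        pass

    @dataclass
    class ToyDir:  # stands in for a DirectoryEntry child with a frontier date
        date: Optional[datetime] = None
        children: List[object] = field(default_factory=list)

    def strategy_1(node: ToyDir) -> bool:
        # 1. Only files in the current node.
        return any(isinstance(child, ToyFile) for child in node.children)

    def strategy_3(node: ToyDir) -> bool:
        # 3. Files here, or every undated subdirectory recursively leads to blobs.
        return strategy_1(node) or all(
            (not (isinstance(child, ToyDir) and child.date is None)) or strategy_3(child)
            for child in node.children
        )

    assert strategy_1(ToyDir(children=[ToyFile()]))
    assert not strategy_1(ToyDir(children=[ToyDir(children=[ToyFile()])]))
    assert strategy_3(ToyDir(children=[ToyDir(children=[ToyFile()])]))
    # Caveat: the all(...) clause is vacuously true for childless subdirectories,
    # so as written strategy 3 also accepts blob-free subtrees; a stricter base
    # case may be needed before it behaves differently from a constant True.
    assert strategy_3(ToyDir(children=[ToyDir()]))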
diff --git a/swh/provenance/revision.py b/swh/provenance/revision.py index 8fdfb5b..6df3f0b 100644 --- a/swh/provenance/revision.py +++ b/swh/provenance/revision.py @@ -1,184 +1,184 @@ import threading from .archive import ArchiveInterface from datetime import datetime from typing import Optional from swh.model.hashutil import hash_to_bytes class RevisionEntry: def __init__( self, archive: ArchiveInterface, id: bytes, date: Optional[datetime] = None, root: Optional[bytes] = None, parents: Optional[list] = None, ): self.archive = archive self.id = id self.date = date self.parents = parents self.root = root def __iter__(self): if self.parents is None: self.parents = [] for parent in self.archive.revision_get([self.id]): if parent is not None: self.parents.append( RevisionEntry( self.archive, parent.id, parents=[ RevisionEntry(self.archive, id) for id in parent.parents ], ) ) return iter(self.parents) ######################################################################################## ######################################################################################## class RevisionIterator: """Iterator interface.""" def __iter__(self): pass def __next__(self): pass class FileRevisionIterator(RevisionIterator): """Iterator over revisions present in the given CSV file.""" def __init__( self, filename: str, archive: ArchiveInterface, limit: Optional[int] = None ): self.file = open(filename) self.idx = 0 self.limit = limit self.mutex = threading.Lock() self.archive = archive def next(self): self.mutex.acquire() line = self.file.readline().strip() if line and (self.limit is None or self.idx < self.limit): self.idx = self.idx + 1 id, date, root = line.strip().split(",") self.mutex.release() return RevisionEntry( self.archive, hash_to_bytes(id), date=datetime.fromisoformat(date), root=hash_to_bytes(root), ) else: self.mutex.release() return None # class ArchiveRevisionIterator(RevisionIterator): # """Iterator over revisions present in the given database.""" # # def __init__(self, conn, limit=None, chunksize=100): # self.cur = conn.cursor() # self.chunksize = chunksize # self.records = [] # if limit is None: # self.cur.execute('''SELECT id, date, committer_date, directory # FROM revision''') # else: # self.cur.execute('''SELECT id, date, committer_date, directory # FROM revision # LIMIT %s''', (limit,)) # for row in self.cur.fetchmany(self.chunksize): # record = self.make_record(row) # if record is not None: # self.records.append(record) # self.mutex = threading.Lock() # # def __del__(self): # self.cur.close() # # def next(self): # self.mutex.acquire() # if not self.records: # self.records.clear() # for row in self.cur.fetchmany(self.chunksize): # record = self.make_record(row) # if record is not None: # self.records.append(record) # # if self.records: # revision, *self.records = self.records # self.mutex.release() # return revision # else: # self.mutex.release() # return None # # def make_record(self, row): -# # Only revision with author or commiter date are considered +# # Only revisions with an author or committer date are considered # if row[1] is not None: # # If the revision has an author date, it takes precedence # return RevisionEntry(row[0], row[1], row[3]) # elif row[2] is not None: -# # If not, we use the commiter date +# # If not, we use the committer date # return RevisionEntry(row[0], row[2], row[3]) ######################################################################################## ######################################################################################## # class RevisionWorker(threading.Thread): # def __init__( # self, # id: int, # conninfo: dict, # archive: ArchiveInterface, # revisions: RevisionIterator # ): # from .provenance import get_provenance # # super().__init__() # self.archive = archive # self.id = id # self.provenance = get_provenance(conninfo) # self.revisions = revisions # # # def run(self): # from .provenance import revision_add # # # while True: # revision = self.revisions.next() # if revision is None: break # # processed = False # while not processed: # logging.info( # f'Thread {( # self.id # )} - Processing revision {( # hash_to_hex(revision.id) # )} (timestamp: {revision.date})' # ) # processed = revision_add(self.provenance, self.archive, revision) # if not processed: # logging.warning( # f'Thread {( # self.id # )} - Failed to process revision {( # hash_to_hex(revision.id) # )} (timestamp: {revision.date})' # )
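Note: FileRevisionIterator above expects one revision per CSV line, formatted as id,date,root with hex-encoded hashes and an ISO-8601 date. A minimal sketch of the parsing it performs (the hash values below are made up for illustration):

    from datetime import datetime
    from swh.model.hashutil import hash_to_bytes

    # One CSV line, shaped exactly as FileRevisionIterator.next() reads it:
    line = (
        "1111111111111111111111111111111111111111,"
        "2014-05-08 00:34:18+02:00,"
        "2222222222222222222222222222222222222222"
    )
    id, date, root = line.strip().split(",")
    revision_id = hash_to_bytes(id)               # 20-byte sha1_git of the revision
    revision_date = datetime.fromisoformat(date)  # timezone-aware datetime
    root_id = hash_to_bytes(root)                 # 20-byte sha1_git of the root directory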