diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py index 08c078b..82128a4 100644 --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -1,163 +1,162 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import datetime, timedelta, timezone +from datetime import datetime from os import path from typing import Any, Dict, Generator, List from _pytest.fixtures import SubRequest import msgpack import psycopg2.extensions import pytest from pytest_postgresql.factories import postgresql from swh.journal.serializers import msgpack_ext_hook -from swh.model.model import BaseModel +from swh.model.model import BaseModel, TimestampWithTimezone from swh.provenance import get_provenance, get_provenance_storage from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface from swh.provenance.storage.archive import ArchiveStorage from swh.storage.interface import StorageInterface from swh.storage.replay import OBJECT_CONVERTERS, OBJECT_FIXERS, process_replay_objects @pytest.fixture( params=[ "with-path", "without-path", "with-path-denormalized", "without-path-denormalized", ] ) def provenance_postgresqldb( request: SubRequest, postgresql: psycopg2.extensions.connection, ) -> Dict[str, str]: """return a working and initialized provenance db""" from swh.core.db.db_utils import ( init_admin_extensions, populate_database_for_package, ) init_admin_extensions("swh.provenance", postgresql.dsn) populate_database_for_package( "swh.provenance", postgresql.dsn, flavor=request.param ) return postgresql.get_dsn_parameters() @pytest.fixture(params=["postgresql", "rabbitmq"]) def provenance_storage( request: SubRequest, provenance_postgresqldb: Dict[str, str], ) -> Generator[ProvenanceStorageInterface, None, None]: """Return a working and initialized ProvenanceStorageInterface object""" if request.param == "rabbitmq": from swh.provenance.api.server import ProvenanceStorageRabbitMQServer rabbitmq = request.getfixturevalue("rabbitmq") host = rabbitmq.args["host"] port = rabbitmq.args["port"] rabbitmq_params: Dict[str, Any] = { "url": f"amqp://guest:guest@{host}:{port}/%2f", "storage_config": { "cls": "postgresql", "db": provenance_postgresqldb, "raise_on_commit": True, }, } server = ProvenanceStorageRabbitMQServer( url=rabbitmq_params["url"], storage_config=rabbitmq_params["storage_config"] ) server.start() with get_provenance_storage(cls=request.param, **rabbitmq_params) as storage: yield storage server.stop() else: # in test sessions, we DO want to raise any exception occurring at commit time with get_provenance_storage( cls=request.param, db=provenance_postgresqldb, raise_on_commit=True ) as storage: yield storage provenance_postgresql = postgresql("postgresql_proc", dbname="provenance_tests") @pytest.fixture def provenance( provenance_postgresql: psycopg2.extensions.connection, ) -> Generator[ProvenanceInterface, None, None]: """Return a working and initialized ProvenanceInterface object""" from swh.core.db.db_utils import ( init_admin_extensions, populate_database_for_package, ) init_admin_extensions("swh.provenance", provenance_postgresql.dsn) populate_database_for_package( "swh.provenance", provenance_postgresql.dsn, 
flavor="with-path" ) # in test sessions, we DO want to raise any exception occurring at commit time with get_provenance( cls="postgresql", db=provenance_postgresql.get_dsn_parameters(), raise_on_commit=True, ) as provenance: yield provenance @pytest.fixture def archive(swh_storage: StorageInterface) -> ArchiveInterface: """Return an ArchiveStorage-based ArchiveInterface object""" return ArchiveStorage(swh_storage) def fill_storage(storage: StorageInterface, data: Dict[str, List[dict]]) -> None: objects = { objtype: [objs_from_dict(objtype, d) for d in dicts] for objtype, dicts in data.items() } process_replay_objects(objects, storage=storage) def get_datafile(fname: str) -> str: return path.join(path.dirname(__file__), "data", fname) # TODO: this should return Dict[str, List[BaseModel]] directly, but it requires # refactoring several tests def load_repo_data(repo: str) -> Dict[str, List[dict]]: data: Dict[str, List[dict]] = {} with open(get_datafile(f"{repo}.msgpack"), "rb") as fobj: unpacker = msgpack.Unpacker( fobj, raw=False, ext_hook=msgpack_ext_hook, strict_map_key=False, timestamp=3, # convert Timestamp in datetime objects (tz UTC) ) - for objtype, objd in unpacker: + for msg in unpacker: + if len(msg) == 2: # old format + objtype, objd = msg + else: # now we should have a triplet (type, key, value) + objtype, _, objd = msg data.setdefault(objtype, []).append(objd) return data def objs_from_dict(object_type: str, dict_repr: dict) -> BaseModel: if object_type in OBJECT_FIXERS: dict_repr = OBJECT_FIXERS[object_type](dict_repr) obj = OBJECT_CONVERTERS[object_type](dict_repr) return obj -# TODO: remove this function in favour of TimestampWithTimezone.to_datetime -# from swh.model.model def ts2dt(ts: Dict[str, Any]) -> datetime: - timestamp = datetime.fromtimestamp( - ts["timestamp"]["seconds"], timezone(timedelta(minutes=ts["offset"])) - ) - return timestamp.replace(microsecond=ts["timestamp"]["microseconds"]) + return TimestampWithTimezone.from_dict(ts).to_datetime() diff --git a/swh/provenance/tests/data/README.md b/swh/provenance/tests/data/README.md index 81dff87..5fc5a33 100644 --- a/swh/provenance/tests/data/README.md +++ b/swh/provenance/tests/data/README.md @@ -1,166 +1,184 @@ # Provenance Index Test Dataset This directory contains datasets used by `test_provenance_heurstics` tests of the provenance index database. +## Datasets + +There are currently 3 dataset: + +- cmdbts2: original dataset +- out-of-order: with unsorted revisions +- with-merge: with merge revisions + Each dataset `xxx` consist in several parts: - a description of a git repository as a yaml file named `xxx_repo.yaml`, - a msgpack file containing storage objects for the given repository, from which the storage is filled before each test using these data, and - a set of synthetic files, named `synthetic_xxx_(lower|upper)_.txt`, describing the expected result in the provenance database if ingested with the flag `lower` set or not set, and the `mindepth` value (integer, most often `1` or `2`). 
+### Generating the dataset files +To (re)generate the files for each dataset, run: + +``` +for dataset in cmdbts2 out-of-order with-merges; do + python generate_repo.py -C ${dataset}_repo.yaml $dataset > synthetic_${dataset}_template.txt + # you may want to edit/update synthetic files from this template, see below + python generate_storage_from_git.py $dataset +done +``` ## Git repos description file The description of a git repository is a yaml file containing a list of dicts, each one representing a git revision to add (linearly) to the git repo used as a base for the dataset. Each dict consists of a structure like: ``` yaml - msg: R00 date: 1000000000 content: A/B/C/a: "content a" ``` This example will generate a git commit with the commit message "R00", the author and committer date 1000000000 (given as a unix timestamp), and one file whose path is `A/B/C/a` and whose content is "content a". The file is parsed to create git revisions in a temporary git repository, in order of appearance in the yaml file (so one may create a git repository with 'out-of-order' commits). Merge revisions can be described with the optional `parents` attribute, which lists the parent revisions by commit message; a revision can also be tagged by giving it a `tag` attribute. The tool to generate this git repo is `generate_repo.py`: ``` python generate_repo.py --help Usage: generate_repo.py [OPTIONS] INPUT_FILE OUTPUT_DIR Options: -C, --clean-output / --no-clean-output --help Show this message and exit. ``` It generates a git repository in `OUTPUT_DIR` and prints a template `synthetic` file on its standard output, which can be used as a starting point for writing the expected `synthetic` files. Typical usage would be: ``` python generate_repo.py repo2_repo.yaml repo2 > synthetic_repo2_template.txt ``` Note that the hashes of the git objects (revisions, directories and contents) depend only on the content of the input yaml file: calling the tool twice on the same input file should generate the exact same git repo twice. Also note that the tool will add a branch at each revision (using the commit message as branch name), to make it easier to reference any point in the git history. ## Msgpack dump of the storage This file contains a set of storage objects (`Revision`, `Content` and `Directory`) and is usually generated from a local git repository (typically the one generated by the previous command) using the `generate_storage_from_git.py` tool: ``` python generate_storage_from_git.py --help Usage: generate_storage_from_git.py [OPTIONS] GIT_REPO simple tool to generate the git_repo.msgpack dataset file used in some tests Options: -o, --output TEXT output file -v, --visits FILENAME additional visits to generate. --help Show this message and exit. ``` Typical usage would be, using the git repository `repo2` created previously: ``` python generate_storage_from_git.py repo2 Serialized the storage made from repo2 in repo2.msgpack ``` ### Adding extra visits/snapshots It is also possible to generate a storage from a git repo with extra origin visits, using the `--visits` option of the `generate_storage_from_git` tool. This option expects a yaml file as argument, describing the extra visits (and snapshots) you want to add to the storage. The format is simple, for example: ``` # a visit pattern scenario for the 'repo_with_merges' repo - origin: http://repo_with_merges/1/ date: 1000000015 branches: - R01 ``` will create an OriginVisit (at the given date) for the given origin URL (the Origin will be created as well), with a `Snapshot` including the listed branches.
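Note that the msgpack file is really a dump of the storage journal: `generate_storage_from_git.py` attaches a `stream` journal writer to an in-memory storage, so every object ends up serialized as an `(object_type, key, object)` triplet (older dataset files contained `(object_type, object)` pairs, which is why `load_repo_data` in `conftest.py` accepts both forms). A minimal reading sketch, assuming a `cmdbts2.msgpack` file in the current directory:

```python
from collections import Counter

import msgpack

from swh.journal.serializers import msgpack_ext_hook

counts: Counter = Counter()
with open("cmdbts2.msgpack", "rb") as fobj:
    unpacker = msgpack.Unpacker(
        fobj,
        raw=False,
        ext_hook=msgpack_ext_hook,
        strict_map_key=False,
        timestamp=3,  # decode msgpack Timestamps as (UTC) datetime objects
    )
    for msg in unpacker:
        # the first element is the object type, the last the object itself
        objtype, objd = msg[0], msg[-1]
        counts[objtype] += 1
print(dict(counts))  # e.g. {'revision': 14, 'directory': ..., 'content': ...}
```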
## Synthetic files These files describe the expected content of the provenance database for each revision (in order of ingestion). The `generate_repo.py` tool will produce a template of synthetic file like: ``` 1000000000.0 b582a17b3fc37f72fc57877616f85c3f0abed064 R00 R00 | | | R b582a17b3fc37f72fc57877616f85c3f0abed064 | 1000000000.0 | | . | D a4cb5e6b2831f7e8eef0e6e08e43d642c97303a1 | 0.0 | | A | D 1c8d9fd9afa7e5a2cf52a3db6f05dc5c3a1ca86b | 0.0 | | A/B | D 36876d475197b5ad86ad592e8e28818171455f16 | 0.0 | | A/B/C | D 98f7a4a23d8df1fb1a5055facae2aff9b2d0a8b3 | 0.0 | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 1000000010.0 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe R01 R01 | | | R 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe | 1000000010.0 | | . | D b3cf11b22c9f93c3c494cf90ab072f394155072d | 0.0 | | A | D baca735bf8b8720131b4bfdb47c51631a9260348 | 0.0 | | A/B | D 4b28979d88ed209a09c272bcc80f69d9b18339c2 | 0.0 | | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | 0.0 | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 | | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0.0 [...] ``` where all the content and directories of each revision are listed; it's then the responsibility of the user to create the expected synthetic file for a given heuristics configuration. For example, the 2 revisions above are to be adapted, for the `(lower=True, mindepth=1)` case, as: ``` 1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 ``` diff --git a/swh/provenance/tests/data/cmdbts2.msgpack b/swh/provenance/tests/data/cmdbts2.msgpack index 5dfd186..f98be7d 100644 Binary files a/swh/provenance/tests/data/cmdbts2.msgpack and b/swh/provenance/tests/data/cmdbts2.msgpack differ diff --git a/swh/provenance/tests/data/cmdbts2_repo.yaml b/swh/provenance/tests/data/cmdbts2_repo.yaml index d58e8d2..1832c75 100644 --- a/swh/provenance/tests/data/cmdbts2_repo.yaml +++ b/swh/provenance/tests/data/cmdbts2_repo.yaml @@ -1,80 +1,82 @@ - msg: R00 date: 1000000000 content: A/B/C/a: "content a" - msg: R01 date: 1000000010 content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R02 date: 1000000020 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" - msg: R03 date: 1000000030 content: A/C/a: "content a" A/C/b: "content b" A/a: "content a" - msg: R04 date: 1000000040 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" - msg: R05 date: 1000000050 content: D/d: "content d" - msg: R06 date: 1000000060 content: D/E/D/d: "content d" - msg: R07 date: 1000000070 content: F/E/D/d: "content d" F/d: "content d" - msg: R08 date: 1000000080 content: F/E/D/d: "content d" F/E/D/e: "content d" # /!\ same content as d F/a: "content a" + tag: "0.1" - msg: R09 date: 1000000090 content: F/E/D/f: "content f" F/E/D/g: "content f" # /!\ same content as f F/g: "content f" - msg: R10 date: 1000000100 content: F/E/D/f: "content f" F/E/D/g: "content f" # /!\ F/h: "content h" - msg: R11 date: 1000000110 content: G/E/D/f: "content f" G/E/D/g: "content f" # /!\ G/i: "content i" - msg: R12 date: 1000000120 content: G/H/D/f: "content f" G/H/D/g: "content f" # /!\ G/j: "content j" - msg: R13 date: 
1000000130 content: G/H/D/f: "content f" G/H/D/g: "content f" G/I/D/f: "content f" G/I/D/g: "content f" G/D/f: "content f" G/D/g: "content f" G/k: "content k" + tag: "1.0" diff --git a/swh/provenance/tests/data/cmdbts2_visits-01.yaml b/swh/provenance/tests/data/cmdbts2_visits-01.yaml new file mode 100644 index 0000000..4d4d3ea --- /dev/null +++ b/swh/provenance/tests/data/cmdbts2_visits-01.yaml @@ -0,0 +1,39 @@ +# a visit pattern scenario for the 'cmdbts2' repo + +- origin: https://cmdbts2 + date: 1000000150 + branches: + - R13 + +- origin: http://cmdbts2/1 + date: 1000000015 + branches: + - R01 + +- origin: http://cmdbts2/1 + date: 1000000025 + branches: + - R03 + - R06 + +- origin: http://cmdbts2/2 + date: 1000000035 + branches: + - R05 + - R06 + +- origin: http://cmdbts2/1 + date: 1000000045 + branches: + - R06 + - R07 + +- origin: http://cmdbts2/1 + date: 1000000055 + branches: + - R08 + +- origin: http://cmdbts2/2 + date: 1000000065 + branches: + - R08 diff --git a/swh/provenance/tests/data/generate_repo.py b/swh/provenance/tests/data/generate_repo.py index 0a2e024..5b1ec2f 100644 --- a/swh/provenance/tests/data/generate_repo.py +++ b/swh/provenance/tests/data/generate_repo.py @@ -1,116 +1,122 @@ import os import pathlib import shutil from subprocess import PIPE, check_call, check_output from typing import Any, Dict, List import click import yaml def clean_wd() -> None: _, dirnames, filenames = next(os.walk(".")) for d in dirnames: if not d.startswith(".git"): shutil.rmtree(d) for f in filenames: if not f.startswith(".git"): os.unlink(f) def print_ids() -> None: revid = check_output(["git", "rev-parse", "HEAD"]).decode().strip() ts, msg = ( check_output(["git", "log", "-1", '--format="%at %s"']) .decode() .strip()[1:-1] .split() ) print(f"{ts}.0 {revid} {msg}") print(f"{msg:<5} | {'':>5} | {'':>20} | R {revid} | {ts}.0") for currentpath, dirnames, filenames in os.walk("."): if currentpath == ".": output = check_output(["git", "cat-file", "-p", "HEAD"]).decode() dirhash = output.splitlines()[0].split()[1] else: currentpath = currentpath[2:] output = check_output(["git", "ls-tree", "HEAD", currentpath]).decode() dirhash = output.split()[2] print(f"{'':>5} | {'':>5} | {currentpath:<20} | D {dirhash} | 0.0") for fname in filenames: fname = os.path.join(currentpath, fname) output = check_output(["git", "ls-tree", "HEAD", fname]).decode() fhash = output.split()[2] print(f"{'':>5} | {'':>5} | {fname:<20} | C {fhash} | 0.0") if ".git" in dirnames: dirnames.remove(".git") def generate_repo(repo_desc: List[Dict[str, Any]], output_dir: str) -> None: check_call(["git", "init", output_dir], stdout=PIPE, stderr=PIPE) os.chdir(output_dir) os.environ.update( { "GIT_AUTHOR_NAME": "SWH", "GIT_AUTHOR_EMAIL": "contact@softwareheritage.org", "GIT_COMMITTER_NAME": "SWH", "GIT_COMMITTER_EMAIL": "contact@softwareheritage.org", } ) for rev_d in repo_desc: parents = rev_d.get("parents") if parents: # move at the proper (first) parent position, if any check_call(["git", "checkout", parents[0]], stdout=PIPE) # give a branch name (the msg) to each commit to make it easier to # navigate in history check_call(["git", "checkout", "-b", rev_d["msg"]], stdout=PIPE) if parents and len(parents) > 1: # it's a merge check_call(["git", "merge", "--no-commit", *parents[1:]], stdout=PIPE) clean_wd() for path, content in rev_d["content"].items(): p = pathlib.Path(path) p.parent.mkdir(parents=True, exist_ok=True) p.write_text(content) os.environ.update( { "GIT_AUTHOR_DATE": str(rev_d["date"]), "GIT_COMMITTER_DATE": 
str(rev_d["date"]), } ) check_call(["git", "add", "."], stdout=PIPE) check_call( [ "git", "commit", "--all", "--allow-empty", "-m", rev_d["msg"], ], stdout=PIPE, ) + if rev_d.get("tag"): + check_call( + ["git", "tag", "-a", str(rev_d["tag"]), "-m", "tag message"], + stdout=PIPE, + ) + print_ids() @click.command(name="generate-repo") @click.argument("input-file") @click.argument("output-dir") @click.option("-C", "--clean-output/--no-clean-output", default=False) def main(input_file: str, output_dir: str, clean_output: bool) -> None: repo_desc = yaml.load(open(input_file), Loader=yaml.Loader) if clean_output and os.path.exists(output_dir): shutil.rmtree(output_dir) generate_repo(repo_desc, output_dir) if __name__ == "__main__": main() diff --git a/swh/provenance/tests/data/generate_storage_from_git.py b/swh/provenance/tests/data/generate_storage_from_git.py index e31c54a..14cb178 100644 --- a/swh/provenance/tests/data/generate_storage_from_git.py +++ b/swh/provenance/tests/data/generate_storage_from_git.py @@ -1,119 +1,121 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import os from subprocess import check_output from typing import Dict, Optional import click import yaml from swh.loader.git.from_disk import GitLoaderFromDisk from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, Snapshot, SnapshotBranch, TargetType, ) from swh.storage import get_storage from swh.storage.interface import StorageInterface def load_git_repo( url: str, directory: str, storage: StorageInterface ) -> Dict[str, str]: visit_date = datetime.now(tz=timezone.utc) loader = GitLoaderFromDisk( url=url, directory=directory, visit_date=visit_date, storage=storage, ) return loader.load() @click.command() @click.option("-o", "--output", default=None, help="output file") @click.option( "-v", "--visits", type=click.File(mode="rb"), default=None, help="additional visits to generate.", ) @click.argument("git-repo", type=click.Path(exists=True, file_okay=False)) def main(output: Optional[str], visits: bytes, git_repo: str) -> None: "simple tool to generate the git_repo.msgpack dataset file used in some tests" if output is None: output = f"{git_repo}.msgpack" with open(output, "wb") as outstream: sto = get_storage( cls="memory", journal_writer={"cls": "stream", "output_stream": outstream} ) if git_repo.endswith("/"): git_repo = git_repo[:-1] reponame = os.path.basename(git_repo) load_git_repo(f"https://{reponame}", git_repo, sto) if visits: # retrieve all branches from the actual git repo all_branches = { ref: sha1 for sha1, ref in ( line.strip().split() for line in check_output(["git", "-C", git_repo, "show-ref"]) .decode() .splitlines() ) } for visit in yaml.full_load(visits): # add the origin (if it already exists, this is a noop) sto.origin_add([Origin(url=visit["origin"])]) # add a new visit for this origin visit_id = list( sto.origin_visit_add( [ OriginVisit( origin=visit["origin"], date=datetime.fromtimestamp( visit["date"], tz=timezone.utc ), type="git", ) ] ) )[0].visit assert visit_id is not None # add a snapshot with branches from the input file branches = { f"refs/heads/{name}".encode(): SnapshotBranch( target=hash_to_bytes(all_branches[f"refs/heads/{name}"]), 
target_type=TargetType.REVISION, ) for name in visit["branches"] } snap = Snapshot(branches=branches) sto.snapshot_add([snap]) # add a "closing" origin visit status update referencing the snapshot status = OriginVisitStatus( origin=visit["origin"], visit=visit_id, - date=datetime.fromtimestamp(visit["date"], tz=timezone.utc), + # +1 required otherwise this second visit status is ignored + # (storage sql query with ON CONFLICT DO NOTHING) + date=datetime.fromtimestamp(visit["date"] + 1, tz=timezone.utc), status="full", snapshot=snap.id, ) sto.origin_visit_status_add([status]) click.echo(f"Serialized the storage made from {reponame} in {output}") if __name__ == "__main__": main() diff --git a/swh/provenance/tests/data/origins.csv b/swh/provenance/tests/data/origins.csv index e7e44bc..e246ec2 100644 --- a/swh/provenance/tests/data/origins.csv +++ b/swh/provenance/tests/data/origins.csv @@ -1 +1 @@ -https://cmdbts2,5f577c4d4e5a1d0bca64f78facfb891933b17d94 +https://cmdbts2,b2bd799f3c32f1a53b0370f0ea7afc78757b4f0d diff --git a/swh/provenance/tests/data/out-of-order.msgpack b/swh/provenance/tests/data/out-of-order.msgpack index 660fe16..624bf27 100644 Binary files a/swh/provenance/tests/data/out-of-order.msgpack and b/swh/provenance/tests/data/out-of-order.msgpack differ diff --git a/swh/provenance/tests/data/out-of-order_repo.yaml b/swh/provenance/tests/data/out-of-order_repo.yaml index 9d13f7a..9926d0c 100644 --- a/swh/provenance/tests/data/out-of-order_repo.yaml +++ b/swh/provenance/tests/data/out-of-order_repo.yaml @@ -1,35 +1,36 @@ - msg: R00 date: 1000000000 content: A/B/C/a: "content a" - msg: R01 date: 1000000010 content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R02 date: 1000000020 content: A/C/a: "content a" A/C/b: "content b" - msg: R03 date: 1000000030 content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R04 date: 1000000040 content: A/C/a: "content a" A/C/b: "content b" - msg: R05 date: 1000000005 # /!\ we add an earlier version of the 'b' file content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R06 date: 1000000050 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" + tag: "1.0" diff --git a/swh/provenance/tests/data/with-merges.msgpack b/swh/provenance/tests/data/with-merges.msgpack index 52ff778..6501325 100644 Binary files a/swh/provenance/tests/data/with-merges.msgpack and b/swh/provenance/tests/data/with-merges.msgpack differ diff --git a/swh/provenance/tests/data/with-merges_repo.yaml b/swh/provenance/tests/data/with-merges_repo.yaml index 74745c7..f1d841c 100644 --- a/swh/provenance/tests/data/with-merges_repo.yaml +++ b/swh/provenance/tests/data/with-merges_repo.yaml @@ -1,73 +1,76 @@ # generate a git history with a multi-merge revision # *-. 
R08 # |\ \ # | * | R07 # | | | # | | * R06 # | | | # * | | R05 # | | | # * | | R04 # |/ | # * | R03 # | / # * / R02 # |/ # * R01 # | # * R00 - msg: R00 date: 1000000000 content: A/B/C/a: "content a" - msg: R01 date: 1000000010 content: A/B/C/a: "content a" A/B/C/b: "content b" + tag: "0.0" - msg: R02 date: 1000000020 content: A/C/a: "content a" A/C/b: "content b" - msg: R03 date: 1000000030 content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R04 date: 1000000040 content: A/C/a: "content a" A/C/b: "content b" - msg: R05 date: 1000000050 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" - msg: R06 parents: - R01 date: 1000000005 # /!\ we add an earlier version of the 'b' file content: A/B/C/a: "content a" A/B/C/b: "content b" - msg: R07 parents: - R03 date: 1000000035 # /!\ we add an earlier version of the 'b' file content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" + tag: "0.9" - msg: R08 parents: - R05 - R06 - R07 date: 1000000060 content: A/B/C/a: "content a" A/B/C/b: "content b" A/B/c: "content c" + tag: "1.0" diff --git a/swh/provenance/tests/test_archive_interface.py b/swh/provenance/tests/test_archive_interface.py index 624cee8..9d673d8 100644 --- a/swh/provenance/tests/test_archive_interface.py +++ b/swh/provenance/tests/test_archive_interface.py @@ -1,255 +1,254 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter from operator import itemgetter from typing import Any from typing import Counter as TCounter from typing import Dict, Iterable, List, Set, Tuple, Type, Union import pytest from swh.core.db import BaseDb from swh.graph.naive_client import NaiveClient from swh.model.model import ( + SWH_MODEL_OBJECT_TYPES, BaseModel, Content, Directory, DirectoryEntry, + ObjectType, Origin, - OriginVisit, OriginVisitStatus, + Release, Revision, Sha1Git, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID from swh.provenance.archive import ArchiveInterface from swh.provenance.multiplexer.archive import ArchiveMultiplexed from swh.provenance.postgresql.archive import ArchivePostgreSQL from swh.provenance.storage.archive import ArchiveStorage from swh.provenance.swhgraph.archive import ArchiveGraph from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.interface import StorageInterface from swh.storage.postgresql.storage import Storage class ArchiveNoop: storage: StorageInterface def directory_ls(self, id: Sha1Git, minsize: int = 0) -> Iterable[Dict[str, Any]]: return [] def revision_get_some_outbound_edges( self, revision_id: Sha1Git ) -> Iterable[Tuple[Sha1Git, Sha1Git]]: return [] def snapshot_get_heads(self, id: Sha1Git) -> Iterable[Sha1Git]: return [] def check_directory_ls( reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]] ) -> None: for directory in data["directory"]: entries_ref = sorted( reference.directory_ls(directory["id"]), key=itemgetter("name") ) entries = sorted(archive.directory_ls(directory["id"]), key=itemgetter("name")) assert entries_ref == entries def check_revision_get_some_outbound_edges( reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]] ) -> None: for revision in data["revision"]: parents_ref: TCounter[Tuple[Sha1Git, Sha1Git]] = Counter( 
reference.revision_get_some_outbound_edges(revision["id"]) ) parents: TCounter[Tuple[Sha1Git, Sha1Git]] = Counter( archive.revision_get_some_outbound_edges(revision["id"]) ) # Check that all the reference outbound edges are included in the other # archives's outbound edges assert set(parents_ref.items()) <= set(parents.items()) def check_snapshot_get_heads( reference: ArchiveInterface, archive: ArchiveInterface, data: Dict[str, List[dict]] ) -> None: for snapshot in data["snapshot"]: heads_ref: TCounter[Sha1Git] = Counter( reference.snapshot_get_heads(snapshot["id"]) ) heads: TCounter[Sha1Git] = Counter(archive.snapshot_get_heads(snapshot["id"])) assert heads_ref == heads def get_object_class(object_type: str) -> Type[BaseModel]: - if object_type == "origin": - return Origin - elif object_type == "origin_visit": - return OriginVisit - elif object_type == "origin_visit_status": - return OriginVisitStatus - elif object_type == "content": - return Content - elif object_type == "directory": - return Directory - elif object_type == "revision": - return Revision - elif object_type == "snapshot": - return Snapshot - raise ValueError + return SWH_MODEL_OBJECT_TYPES[object_type] def data_to_model(data: Dict[str, List[dict]]) -> Dict[str, List[BaseModel]]: model: Dict[str, List[BaseModel]] = {} for object_type, objects in data.items(): for object in objects: model.setdefault(object_type, []).append( get_object_class(object_type).from_dict(object) ) return model def add_link( edges: Set[ Tuple[ Union[CoreSWHID, ExtendedSWHID, str], Union[CoreSWHID, ExtendedSWHID, str] ] ], - src_obj: Union[Origin, Snapshot, Revision, Directory, Content], + src_obj: Union[Content, Directory, Origin, Release, Revision, Snapshot], dst_id: bytes, dst_type: ExtendedObjectType, ) -> None: swhid = ExtendedSWHID(object_type=dst_type, object_id=dst_id) edges.add((src_obj.swhid(), swhid)) def get_graph_data( data: Dict[str, List[dict]] ) -> Tuple[ List[Union[CoreSWHID, ExtendedSWHID, str]], List[ Tuple[ Union[CoreSWHID, ExtendedSWHID, str], Union[CoreSWHID, ExtendedSWHID, str] ] ], ]: nodes: Set[Union[CoreSWHID, ExtendedSWHID, str]] = set() edges: Set[ Tuple[ Union[CoreSWHID, ExtendedSWHID, str], Union[CoreSWHID, ExtendedSWHID, str] ] ] = set() model = data_to_model(data) for origin in model["origin"]: assert isinstance(origin, Origin) nodes.add(origin.swhid()) for status in model["origin_visit_status"]: assert isinstance(status, OriginVisitStatus) if status.origin == origin.url and status.snapshot is not None: add_link(edges, origin, status.snapshot, ExtendedObjectType.SNAPSHOT) for snapshot in model["snapshot"]: assert isinstance(snapshot, Snapshot) nodes.add(snapshot.swhid()) for branch in snapshot.branches.values(): assert isinstance(branch, SnapshotBranch) if branch.target_type in [TargetType.RELEASE, TargetType.REVISION]: target_type = ( ExtendedObjectType.RELEASE if branch.target_type == TargetType.RELEASE else ExtendedObjectType.REVISION ) add_link(edges, snapshot, branch.target, target_type) for revision in model["revision"]: assert isinstance(revision, Revision) nodes.add(revision.swhid()) # root directory add_link(edges, revision, revision.directory, ExtendedObjectType.DIRECTORY) # parent for parent in revision.parents: add_link(edges, revision, parent, ExtendedObjectType.REVISION) + dir_entry_types = { + "file": ExtendedObjectType.CONTENT, + "dir": ExtendedObjectType.DIRECTORY, + "rev": ExtendedObjectType.REVISION, + } for directory in model["directory"]: assert isinstance(directory, Directory) 
nodes.add(directory.swhid()) for entry in directory.entries: assert isinstance(entry, DirectoryEntry) - if entry.type == "file": - target_type = ExtendedObjectType.CONTENT - elif entry.type == "dir": - target_type = ExtendedObjectType.DIRECTORY - elif entry.type == "rev": - target_type = ExtendedObjectType.REVISION - else: - assert False, "unknown directory entry type" - add_link(edges, directory, entry.target, target_type) + add_link(edges, directory, entry.target, dir_entry_types[entry.type]) for content in model["content"]: assert isinstance(content, Content) nodes.add(content.swhid()) + object_type = { + ObjectType.CONTENT: ExtendedObjectType.CONTENT, + ObjectType.DIRECTORY: ExtendedObjectType.DIRECTORY, + ObjectType.REVISION: ExtendedObjectType.REVISION, + ObjectType.RELEASE: ExtendedObjectType.RELEASE, + ObjectType.SNAPSHOT: ExtendedObjectType.SNAPSHOT, + } + for release in model["release"]: + assert isinstance(release, Release) + nodes.add(release.swhid()) + + if release.target is not None: + add_link(edges, release, release.target, object_type[release.target_type]) + return list(nodes), list(edges) @pytest.mark.parametrize( "repo", ("cmdbts2", "out-of-order", "with-merges"), ) def test_archive_interface(repo: str, archive: ArchiveInterface) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(archive.storage, data) # test against ArchiveStorage archive_api = ArchiveStorage(archive.storage) check_directory_ls(archive, archive_api, data) check_revision_get_some_outbound_edges(archive, archive_api, data) check_snapshot_get_heads(archive, archive_api, data) # test against ArchivePostgreSQL assert isinstance(archive.storage, Storage) dsn = archive.storage.get_db().conn.dsn with BaseDb.connect(dsn).conn as conn: BaseDb.adapt_conn(conn) archive_direct = ArchivePostgreSQL(conn) check_directory_ls(archive, archive_direct, data) check_revision_get_some_outbound_edges(archive, archive_direct, data) check_snapshot_get_heads(archive, archive_direct, data) # test against ArchiveGraph nodes, edges = get_graph_data(data) graph = NaiveClient(nodes=nodes, edges=edges) archive_graph = ArchiveGraph(graph, archive.storage) with pytest.raises(NotImplementedError): check_directory_ls(archive, archive_graph, data) check_revision_get_some_outbound_edges(archive, archive_graph, data) check_snapshot_get_heads(archive, archive_graph, data) # test against ArchiveMultiplexer archive_multiplexed = ArchiveMultiplexed( [("noop", ArchiveNoop()), ("graph", archive_graph), ("api", archive_api)] ) check_directory_ls(archive, archive_multiplexed, data) check_revision_get_some_outbound_edges(archive, archive_multiplexed, data) check_snapshot_get_heads(archive, archive_multiplexed, data) def test_noop_multiplexer(): archive = ArchiveMultiplexed([("noop", ArchiveNoop())]) assert not archive.directory_ls(Sha1Git(b"abcd")) assert not archive.revision_get_some_outbound_edges(Sha1Git(b"abcd")) assert not archive.snapshot_get_heads(Sha1Git(b"abcd")) diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py index 18ded81..53bf275 100644 --- a/swh/provenance/tests/test_cli.py +++ b/swh/provenance/tests/test_cli.py @@ -1,164 +1,164 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, List, Set from 
_pytest.monkeypatch import MonkeyPatch from click.testing import CliRunner import psycopg2.extensions import pytest from swh.core.cli import swh as swhmain import swh.core.cli.db # noqa ; ensure cli is loaded from swh.core.db import BaseDb from swh.core.db.db_utils import init_admin_extensions from swh.model.hashutil import MultiHash import swh.provenance.cli # noqa ; ensure cli is loaded from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.interface import StorageInterface from .conftest import get_datafile from .test_utils import invoke, write_configuration_path def test_cli_swh_db_help() -> None: # swhmain.add_command(provenance_cli) result = CliRunner().invoke(swhmain, ["provenance", "-h"]) assert result.exit_code == 0 assert "Commands:" in result.output commands = result.output.split("Commands:")[1] for command in ( "find-all", "find-first", "iter-frontiers", "iter-origins", "iter-revisions", ): assert f" {command} " in commands TABLES = { "dbflavor", "dbmodule", "dbversion", "content", "content_in_revision", "content_in_directory", "directory", "directory_in_revision", "location", "origin", "revision", "revision_before_revision", "revision_in_origin", } @pytest.mark.parametrize( "flavor, dbtables", (("with-path", TABLES), ("without-path", TABLES)) ) def test_cli_db_create_and_init_db_with_flavor( monkeypatch: MonkeyPatch, postgresql: psycopg2.extensions.connection, flavor: str, dbtables: Set[str], ) -> None: """Test that 'swh db init provenance' works with flavors for both with-path and without-path flavors""" dbname = f"{flavor}-db" # DB creation using 'swh db create' db_params = postgresql.get_dsn_parameters() monkeypatch.setenv("PGHOST", db_params["host"]) monkeypatch.setenv("PGUSER", db_params["user"]) monkeypatch.setenv("PGPORT", db_params["port"]) result = CliRunner().invoke(swhmain, ["db", "create", "-d", dbname, "provenance"]) assert result.exit_code == 0, result.output # DB init using 'swh db init' result = CliRunner().invoke( swhmain, ["db", "init", "-d", dbname, "--flavor", flavor, "provenance"] ) assert result.exit_code == 0, result.output assert f"(flavor {flavor})" in result.output db_params["dbname"] = dbname cnx = BaseDb.connect(**db_params).conn # check the DB looks OK (check for db_flavor and expected tables) with cnx.cursor() as cur: cur.execute("select swh_get_dbflavor()") assert cur.fetchone() == (flavor,) cur.execute( "select table_name from information_schema.tables " "where table_schema = 'public' " f"and table_catalog = '{dbname}'" ) tables = set(x for (x,) in cur.fetchall()) assert tables == dbtables def test_cli_init_db_default_flavor(postgresql: psycopg2.extensions.connection) -> None: "Test that 'swh db init provenance' defaults to a with-path flavored DB" dbname = postgresql.dsn init_admin_extensions("swh.provenance", dbname) result = CliRunner().invoke(swhmain, ["db", "init", "-d", dbname, "provenance"]) assert result.exit_code == 0, result.output with postgresql.cursor() as cur: cur.execute("select swh_get_dbflavor()") assert cur.fetchone() == ("with-path",) @pytest.mark.parametrize( "subcommand", (["origin", "from-csv"], ["iter-origins"]), ) def test_cli_origin_from_csv( swh_storage: StorageInterface, subcommand: List[str], swh_storage_backend_config: Dict, provenance, tmp_path, ): repo = "cmdbts2" origin_url = f"https://{repo}" data = load_repo_data(repo) fill_storage(swh_storage, data) - assert len(data["origin"]) == 1 - assert {"url": origin_url} in data["origin"] + assert len(data["origin"]) >= 1 + assert 
origin_url in [o["url"] for o in data["origin"]] cfg = { "provenance": { "archive": { "cls": "api", "storage": swh_storage_backend_config, }, "storage": { "cls": "postgresql", # "db": provenance.storage.conn.dsn, "db": provenance.storage.conn.get_dsn_parameters(), }, }, } config_path = write_configuration_path(cfg, tmp_path) csv_filepath = get_datafile("origins.csv") subcommand = subcommand + [csv_filepath] result = invoke(subcommand, config_path) assert result.exit_code == 0, f"Unexpected result: {result.output}" origin_sha1 = MultiHash.from_data( origin_url.encode(), hash_names=["sha1"] ).digest()["sha1"] actual_result = provenance.storage.origin_get([origin_sha1]) assert actual_result == {origin_sha1: origin_url} diff --git a/swh/provenance/tests/test_journal_client.py b/swh/provenance/tests/test_journal_client.py index 85225b8..c0fc79c 100644 --- a/swh/provenance/tests/test_journal_client.py +++ b/swh/provenance/tests/test_journal_client.py @@ -1,135 +1,135 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict from confluent_kafka import Consumer import pytest from swh.model.hashutil import MultiHash from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.interface import StorageInterface from .test_utils import invoke, write_configuration_path @pytest.fixture def swh_storage_backend_config(swh_storage_backend_config, kafka_server, kafka_prefix): writer_config = { "cls": "kafka", "brokers": [kafka_server], "client_id": "kafka_writer", "prefix": kafka_prefix, "anonymize": False, } yield {**swh_storage_backend_config, "journal_writer": writer_config} def test_cli_origin_from_journal_client( swh_storage: StorageInterface, swh_storage_backend_config: Dict, kafka_prefix: str, kafka_server: str, consumer: Consumer, tmp_path: str, provenance, provenance_postgresql, ) -> None: """Test origin journal client cli""" # Prepare storage data data = load_repo_data("cmdbts2") - assert len(data["origin"]) == 1 + assert len(data["origin"]) >= 1 origin_url = data["origin"][0]["url"] fill_storage(swh_storage, data) # Prepare configuration for cli call swh_storage_backend_config.pop("journal_writer", None) # no need for that config storage_config_dict = swh_storage_backend_config cfg = { "journal_client": { "cls": "kafka", "brokers": [kafka_server], "group_id": "toto", "prefix": kafka_prefix, "stop_on_eof": True, }, "provenance": { "archive": { "cls": "api", "storage": storage_config_dict, }, "storage": { "cls": "postgresql", "db": provenance_postgresql.get_dsn_parameters(), }, }, } config_path = write_configuration_path(cfg, tmp_path) # call the cli 'swh provenance origin from-journal' result = invoke(["origin", "from-journal"], config_path) assert result.exit_code == 0, f"Unexpected result: {result.output}" origin_sha1 = MultiHash.from_data( origin_url.encode(), hash_names=["sha1"] ).digest()["sha1"] actual_result = provenance.storage.origin_get([origin_sha1]) assert actual_result == {origin_sha1: origin_url} def test_cli_revision_from_journal_client( swh_storage: StorageInterface, swh_storage_backend_config: Dict, kafka_prefix: str, kafka_server: str, consumer: Consumer, tmp_path: str, provenance, provenance_postgresql, ) -> None: """Test revision journal client cli""" # Prepare storage data data = load_repo_data("cmdbts2") - assert len(data["origin"]) == 1 
+ assert len(data["origin"]) >= 1 fill_storage(swh_storage, data) # Prepare configuration for cli call swh_storage_backend_config.pop("journal_writer", None) # no need for that config storage_config_dict = swh_storage_backend_config cfg = { "journal_client": { "cls": "kafka", "brokers": [kafka_server], "group_id": "toto", "prefix": kafka_prefix, "stop_on_eof": True, }, "provenance": { "archive": { "cls": "api", "storage": storage_config_dict, }, "storage": { "cls": "postgresql", "db": provenance_postgresql.get_dsn_parameters(), }, }, } config_path = write_configuration_path(cfg, tmp_path) revisions = [rev["id"] for rev in data["revision"]] result = provenance.storage.revision_get(revisions) assert not result # call the cli 'swh provenance revision from-journal' cli_result = invoke(["revision", "from-journal"], config_path) assert cli_result.exit_code == 0, f"Unexpected result: {cli_result.output}" result = provenance.storage.revision_get(revisions) assert set(result.keys()) == set(revisions) diff --git a/swh/provenance/tests/test_origin_iterator.py b/swh/provenance/tests/test_origin_iterator.py index 2c4e3ac..020a07d 100644 --- a/swh/provenance/tests/test_origin_iterator.py +++ b/swh/provenance/tests/test_origin_iterator.py @@ -1,42 +1,46 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.provenance.origin import CSVOriginIterator from swh.provenance.tests.conftest import fill_storage, load_repo_data from swh.storage.algos.origin import ( iter_origin_visit_statuses, iter_origin_visits, iter_origins, ) from swh.storage.interface import StorageInterface @pytest.mark.parametrize( "repo", ( "cmdbts2", "out-of-order", ), ) def test_origin_iterator(swh_storage: StorageInterface, repo: str) -> None: """Test CSVOriginIterator""" data = load_repo_data(repo) fill_storage(swh_storage, data) origins_csv = [] for origin in iter_origins(swh_storage): for visit in iter_origin_visits(swh_storage, origin.url): if visit.visit is not None: for status in iter_origin_visit_statuses( swh_storage, origin.url, visit.visit ): if status.snapshot is not None: origins_csv.append((status.origin, status.snapshot)) origins = list(CSVOriginIterator(origins_csv)) assert origins - assert len(origins) == len(data["origin"]) + # there can be more origin entries, depending on the additional visits yaml + # file used during dataset generation (see data/generate_storage_from_git) + assert len(origins) >= len(data["origin"]) + # but their URLs must all come from the dataset's origins + assert set(o.url for o in origins) <= set(o["url"] for o in data["origin"])
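A final note on the `ts2dt` rewrite in `conftest.py` at the top of this diff: delegating to `TimestampWithTimezone` preserves the behaviour of the old hand-rolled conversion. A quick equivalence sketch (the date dict below uses the legacy offset-in-minutes form; the assumption that `from_dict` still accepts it alongside the newer `offset_bytes` form is worth checking against your `swh.model` version):

```python
from datetime import datetime, timedelta, timezone

from swh.model.model import TimestampWithTimezone

# a revision date dict as found in the msgpack datasets (legacy form)
ts = {"timestamp": {"seconds": 1000000010, "microseconds": 0}, "offset": 120}

# new conversion, as now done by ts2dt()
new_dt = TimestampWithTimezone.from_dict(ts).to_datetime()

# old hand-rolled conversion removed by this diff
old_dt = datetime.fromtimestamp(
    ts["timestamp"]["seconds"], timezone(timedelta(minutes=ts["offset"]))
).replace(microsecond=ts["timestamp"]["microseconds"])

assert new_dt == old_dt  # same instant, same offset
```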