diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py index aa54503..495b528 100644 --- a/swh/provenance/tests/conftest.py +++ b/swh/provenance/tests/conftest.py @@ -1,283 +1,157 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os import path -import re -from typing import Any, Dict, Iterable, Iterator, List, Optional +from typing import Any, Dict, Iterable, Iterator import msgpack import psycopg2 import pytest -from typing_extensions import TypedDict from swh.journal.serializers import msgpack_ext_hook -from swh.model.hashutil import hash_to_bytes -from swh.model.model import Sha1Git from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.provenance import get_provenance, get_provenance_storage from swh.provenance.api.client import RemoteProvenanceStorage import swh.provenance.api.server as server from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface from swh.provenance.postgresql.archive import ArchivePostgreSQL from swh.provenance.storage.archive import ArchiveStorage from swh.storage.postgresql.storage import Storage from swh.storage.replay import process_replay_objects @pytest.fixture( params=[ "with-path", "without-path", "with-path-denormalized", "without-path-denormalized", ] ) def populated_db( request, # TODO: add proper type annotation postgresql: psycopg2.extensions.connection, ) -> Dict[str, str]: """return a working and initialized provenance db""" from swh.core.cli.db import populate_database_for_package # flavor = "with-path" if request.param == "client-server" else request.param populate_database_for_package( "swh.provenance", postgresql.dsn, flavor=request.param ) return { k: v for (k, v) in (item.split("=") for item in postgresql.dsn.split()) if k != "options" } # the Flask app used as server in these tests @pytest.fixture def app(populated_db: Dict[str, str]): assert hasattr(server, "storage") server.storage = get_provenance_storage(cls="local", db=populated_db) yield server.app # the RPCClient class used as client in these tests @pytest.fixture def swh_rpc_client_class(): return RemoteProvenanceStorage @pytest.fixture(params=["local", "remote"]) def provenance( request, # TODO: add proper type annotation populated_db: Dict[str, str], swh_rpc_client: RemoteProvenanceStorage, ) -> ProvenanceInterface: """return a working and initialized ProvenanceInterface object""" if request.param == "remote": from swh.provenance.provenance import Provenance assert isinstance(swh_rpc_client, ProvenanceStorageInterface) return Provenance(swh_rpc_client) else: # in test sessions, we DO want to raise any exception occurring at commit time prov = get_provenance(cls=request.param, db=populated_db, raise_on_commit=True) return prov
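For reference, the "local" branch of the `provenance` fixture boils down to a single `get_provenance` call. A minimal sketch, assuming only that a provenance database matching the DSN dict exists (the dict below is a placeholder standing in for what `populated_db` returns):

```python
# Illustrative sketch of what the "local" case of the fixture resolves to.
from swh.provenance import get_provenance

prov = get_provenance(
    cls="local",
    db={"dbname": "provenance", "host": "localhost"},  # placeholder DSN dict
    raise_on_commit=True,  # make commit-time errors fail the test loudly
)
```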
""" for obj_type in ( "content", "skipped_content", "directory", "revision", "release", "snapshot", "origin", "origin_visit", "origin_visit_status", ): getattr(swh_storage, f"{obj_type}_add")(TEST_OBJECTS[obj_type]) return swh_storage @pytest.fixture def archive_direct(swh_storage_with_objects: Storage) -> ArchiveInterface: return ArchivePostgreSQL(swh_storage_with_objects.get_db().conn) @pytest.fixture def archive_api(swh_storage_with_objects: Storage) -> ArchiveInterface: return ArchiveStorage(swh_storage_with_objects) @pytest.fixture(params=["archive", "db"]) def archive(request, swh_storage_with_objects: Storage) -> Iterator[ArchiveInterface]: """Return a ArchivePostgreSQL based StorageInterface object""" # this is a workaround to prevent tests from hanging because of an unclosed # transaction. # TODO: refactor the ArchivePostgreSQL to properly deal with # transactions and get rid of this fixture if request.param == "db": archive = ArchivePostgreSQL(conn=swh_storage_with_objects.get_db().conn) yield archive archive.conn.rollback() else: yield ArchiveStorage(swh_storage_with_objects) def get_datafile(fname: str) -> str: return path.join(path.dirname(__file__), "data", fname) def load_repo_data(repo: str) -> Dict[str, Any]: data: Dict[str, Any] = {} with open(get_datafile(f"{repo}.msgpack"), "rb") as fobj: unpacker = msgpack.Unpacker( fobj, raw=False, ext_hook=msgpack_ext_hook, strict_map_key=False, timestamp=3, # convert Timestamp in datetime objects (tz UTC) ) for objtype, objd in unpacker: data.setdefault(objtype, []).append(objd) return data def filter_dict(d: Dict[Any, Any], keys: Iterable[Any]) -> Dict[Any, Any]: return {k: v for (k, v) in d.items() if k in keys} def fill_storage(storage: Storage, data: Dict[str, Any]) -> None: process_replay_objects(data, storage=storage) - - -class SynthRelation(TypedDict): - prefix: Optional[str] - path: str - src: Sha1Git - dst: Sha1Git - rel_ts: float - - -class SynthRevision(TypedDict): - sha1: Sha1Git - date: float - msg: str - R_C: List[SynthRelation] - R_D: List[SynthRelation] - D_C: List[SynthRelation] - - -def synthetic_result(filename: str) -> Iterator[SynthRevision]: - """Generates dict representations of synthetic revisions found in the synthetic - file (from the data/ directory) given as argument of the generator. - - Generated SynthRevision (typed dict) with the following elements: - - "sha1": (Sha1Git) sha1 of the revision, - "date": (float) timestamp of the revision, - "msg": (str) commit message of the revision, - "R_C": (list) new R---C relations added by this revision - "R_D": (list) new R-D relations added by this revision - "D_C": (list) new D-C relations added by this revision - - Each relation above is a SynthRelation typed dict with: - - "path": (str) location - "src": (Sha1Git) sha1 of the source of the relation - "dst": (Sha1Git) sha1 of the destination of the relation - "rel_ts": (float) timestamp of the target of the relation - (related to the timestamp of the revision) - - """ - - with open(get_datafile(filename), "r") as fobj: - yield from _parse_synthetic_file(fobj) - - -def _parse_synthetic_file(fobj: Iterable[str]) -> Iterator[SynthRevision]: - """Read a 'synthetic' file and generate a dict representation of the synthetic - revision for each revision listed in the synthetic file. 
- """ - regs = [ - "(?PR[0-9]{2,4})?", - "(?P[^| ]*)", - "([+] )?(?P[^| +]*?)[/]?", - "(?P[RDC]) (?P[0-9a-z]{40})", - "(?P-?[0-9]+(.[0-9]+)?)", - ] - regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") - current_rev: List[dict] = [] - for m in (regex.match(line) for line in fobj): - if m: - d = m.groupdict() - if d["revname"]: - if current_rev: - yield _mk_synth_rev(current_rev) - current_rev.clear() - current_rev.append(d) - if current_rev: - yield _mk_synth_rev(current_rev) - - -def _mk_synth_rev(synth_rev: List[Dict[str, str]]) -> SynthRevision: - assert synth_rev[0]["type"] == "R" - rev = SynthRevision( - sha1=hash_to_bytes(synth_rev[0]["sha1"]), - date=float(synth_rev[0]["ts"]), - msg=synth_rev[0]["revname"], - R_C=[], - R_D=[], - D_C=[], - ) - current_path = None - # path of the last R-D relation we parsed, used a prefix for next D-C - # relations - - for row in synth_rev[1:]: - if row["reltype"] == "R---C": - assert row["type"] == "C" - rev["R_C"].append( - SynthRelation( - prefix=None, - path=row["path"], - src=rev["sha1"], - dst=hash_to_bytes(row["sha1"]), - rel_ts=float(row["ts"]), - ) - ) - current_path = None - elif row["reltype"] == "R-D": - assert row["type"] == "D" - rev["R_D"].append( - SynthRelation( - prefix=None, - path=row["path"], - src=rev["sha1"], - dst=hash_to_bytes(row["sha1"]), - rel_ts=float(row["ts"]), - ) - ) - current_path = row["path"] - elif row["reltype"] == "D-C": - assert row["type"] == "C" - rev["D_C"].append( - SynthRelation( - prefix=current_path, - path=row["path"], - src=rev["R_D"][-1]["dst"], - dst=hash_to_bytes(row["sha1"]), - rel_ts=float(row["ts"]), - ) - ) - return rev diff --git a/swh/provenance/tests/data/README.md b/swh/provenance/tests/data/README.md index 2eb0da0..81dff87 100644 --- a/swh/provenance/tests/data/README.md +++ b/swh/provenance/tests/data/README.md @@ -1,166 +1,166 @@ # Provenance Index Test Dataset This directory contains datasets used by `test_provenance_heurstics` tests of the provenance index database. Each dataset `xxx` consist in several parts: - a description of a git repository as a yaml file named `xxx_repo.yaml`, - a msgpack file containing storage objects for the given repository, from which the storage is filled before each test using these data, and - a set of synthetic files, named `synthetic_xxx_(lower|upper)_.txt`, describing the expected result in the provenance database if ingested with the flag `lower` set or not set, and the `mindepth` value (integer, most often `1` or `2`). ## Git repos description file The description of a git repository is a yaml file which contains a list dicts, each one representing a git revision to add (linearly) in the git repo used a base for the dataset. Each dict consist in a structure like: ``` yaml - msg: R00 date: 1000000000 content: A/B/C/a: "content a" ``` this example will generate a git commit with the commit message "R00", the author and committer date 1000000000 (given as a unix timestamp), and a one file which path is `A/B/C/a` and content is "content a". The file is parsed to create git revisions in a temporary git repository, in order of appearance in the yaml file (so one may create an git repository with 'out-of-order' commits). There is no way of creating branches and merges for now. The tool to generate this git repo is `generate_repo.py`: ``` python generate_repo.py --help Usage: generate_repo.py [OPTIONS] INPUT_FILE OUTPUT_DIR Options: -C, --clean-output / --no-clean-output --help Show this message and exit. 
The `generate_repo.py` tool generates a git repository in `OUTPUT_DIR` and prints a template `synthetic` file on its standard output, which can be used to ease writing the expected `synthetic` files. Typical usage will be: ``` python generate_repo.py repo2_repo.yaml repo2 > synthetic_repo2_template.txt ``` Note that the hashes of the git objects (revisions, directories and contents) depend only on the content of the input yaml file. Calling the tool twice on the same input file should generate the exact same git repo twice. Also note that the tool will add a branch at each revision (using the commit message as branch name), to make it easier to reference any point in the git history. ## Msgpack dump of the storage This file contains a set of storage objects (`Revision`, `Content` and `Directory`) and is usually generated from a local git repository (typically the one generated by the previous command) using the `generate_storage_from_git.py` tool: ``` python generate_storage_from_git.py --help Usage: generate_storage_from_git.py [OPTIONS] GIT_REPO simple tool to generate the CMDBTS.msgpack dataset filed used in tests Options: -r, --head TEXT head revision to start from -o, --output TEXT output file --help Show this message and exit. ``` Typical usage would be, using the git repository `repo2` created previously: ``` python generate_storage_from_git.py repo2 Revision hash for master is 8363e8e98751dc9f264d2fedd6b829ad4b1218b0 Wrote 86 objects in repo2.msgpack ``` ### Adding extra visits/snapshots It is also possible to generate a storage from a git repo with extra origin -visits, using the `--visit` option of the `generate_repo_from_git` tool. +visits, using the `--visit` option of the `generate_storage_from_git` tool. This option expects a yaml file as argument, describing the extra visits (and snapshots) you want to add to the storage. The format is simple, for example: ``` # a visit pattern scenario for the 'repo_with_merges' repo - origin: http://repo_with_merges/1/ date: 1000000015 branches: - R01 ``` will create an OriginVisit (at the given date) for the given origin URL (the Origin will be created as well), with a `Snapshot` including the listed branches. ## Synthetic files These files describe the expected content of the provenance database for each revision (in order of ingestion). The `generate_repo.py` tool will produce a template synthetic file like: ``` 1000000000.0 b582a17b3fc37f72fc57877616f85c3f0abed064 R00 R00 | | | R b582a17b3fc37f72fc57877616f85c3f0abed064 | 1000000000.0 | | . | D a4cb5e6b2831f7e8eef0e6e08e43d642c97303a1 | 0.0 | | A | D 1c8d9fd9afa7e5a2cf52a3db6f05dc5c3a1ca86b | 0.0 | | A/B | D 36876d475197b5ad86ad592e8e28818171455f16 | 0.0 | | A/B/C | D 98f7a4a23d8df1fb1a5055facae2aff9b2d0a8b3 | 0.0 | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 1000000010.0 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe R01 R01 | | | R 8259eeae2ff5046f0bb4393d6e894fe6d7e01bfe | 1000000010.0 | | . | D b3cf11b22c9f93c3c494cf90ab072f394155072d | 0.0 | | A | D baca735bf8b8720131b4bfdb47c51631a9260348 | 0.0 | | A/B | D 4b28979d88ed209a09c272bcc80f69d9b18339c2 | 0.0 | | A/B/C | D c9cabe7f49012e3fdef6ac6b929efb5654f583cf | 0.0 | | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0.0 | | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0.0 [...] ``` where all the contents and directories of each revision are listed; it's then the responsibility of the user to create the expected synthetic file for a given heuristics configuration.
For example, the 2 revisions above are to be adapted, for the `(lower=True, mindepth=1)` case, as: ``` 1000000000 c0d8929936631ecbcf9147be6b8aa13b13b014e4 R00 R00 | | | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 | 1000000000 | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | 0 1000000010 1444db96cbd8cd791abe83527becee73d3c64e86 R01 R01 | | | R 1444db96cbd8cd791abe83527becee73d3c64e86 | 1000000010 | R---C | A/B/C/a | C 20329687bb9c1231a7e05afe86160343ad49b494 | -10 | R---C | A/B/C/b | C 50e9cdb03f9719261dd39d7f2920b906db3711a3 | 0 ``` diff --git a/swh/provenance/tests/data/origin-revision_with-merges_visits-01.txt b/swh/provenance/tests/data/origin-revision_with-merges_visits-01.txt new file mode 100644 index 0000000..87a99c3 --- /dev/null +++ b/swh/provenance/tests/data/origin-revision_with-merges_visits-01.txt @@ -0,0 +1,67 @@ +1000000015 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 http://repo_with_merges/1/ +http://repo_with_merges/1/ | | | O 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 + | | | S 8d6b9ac022cae46a59b4f5b9285f0eea9736dae4 + | O-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000025 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 http://repo_with_merges/1/ +http://repo_with_merges/1/ | | | O 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 + | | | S 537a4f35fc36eb08a065952210958c8095fea3c7 + | O-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + | O-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000035 ed02cb79ed65a650cee6907323f986cff4a8428f http://repo_with_merges/2/ +http://repo_with_merges/2/ | | | O ed02cb79ed65a650cee6907323f986cff4a8428f + | | | S 5aa910026ddbbea2971fe1b89725bc5f076637e3 + | O-R | R05 | R 65e58853df939b318c106c4c1f55acaf8b41c74c + | R-R | R04 | R 0d66eadcc15e0d7f6cfd4289329a7749a1309982 + | R-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + | O-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000045 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 http://repo_with_merges/1/ +http://repo_with_merges/1/ | | | O 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 + | | | S 8193e0324e7a21181331c13588b3e348022b3268 + | O-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + | O-R | R07 | R fff0089fad98e8f5b46ec5c9025a20a602851ba6 + | R-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000055 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 http://repo_with_merges/1/ +http://repo_with_merges/1/ | | | O 3acef14580ea7fd42840ee905c5ce2b0ef9e8175 + | | | S ee9ea8b0ba40c9012d7d8103234946a21b66a729 + | O-R | R08 | R 7c8f29237dded4f9d265e46ec7066503e7858e87 + | R-R | R07 | R fff0089fad98e8f5b46ec5c9025a20a602851ba6 
+ | R-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R05 | R 65e58853df939b318c106c4c1f55acaf8b41c74c + | R-R | R04 | R 0d66eadcc15e0d7f6cfd4289329a7749a1309982 + | R-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 + +1000000065 ed02cb79ed65a650cee6907323f986cff4a8428f http://repo_with_merges/2/ +http://repo_with_merges/2/ | | | O ed02cb79ed65a650cee6907323f986cff4a8428f + | | | S ee9ea8b0ba40c9012d7d8103234946a21b66a729 + | O-R | R08 | R 7c8f29237dded4f9d265e46ec7066503e7858e87 + | R-R | R07 | R fff0089fad98e8f5b46ec5c9025a20a602851ba6 + | R-R | R06 | R 72d92d41a9095db2dd6b8fb1c62d92c8251753ff + | R-R | R05 | R 65e58853df939b318c106c4c1f55acaf8b41c74c + | R-R | R04 | R 0d66eadcc15e0d7f6cfd4289329a7749a1309982 + | R-R | R03 | R 20f4da0f48609d9f7f908ebbcac3b3741a0f25cb + | R-R | R02 | R 1c533587277731236616cac0d44f3b46c1da0f8a + | R-R | R01 | R 1444db96cbd8cd791abe83527becee73d3c64e86 + | R-R | R00 | R c0d8929936631ecbcf9147be6b8aa13b13b014e4 diff --git a/swh/provenance/tests/data/with-merges.msgpack b/swh/provenance/tests/data/with-merges.msgpack index 3accba0..52ff778 100644 Binary files a/swh/provenance/tests/data/with-merges.msgpack and b/swh/provenance/tests/data/with-merges.msgpack differ diff --git a/swh/provenance/tests/test_origin_revision_layer.py b/swh/provenance/tests/test_origin_revision_layer.py new file mode 100644 index 0000000..9bb66d8 --- /dev/null +++ b/swh/provenance/tests/test_origin_revision_layer.py @@ -0,0 +1,192 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import re +from typing import Any, Dict, Iterable, Iterator, List, Set + +import pytest +from typing_extensions import TypedDict + +from swh.model.hashutil import hash_to_bytes +from swh.model.model import Sha1Git +from swh.provenance.archive import ArchiveInterface +from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType +from swh.provenance.model import OriginEntry +from swh.provenance.origin import origin_add +from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data +from swh.storage.postgresql.storage import Storage + + +class SynthRelation(TypedDict): + src: Sha1Git + dst: Sha1Git + name: str + + +class SynthOrigin(TypedDict): + sha1: Sha1Git + url: str + snap: Sha1Git + O_R: List[SynthRelation] + R_R: List[SynthRelation] + + +def synthetic_origin_revision_result(filename: str) -> Iterator[SynthOrigin]: + """Generates dict representations of synthetic origin visits found in the + synthetic file (from the data/ directory) given as argument of the generator. 
+ + Generated SynthOrigin (typed dict) with the following elements: + + "sha1": (Sha1Git) sha1 of the origin, + "url": (str) url of the origin, + "snap": (Sha1Git) sha1 of the visit's snapshot, + "O_R": (list) new O-R relations added by this origin visit + "R_R": (list) new R-R relations added by this origin visit + + Each relation above is a SynthRelation typed dict with: + + "src": (Sha1Git) sha1 of the source of the relation + "dst": (Sha1Git) sha1 of the destination of the relation + + """ + + with open(get_datafile(filename), "r") as fobj: + yield from _parse_synthetic_origin_revision_file(fobj) + + +def _parse_synthetic_origin_revision_file(fobj: Iterable[str]) -> Iterator[SynthOrigin]: + """Read a 'synthetic' file and generate a dict representation of the synthetic + origin visit for each snapshot listed in the synthetic file. + """ + regs = [ + "(?P<url>[^ ]+)?", + "(?P<reltype>[^| ]*)", + "(?P<revname>R[0-9]{2,4})?", + "(?P<type>[ORS]) (?P<sha1>[0-9a-f]{40})", + ] + regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") + current_org: List[dict] = [] + for m in (regex.match(line) for line in fobj): + if m: + d = m.groupdict() + if d["url"]: + if current_org: + yield _mk_synth_org(current_org) + current_org.clear() + current_org.append(d) + if current_org: + yield _mk_synth_org(current_org) + + +def _mk_synth_org(synth_org: List[Dict[str, str]]) -> SynthOrigin: + assert synth_org[0]["type"] == "O" + assert synth_org[1]["type"] == "S" + org = SynthOrigin( + sha1=hash_to_bytes(synth_org[0]["sha1"]), + url=synth_org[0]["url"], + snap=hash_to_bytes(synth_org[1]["sha1"]), + O_R=[], + R_R=[], + ) + + for row in synth_org[2:]: + if row["reltype"] == "O-R": + assert row["type"] == "R" + org["O_R"].append( + SynthRelation( + src=org["sha1"], + dst=hash_to_bytes(row["sha1"]), + name=row["revname"], + ) + ) + elif row["reltype"] == "R-R": + assert row["type"] == "R" + org["R_R"].append( + SynthRelation( + src=org["O_R"][-1]["dst"], + dst=hash_to_bytes(row["sha1"]), + name=row["revname"], + ) + ) + return org + + +@pytest.mark.parametrize( + "repo, visit", + (("with-merges", "visits-01"),), +) +def test_origin_revision_layer( + provenance: ProvenanceInterface, + swh_storage: Storage, + archive: ArchiveInterface, + repo: str, + visit: str, +) -> None: + # read data/README.md for more details on how these datasets are generated + data = load_repo_data(repo) + fill_storage(swh_storage, data) + syntheticfile = get_datafile(f"origin-revision_{repo}_{visit}.txt") + + origins = [ + {"url": status["origin"], "snap": status["snapshot"]} + for status in data["origin_visit_status"] + if status["snapshot"] is not None + ] + + rows: Dict[str, Set[Any]] = { + "origin": set(), + "revision_in_origin": set(), + "revision_before_revision": set(), + "revision": set(), + } + + for synth_org in synthetic_origin_revision_result(syntheticfile): + for origin in ( + org + for org in origins + if org["url"] == synth_org["url"] and org["snap"] == synth_org["snap"] + ): + entry = OriginEntry(url=origin["url"], snapshot=origin["snap"]) + origin_add(provenance, archive, [entry]) + + # each "entry" in the synth file is one new origin visit + rows["origin"].add(synth_org["sha1"]) + assert rows["origin"] == provenance.storage.entity_get_all( + EntityType.ORIGIN + ), synth_org["url"] + # check the url of the origin + assert ( + provenance.storage.origin_get([synth_org["sha1"]])[synth_org["sha1"]] + == synth_org["url"] + ), synth_org["snap"] + + # this origin visit might have added new revision objects + rows["revision"] |= set(x["dst"] for x in
synth_org["O_R"]) + rows["revision"] |= set(x["dst"] for x in synth_org["R_R"]) + assert rows["revision"] == provenance.storage.entity_get_all( + EntityType.REVISION + ), synth_org["snap"] + + # check for O-R (head) entries + # these are added in the revision_in_origin relation + rows["revision_in_origin"] |= set( + (x["dst"], x["src"], None) for x in synth_org["O_R"] + ) + assert rows["revision_in_origin"] == { + (rel.src, rel.dst, rel.path) + for rel in provenance.storage.relation_get_all(RelationType.REV_IN_ORG) + }, synth_org["snap"] + + # check for R-R entries + # these are added in the revision_before_revision relation + rows["revision_before_revision"] |= set( + (x["dst"], x["src"], None) for x in synth_org["R_R"] + ) + assert rows["revision_before_revision"] == { + (rel.src, rel.dst, rel.path) + for rel in provenance.storage.relation_get_all( + RelationType.REV_BEFORE_REV + ) + }, synth_org["snap"] diff --git a/swh/provenance/tests/test_provenance_heuristics.py b/swh/provenance/tests/test_revision_content_layer.py similarity index 71% rename from swh/provenance/tests/test_provenance_heuristics.py rename to swh/provenance/tests/test_revision_content_layer.py index 7eb1cd9..77e36d3 100644 --- a/swh/provenance/tests/test_provenance_heuristics.py +++ b/swh/provenance/tests/test_revision_content_layer.py @@ -1,325 +1,447 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, List, Optional, Set, Tuple +import re +from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple import pytest +from typing_extensions import TypedDict from swh.model.hashutil import hash_to_bytes +from swh.model.model import Sha1Git from swh.provenance.archive import ArchiveInterface from swh.provenance.interface import EntityType, ProvenanceInterface, RelationType from swh.provenance.model import RevisionEntry from swh.provenance.revision import revision_add -from swh.provenance.tests.conftest import ( - fill_storage, - get_datafile, - load_repo_data, - synthetic_result, -) +from swh.provenance.tests.conftest import fill_storage, get_datafile, load_repo_data from swh.provenance.tests.test_provenance_db import ts2dt from swh.storage.postgresql.storage import Storage +class SynthRelation(TypedDict): + prefix: Optional[str] + path: str + src: Sha1Git + dst: Sha1Git + rel_ts: float + + +class SynthRevision(TypedDict): + sha1: Sha1Git + date: float + msg: str + R_C: List[SynthRelation] + R_D: List[SynthRelation] + D_C: List[SynthRelation] + + +def synthetic_revision_content_result(filename: str) -> Iterator[SynthRevision]: + """Generates dict representations of synthetic revisions found in the synthetic + file (from the data/ directory) given as argument of the generator. 
+ + Generated SynthRevision (typed dict) with the following elements: + + "sha1": (Sha1Git) sha1 of the revision, + "date": (float) timestamp of the revision, + "msg": (str) commit message of the revision, + "R_C": (list) new R---C relations added by this revision + "R_D": (list) new R-D relations added by this revision + "D_C": (list) new D-C relations added by this revision + + Each relation above is a SynthRelation typed dict with: + + "path": (str) location + "src": (Sha1Git) sha1 of the source of the relation + "dst": (Sha1Git) sha1 of the destination of the relation + "rel_ts": (float) timestamp of the target of the relation + (related to the timestamp of the revision) + + """ + + with open(get_datafile(filename), "r") as fobj: + yield from _parse_synthetic_revision_content_file(fobj) + + +def _parse_synthetic_revision_content_file( + fobj: Iterable[str], +) -> Iterator[SynthRevision]: + """Read a 'synthetic' file and generate a dict representation of the synthetic + revision for each revision listed in the synthetic file. + """ + regs = [ + "(?P<revname>R[0-9]{2,4})?", + "(?P<reltype>[^| ]*)", + "([+] )?(?P<path>[^| +]*?)[/]?", + "(?P<type>[RDC]) (?P<sha1>[0-9a-f]{40})", + "(?P<ts>-?[0-9]+(.[0-9]+)?)", + ] + regex = re.compile("^ *" + r" *[|] *".join(regs) + r" *(#.*)?$") + current_rev: List[dict] = [] + for m in (regex.match(line) for line in fobj): + if m: + d = m.groupdict() + if d["revname"]: + if current_rev: + yield _mk_synth_rev(current_rev) + current_rev.clear() + current_rev.append(d) + if current_rev: + yield _mk_synth_rev(current_rev) + + +def _mk_synth_rev(synth_rev: List[Dict[str, str]]) -> SynthRevision: + assert synth_rev[0]["type"] == "R" + rev = SynthRevision( + sha1=hash_to_bytes(synth_rev[0]["sha1"]), + date=float(synth_rev[0]["ts"]), + msg=synth_rev[0]["revname"], + R_C=[], + R_D=[], + D_C=[], + ) + current_path = None + # path of the last R-D relation we parsed, used as a prefix for next D-C + # relations + + for row in synth_rev[1:]: + if row["reltype"] == "R---C": + assert row["type"] == "C" + rev["R_C"].append( + SynthRelation( + prefix=None, + path=row["path"], + src=rev["sha1"], + dst=hash_to_bytes(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + current_path = None + elif row["reltype"] == "R-D": + assert row["type"] == "D" + rev["R_D"].append( + SynthRelation( + prefix=None, + path=row["path"], + src=rev["sha1"], + dst=hash_to_bytes(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + current_path = row["path"] + elif row["reltype"] == "D-C": + assert row["type"] == "C" + rev["D_C"].append( + SynthRelation( + prefix=current_path, + path=row["path"], + src=rev["R_D"][-1]["dst"], + dst=hash_to_bytes(row["sha1"]), + rel_ts=float(row["ts"]), + ) + ) + return rev + + @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) -def test_provenance_heuristics( +def test_revision_content_result( provenance: ProvenanceInterface, swh_storage: Storage, archive: ArchiveInterface, repo: str, lower: bool, mindepth: int, ) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) revisions = {rev["id"]: rev for rev in data["revision"]} rows: Dict[str, Set[Any]] = { "content": set(), "content_in_directory": set(), "content_in_revision": set(), "directory": set(), "directory_in_revision": set(), "location": set(),
"revision": set(), } def maybe_path(path: str) -> Optional[bytes]: if provenance.storage.with_path(): return path.encode("utf-8") return None - for synth_rev in synthetic_result(syntheticfile): + for synth_rev in synthetic_revision_content_result(syntheticfile): revision = revisions[synth_rev["sha1"]] entry = RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) revision_add(provenance, archive, [entry], lower=lower, mindepth=mindepth) # each "entry" in the synth file is one new revision rows["revision"].add(synth_rev["sha1"]) assert rows["revision"] == provenance.storage.entity_get_all( EntityType.REVISION ), synth_rev["msg"] # check the timestamp of the revision rev_ts = synth_rev["date"] rev_data = provenance.storage.revision_get([synth_rev["sha1"]])[ synth_rev["sha1"] ] assert ( rev_data.date is not None and rev_ts == rev_data.date.timestamp() ), synth_rev["msg"] # this revision might have added new content objects rows["content"] |= set(x["dst"] for x in synth_rev["R_C"]) rows["content"] |= set(x["dst"] for x in synth_rev["D_C"]) assert rows["content"] == provenance.storage.entity_get_all( EntityType.CONTENT ), synth_rev["msg"] # check for R-C (direct) entries # these are added directly in the content_early_in_rev table rows["content_in_revision"] |= set( (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_C"] ) assert rows["content_in_revision"] == { (rel.src, rel.dst, rel.path) for rel in provenance.storage.relation_get_all( RelationType.CNT_EARLY_IN_REV ) }, synth_rev["msg"] # check timestamps for rc in synth_rev["R_C"]: assert ( rev_ts + rc["rel_ts"] == provenance.storage.content_get([rc["dst"]])[rc["dst"]].timestamp() ), synth_rev["msg"] # check directories # each directory stored in the provenance index is an entry # in the "directory" table... rows["directory"] |= set(x["dst"] for x in synth_rev["R_D"]) assert rows["directory"] == provenance.storage.entity_get_all( EntityType.DIRECTORY ), synth_rev["msg"] # ... + a number of rows in the "directory_in_rev" table... # check for R-D entries rows["directory_in_revision"] |= set( (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["R_D"] ) assert rows["directory_in_revision"] == { (rel.src, rel.dst, rel.path) for rel in provenance.storage.relation_get_all(RelationType.DIR_IN_REV) }, synth_rev["msg"] # check timestamps for rd in synth_rev["R_D"]: assert ( rev_ts + rd["rel_ts"] == provenance.storage.directory_get([rd["dst"]])[rd["dst"]].timestamp() ), synth_rev["msg"] # ... + a number of rows in the "content_in_dir" table # for content of the directory. 
# check for D-C entries rows["content_in_directory"] |= set( (x["dst"], x["src"], maybe_path(x["path"])) for x in synth_rev["D_C"] ) assert rows["content_in_directory"] == { (rel.src, rel.dst, rel.path) for rel in provenance.storage.relation_get_all(RelationType.CNT_IN_DIR) }, synth_rev["msg"] # check timestamps for dc in synth_rev["D_C"]: assert ( rev_ts + dc["rel_ts"] == provenance.storage.content_get([dc["dst"]])[dc["dst"]].timestamp() ), synth_rev["msg"] if provenance.storage.with_path(): # check for location entries rows["location"] |= set(x["path"] for x in synth_rev["R_C"]) rows["location"] |= set(x["path"] for x in synth_rev["D_C"]) rows["location"] |= set(x["path"] for x in synth_rev["R_D"]) assert rows["location"] == provenance.storage.location_get(), synth_rev[ "msg" ] @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) @pytest.mark.parametrize("batch", (True, False)) def test_provenance_heuristics_content_find_all( provenance: ProvenanceInterface, swh_storage: Storage, archive: ArchiveInterface, repo: str, lower: bool, mindepth: int, batch: bool, ) -> None: # read data/README.md for more details on how these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) revisions = [ RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) for revision in data["revision"] ] def maybe_path(path: str) -> str: if provenance.storage.with_path(): return path return "" if batch: revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) else: for revision in revisions: revision_add( provenance, archive, [revision], lower=lower, mindepth=mindepth ) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) expected_occurrences: Dict[str, List[Tuple[str, float, Optional[str], str]]] = {} - for synth_rev in synthetic_result(syntheticfile): + for synth_rev in synthetic_revision_content_result(syntheticfile): rev_id = synth_rev["sha1"].hex() rev_ts = synth_rev["date"] for rc in synth_rev["R_C"]: expected_occurrences.setdefault(rc["dst"].hex(), []).append( (rev_id, rev_ts, None, maybe_path(rc["path"])) ) for dc in synth_rev["D_C"]: assert dc["prefix"] is not None # to please mypy expected_occurrences.setdefault(dc["dst"].hex(), []).append( (rev_id, rev_ts, None, maybe_path(dc["prefix"] + "/" + dc["path"])) ) for content_id, results in expected_occurrences.items(): expected = [(content_id, *result) for result in results] db_occurrences = [ ( occur.content.hex(), occur.revision.hex(), occur.date.timestamp(), occur.origin, occur.path.decode(), ) for occur in provenance.content_find_all(hash_to_bytes(content_id)) ] if provenance.storage.with_path(): # this is not true if the db stores no path, because the same content # that appears several times in a given revision may be reported # only once by content_find_all() assert len(db_occurrences) == len(expected) assert set(db_occurrences) == set(expected) @pytest.mark.parametrize( "repo, lower, mindepth", ( ("cmdbts2", True, 1), ("cmdbts2", False, 1), ("cmdbts2", True, 2), ("cmdbts2", False, 2), ("out-of-order", True, 1), ), ) @pytest.mark.parametrize("batch", (True, False)) def test_provenance_heuristics_content_find_first( provenance: ProvenanceInterface, swh_storage: Storage, archive: ArchiveInterface, repo: str, lower: bool, mindepth: int, batch: bool, ) -> None: # read data/README.md for more details on how
these datasets are generated data = load_repo_data(repo) fill_storage(swh_storage, data) revisions = [ RevisionEntry( id=revision["id"], date=ts2dt(revision["date"]), root=revision["directory"], ) for revision in data["revision"] ] if batch: revision_add(provenance, archive, revisions, lower=lower, mindepth=mindepth) else: for revision in revisions: revision_add( provenance, archive, [revision], lower=lower, mindepth=mindepth ) syntheticfile = get_datafile( f"synthetic_{repo}_{'lower' if lower else 'upper'}_{mindepth}.txt" ) expected_first: Dict[str, Tuple[str, float, List[str]]] = {} # dict mapping a blob sha1 to a tuple (rev_id, rev_ts, [path, ...]); the # third element is a list because a content can be added at several places # in a single revision, in which case the result of content_find_first() # is one of those paths, but we have no guarantee which one it will return. - for synth_rev in synthetic_result(syntheticfile): + for synth_rev in synthetic_revision_content_result(syntheticfile): rev_id = synth_rev["sha1"].hex() rev_ts = synth_rev["date"] for rc in synth_rev["R_C"]: sha1 = rc["dst"].hex() if sha1 not in expected_first: assert rc["rel_ts"] == 0 expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) else: if rev_ts == expected_first[sha1][1]: expected_first[sha1][2].append(rc["path"]) elif rev_ts < expected_first[sha1][1]: expected_first[sha1] = (rev_id, rev_ts, [rc["path"]]) for dc in synth_rev["D_C"]: sha1 = dc["dst"].hex() assert sha1 in expected_first # nothing to do there, this content cannot be a "first seen file" for content_id, (rev_id, ts, paths) in expected_first.items(): occur = provenance.content_find_first(hash_to_bytes(content_id)) assert occur is not None assert occur.content.hex() == content_id assert occur.revision.hex() == rev_id assert occur.date.timestamp() == ts assert occur.origin is None if provenance.storage.with_path(): assert occur.path.decode() in paths
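The dataset-regeneration workflow described in data/README.md can also be scripted end to end. A hedged sketch, assuming it is run from the data/ directory where the two generator tools and the `repo2_repo.yaml` example live:

```python
# Sketch: regenerate the `repo2` dataset following the README workflow.
import subprocess

# 1. build the git repository and capture the synthetic-file template
with open("synthetic_repo2_template.txt", "w") as template:
    subprocess.run(
        ["python", "generate_repo.py", "repo2_repo.yaml", "repo2"],
        stdout=template,
        check=True,
    )

# 2. dump the resulting git objects as a msgpack storage snapshot
subprocess.run(
    ["python", "generate_storage_from_git.py", "-o", "repo2.msgpack", "repo2"],
    check=True,
)
```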