diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
index 066a462..81d8666 100644
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,617 +1,647 @@
-# Copyright (C) 2016-2021  The Software Heritage developers
+# Copyright (C) 2016-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import abc
 from contextlib import contextmanager
 import logging
 import os
 import shutil
 import tempfile
 from typing import (
     Any,
     Dict,
     Generic,
     Iterable,
     Iterator,
     List,
     Optional,
     Set,
     TypeVar,
     Union,
 )
 import warnings
 
 import sentry_sdk
+from typing_extensions import TypedDict
 
 from swh.core import utils
 from swh.core.config import load_from_envvar, merge_configs
 from swh.indexer.storage import INDEXER_CFG_KEY, Sha1, get_indexer_storage
 from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.model import hashutil
-from swh.model.model import Revision, Sha1Git
+from swh.model.model import Origin, Revision, Sha1Git
 from swh.objstorage.exc import ObjNotFoundError
 from swh.objstorage.factory import get_objstorage
 from swh.scheduler import CONFIG as SWH_CONFIG
 from swh.storage import get_storage
 from swh.storage.interface import StorageInterface
 
 
+class ObjectsDict(TypedDict, total=False):
+    revision: List[Dict]
+    origin: List[Dict]
+    origin_visit_status: List[Dict]
+
+
 @contextmanager
 def write_to_temp(filename: str, data: bytes, working_directory: str) -> Iterator[str]:
     """Write the sha1's content in a temporary file.
 
     Args:
         filename: one of sha1's many filenames
         data: the sha1's content to write in temporary
           file
         working_directory: the directory into which the
           file is written
 
     Returns:
         The path to the temporary file created. That file is
         filled in with the raw content's data.
 
     """
     os.makedirs(working_directory, exist_ok=True)
     temp_dir = tempfile.mkdtemp(dir=working_directory)
     content_path = os.path.join(temp_dir, filename)
 
     with open(content_path, "wb") as f:
         f.write(data)
 
     yield content_path
     shutil.rmtree(temp_dir)
 
 
 DEFAULT_CONFIG = {
     INDEXER_CFG_KEY: {"cls": "memory"},
     "storage": {"cls": "memory"},
     "objstorage": {"cls": "memory"},
 }
 
 
 TId = TypeVar("TId")
 """type of the ids of index()ed objects."""
 TData = TypeVar("TData")
 """type of the objects passed to index()."""
 TResult = TypeVar("TResult")
 """return type of index()"""
 
 
 class BaseIndexer(Generic[TId, TData, TResult], metaclass=abc.ABCMeta):
     """Base class for indexers to inherit from.
 
     The main entry point is the :func:`run` function which is in
     charge of triggering the computations on the batch dict/ids
     received.
 
     Indexers can:
 
     - filter out ids whose data has already been indexed.
     - retrieve ids data from storage or objstorage
     - index this data depending on the object and store the result in
       storage.
 
     To implement a new object type indexer, inherit from the
     BaseIndexer and implement indexing:
 
     :meth:`~BaseIndexer.run`:
       object_ids are different depending on object. For example: sha1 for
       content, sha1_git for revision, directory, release, and id for origin
 
     To implement a new concrete indexer, inherit from the object level
     classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
     :class:`OriginIndexer`.
 
     Then you need to implement the following functions:
 
     :meth:`~BaseIndexer.filter`:
       filter out data already indexed (in storage).
 
     :meth:`~BaseIndexer.index_object`:
       compute index on id with data (retrieved from the storage or the
       objstorage by the id key) and return the resulting index computation.
 
     :meth:`~BaseIndexer.persist_index_computations`:
       persist the results of multiple index computations in the storage.
 
     The new indexer implementation can also override the following functions:
 
     :meth:`~BaseIndexer.prepare`:
       Configuration preparation for the indexer.  When overriding, this must
       call the `super().prepare()` instruction.
 
     :meth:`~BaseIndexer.check`:
       Configuration check for the indexer.  When overriding, this must call the
       `super().check()` instruction.
 
     :meth:`~BaseIndexer.register_tools`:
       This should return a dict of the tool(s) to use when indexing or
       filtering.
 
     """
 
     results: List[TResult]
 
     USE_TOOLS = True
 
     catch_exceptions = True
     """Prevents exceptions in `index()` from raising too high. Set to False
     in tests to properly catch all exceptions."""
 
     scheduler: Any
     storage: StorageInterface
     objstorage: Any
     idx_storage: IndexerStorageInterface
 
     def __init__(self, config=None, **kw) -> None:
         """Prepare and check that the indexer is ready to run."""
         super().__init__()
         if config is not None:
             self.config = config
         elif SWH_CONFIG:
             self.config = SWH_CONFIG.copy()
         else:
             self.config = load_from_envvar()
         self.config = merge_configs(DEFAULT_CONFIG, self.config)
         self.prepare()
         self.check()
         self.log.debug("%s: config=%s", self, self.config)
 
     def prepare(self) -> None:
         """Prepare the indexer's needed runtime configuration.
         Without this step, the indexer cannot possibly run.
 
         """
         config_storage = self.config.get("storage")
         if config_storage:
             self.storage = get_storage(**config_storage)
 
         self.objstorage = get_objstorage(**self.config["objstorage"])
 
         idx_storage = self.config[INDEXER_CFG_KEY]
         self.idx_storage = get_indexer_storage(**idx_storage)
 
         _log = logging.getLogger("requests.packages.urllib3.connectionpool")
         _log.setLevel(logging.WARN)
         self.log = logging.getLogger("swh.indexer")
 
         if self.USE_TOOLS:
             self.tools = list(self.register_tools(self.config.get("tools", [])))
         self.results = []
 
     @property
     def tool(self) -> Dict:
         return self.tools[0]
 
     def check(self) -> None:
         """Check the indexer's configuration is ok before proceeding.
         If ok, does nothing. If not raise error.
 
         """
         if self.USE_TOOLS and not self.tools:
             raise ValueError("Tools %s is unknown, cannot continue" % self.tools)
 
     def _prepare_tool(self, tool: Dict[str, Any]) -> Dict[str, Any]:
         """Prepare the tool dict to be compliant with the storage api."""
         return {"tool_%s" % key: value for key, value in tool.items()}
 
     def register_tools(
         self, tools: Union[Dict[str, Any], List[Dict[str, Any]]]
     ) -> List[Dict[str, Any]]:
         """Permit to register tools to the storage.
 
            Add a sensible default which can be overridden if not
            sufficient.  (For now, all indexers use only one tool)
 
            Expects the self.config['tools'] property to be set with
            one or more tools.
 
         Args:
             tools: Either a dict or a list of dict.
 
         Returns:
             list: List of dicts with additional id key.
 
         Raises:
             ValueError: if not a list nor a dict.
 
         """
         if isinstance(tools, list):
             tools = list(map(self._prepare_tool, tools))
         elif isinstance(tools, dict):
             tools = [self._prepare_tool(tools)]
         else:
             raise ValueError("Configuration tool(s) must be a dict or list!")
 
         if tools:
             return self.idx_storage.indexer_configuration_add(tools)
         else:
             return []
 
     def index(self, id: TId, data: Optional[TData], **kwargs) -> List[TResult]:
         """Index computation for the id and associated raw data.
 
         Args:
             id: identifier or Dict object
             data: id's data from storage or objstorage depending on
               object type
 
         Returns:
             dict: a dict that makes sense for the
             :meth:`.persist_index_computations` method.
 
         """
         raise NotImplementedError()
 
     def filter(self, ids: List[TId]) -> Iterator[TId]:
         """Filter missing ids for that particular indexer.
 
         Args:
             ids: list of ids
 
         Yields:
             iterator of missing ids
 
         """
         yield from ids
 
     @abc.abstractmethod
     def persist_index_computations(self, results: List[TResult]) -> Dict[str, int]:
         """Persist the computation resulting from the index.
 
         Args:
 
             results: List of results. One result is the
               result of the index function.
 
         Returns:
             a summary dict of what has been inserted in the storage
 
         """
         return {}
 
 
 class ContentIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]):
     """A content indexer working on a list of ids directly.
 
     To work on indexer partition, use the :class:`ContentPartitionIndexer`
     instead.
 
     Note: :class:`ContentIndexer` is not an instantiable object. To
     use it, one should inherit from this class and override the
     methods mentioned in the :class:`BaseIndexer` class.
 
     """
 
     def run(self, ids: List[Sha1], **kwargs) -> Dict:
         """Given a list of ids:
 
         - retrieve the content from the storage
         - execute the indexing computations
         - store the results
 
         Args:
             ids (Iterable[Union[bytes, str]]): sha1's identifier list
             **kwargs: passed to the `index` method
 
         Returns:
             A summary Dict of the task's status
 
         """
         if "policy_update" in kwargs:
             warnings.warn(
                 "'policy_update' argument is deprecated and ignored.",
                 DeprecationWarning,
             )
             del kwargs["policy_update"]
 
         sha1s = [
             hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
         ]
         results = []
         summary: Dict = {"status": "uneventful"}
         try:
             for sha1 in sha1s:
                 try:
                     raw_content = self.objstorage.get(sha1)
                 except ObjNotFoundError:
                     self.log.warning(
                         "Content %s not found in objstorage"
                         % hashutil.hash_to_hex(sha1)
                     )
                     continue
                 res = self.index(sha1, raw_content, **kwargs)
                 if res:  # If no results, skip it
                     results.extend(res)
                     summary["status"] = "eventful"
             summary = self.persist_index_computations(results)
             self.results = results
         except Exception:
             if not self.catch_exceptions:
                 raise
             self.log.exception("Problem when reading contents metadata.")
             sentry_sdk.capture_exception()
             summary["status"] = "failed"
         return summary
 
 
 class ContentPartitionIndexer(BaseIndexer[Sha1, bytes, TResult], Generic[TResult]):
     """A content partition indexer.
 
     This expects as input a partition_id and a nb_partitions. This will then index the
     contents within that partition.
 
     To work on a list of ids, use the :class:`ContentIndexer` instead.
 
     Note: :class:`ContentPartitionIndexer` is not an instantiable
     object. To use it, one should inherit from this class and override
     the methods mentioned in the :class:`BaseIndexer` class.
 
     """
 
     @abc.abstractmethod
     def indexed_contents_in_partition(
         self, partition_id: int, nb_partitions: int
     ) -> Iterable[Sha1]:
         """Retrieve indexed contents within range [start, end].
 
         Args:
             partition_id: Index of the partition to fetch
             nb_partitions: Total number of partitions to split into
             page_token: opaque token used for pagination
 
         """
         pass
 
     def _list_contents_to_index(
         self, partition_id: int, nb_partitions: int, indexed: Set[Sha1]
     ) -> Iterable[Sha1]:
         """Compute from storage the new contents to index in the partition_id . The already
            indexed contents are skipped.
 
         Args:
             partition_id: Index of the partition to fetch data from
             nb_partitions: Total number of partition
             indexed: Set of content already indexed.
 
         Yields:
             Sha1 id (bytes) of contents to index
 
         """
         if not isinstance(partition_id, int) or not isinstance(nb_partitions, int):
             raise TypeError(
                 f"identifiers must be int, not {partition_id!r} and {nb_partitions!r}."
             )
         next_page_token = None
         while True:
             result = self.storage.content_get_partition(
                 partition_id, nb_partitions, page_token=next_page_token
             )
             contents = result.results
             for c in contents:
                 _id = hashutil.hash_to_bytes(c.sha1)
                 if _id in indexed:
                     continue
                 yield _id
             next_page_token = result.next_page_token
             if next_page_token is None:
                 break
 
     def _index_contents(
         self, partition_id: int, nb_partitions: int, indexed: Set[Sha1], **kwargs: Any
     ) -> Iterator[TResult]:
         """Index the contents within the partition_id.
 
         Args:
             start: Starting bound from range identifier
             end: End range identifier
             indexed: Set of content already indexed.
 
         Yields:
             indexing result as dict to persist in the indexer backend
 
         """
         for sha1 in self._list_contents_to_index(partition_id, nb_partitions, indexed):
             try:
                 raw_content = self.objstorage.get(sha1)
             except ObjNotFoundError:
                 self.log.warning(f"Content {sha1.hex()} not found in objstorage")
                 continue
             yield from self.index(sha1, raw_content, **kwargs)
 
     def _index_with_skipping_already_done(
         self, partition_id: int, nb_partitions: int
     ) -> Iterator[TResult]:
         """Index not already indexed contents within the partition partition_id
 
         Args:
             partition_id: Index of the partition to fetch
             nb_partitions: Total number of partitions to split into
 
         Yields:
            indexing result as dict to persist in the indexer backend
 
         """
         already_indexed_contents = set(
             self.indexed_contents_in_partition(partition_id, nb_partitions)
         )
 
         return self._index_contents(
             partition_id, nb_partitions, already_indexed_contents
         )
 
     def run(
         self,
         partition_id: int,
         nb_partitions: int,
         skip_existing: bool = True,
         **kwargs,
     ) -> Dict:
         """Given a partition of content ids, index the contents within.
 
            Either the indexer is incremental (filter out existing computed data) or it
            computes everything from scratch.
 
         Args:
             partition_id: Index of the partition to fetch
             nb_partitions: Total number of partitions to split into
             skip_existing: Skip existing indexed data
                 (default) or not
             **kwargs: passed to the `index` method
 
         Returns:
             dict with the indexing task status
 
         """
         summary: Dict[str, Any] = {"status": "uneventful"}
         count = 0
         try:
             if skip_existing:
                 gen = self._index_with_skipping_already_done(
                     partition_id, nb_partitions
                 )
             else:
                 gen = self._index_contents(partition_id, nb_partitions, indexed=set([]))
 
             count_object_added_key: Optional[str] = None
 
             for contents in utils.grouper(gen, n=self.config["write_batch_size"]):
                 res = self.persist_index_computations(list(contents))
                 if not count_object_added_key:
                     count_object_added_key = list(res.keys())[0]
                 count += res[count_object_added_key]
                 if count > 0:
                     summary["status"] = "eventful"
         except Exception:
             if not self.catch_exceptions:
                 raise
             self.log.exception("Problem when computing metadata.")
             sentry_sdk.capture_exception()
             summary["status"] = "failed"
 
         if count > 0 and count_object_added_key:
             summary[count_object_added_key] = count
         return summary
 
 
 class OriginIndexer(BaseIndexer[str, None, TResult], Generic[TResult]):
     """An object type indexer, inherits from the :class:`BaseIndexer` and
     implements Origin indexing using the run method
 
     Note: the :class:`OriginIndexer` is not an instantiable object.
     To use it in another context one should inherit from this class
     and override the methods mentioned in the :class:`BaseIndexer`
     class.
 
     """
 
     def run(self, origin_urls: List[str], **kwargs) -> Dict:
         """Given a list of origin urls:
 
         - retrieve origins from storage
         - execute the indexing computations
         - store the results
 
         Args:
             origin_urls: list of origin urls.
             **kwargs: passed to the `index` method
 
         """
         if "policy_update" in kwargs:
             warnings.warn(
                 "'policy_update' argument is deprecated and ignored.",
                 DeprecationWarning,
             )
             del kwargs["policy_update"]
+
+        origins = [{"url": url} for url in origin_urls]
+
+        return self.process_journal_objects({"origin": origins})
+
+    def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+        """Worker function for ``JournalClient``. Expects ``objects`` to have a single
+        key, either ``origin`` or ``"origin_visit_status"``."""
+        assert set(objects) == {"origin"}
+
+        origins = [Origin(url=origin["url"]) for origin in objects.get("origin", [])]
+
         summary: Dict[str, Any] = {"status": "uneventful"}
         try:
-            results = self.index_list(origin_urls, **kwargs)
+            results = self.index_list(origins)
         except Exception:
             if not self.catch_exceptions:
                 raise
             summary["status"] = "failed"
             return summary
 
         summary_persist = self.persist_index_computations(results)
         self.results = results
         if summary_persist:
             for value in summary_persist.values():
                 if value > 0:
                     summary["status"] = "eventful"
             summary.update(summary_persist)
         return summary
 
-    def index_list(self, origin_urls: List[str], **kwargs) -> List[TResult]:
+    def index_list(self, origins: List[Origin], **kwargs) -> List[TResult]:
         results = []
-        for origin_url in origin_urls:
+        for origin in origins:
             try:
-                results.extend(self.index(origin_url, **kwargs))
+                results.extend(self.index(origin.url, **kwargs))
             except Exception:
-                self.log.exception("Problem when processing origin %s", origin_url)
+                self.log.exception("Problem when processing origin %s", origin.url)
                 sentry_sdk.capture_exception()
                 raise
         return results
 
 
 class RevisionIndexer(BaseIndexer[Sha1Git, Revision, TResult], Generic[TResult]):
     """An object type indexer, inherits from the :class:`BaseIndexer` and
     implements Revision indexing using the run method
 
     Note: the :class:`RevisionIndexer` is not an instantiable object.
     To use it in another context one should inherit from this class
     and override the methods mentioned in the :class:`BaseIndexer`
     class.
 
     """
 
     def run(self, ids: List[Sha1Git], **kwargs) -> Dict:
         """Given a list of sha1_gits:
 
         - retrieve revisions from storage
         - execute the indexing computations
         - store the results
 
         Args:
             ids: sha1_git's identifier list
 
         """
         if "policy_update" in kwargs:
             warnings.warn(
                 "'policy_update' argument is deprecated and ignored.",
                 DeprecationWarning,
             )
             del kwargs["policy_update"]
-        summary: Dict[str, Any] = {"status": "uneventful"}
-        results = []
 
         revision_ids = [
             hashutil.hash_to_bytes(id_) if isinstance(id_, str) else id_ for id_ in ids
         ]
+        revisions = []
         for (rev_id, rev) in zip(revision_ids, self.storage.revision_get(revision_ids)):
             if not rev:
                 # TODO: call self.index() with rev=None?
                 self.log.warning(
                     "Revision %s not found in storage", hashutil.hash_to_hex(rev_id)
                 )
                 continue
+            revisions.append(rev.to_dict())
+
+        return self.process_journal_objects({"revision": revisions})
+
+    def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+        """Worker function for ``JournalClient``. Expects ``objects`` to have a single
+        key, ``"revision"``."""
+        assert set(objects) == {"revision"}
+
+        summary: Dict[str, Any] = {"status": "uneventful"}
+        results = []
+
+        for rev in objects["revision"]:
             try:
-                results.extend(self.index(rev_id, rev))
+                results.extend(self.index(rev["id"], Revision.from_dict(rev)))
             except Exception:
                 if not self.catch_exceptions:
                     raise
                 self.log.exception("Problem when processing revision")
                 sentry_sdk.capture_exception()
                 summary["status"] = "failed"
-                return summary
 
         summary_persist = self.persist_index_computations(results)
         if summary_persist:
             for value in summary_persist.values():
                 if value > 0:
                     summary["status"] = "eventful"
             summary.update(summary_persist)
         self.results = results
         return summary
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 6fc3705..b3d7717 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,397 +1,401 @@
-# Copyright (C) 2017-2021 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from copy import deepcopy
 from typing import (
     Any,
     Callable,
     Dict,
     Iterable,
     Iterator,
     List,
     Optional,
     Tuple,
     TypeVar,
 )
 
 import sentry_sdk
 
 from swh.core.config import merge_configs
 from swh.core.utils import grouper
 from swh.indexer.codemeta import merge_documents
 from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
 from swh.indexer.storage.model import (
     ContentMetadataRow,
     OriginIntrinsicMetadataRow,
     RevisionIntrinsicMetadataRow,
 )
 from swh.model import hashutil
-from swh.model.model import Revision, Sha1Git
+from swh.model.model import Origin, Revision, Sha1Git
 
 REVISION_GET_BATCH_SIZE = 10
 ORIGIN_GET_BATCH_SIZE = 10
 
 
 T1 = TypeVar("T1")
 T2 = TypeVar("T2")
 
 
 def call_with_batches(
     f: Callable[[List[T1]], Iterable[T2]],
     args: List[T1],
     batch_size: int,
 ) -> Iterator[T2]:
     """Calls a function with batches of args, and concatenates the results."""
     groups = grouper(args, batch_size)
     for group in groups:
         yield from f(list(group))
 
 
 class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
     """Content-level indexer
 
     This indexer is in charge of:
 
     - filtering out content already indexed in content_metadata
     - reading content from objstorage with the content's id sha1
     - computing metadata by given context
     - using the metadata_dictionary as the 'swh-metadata-translator' tool
     - store result in content_metadata table
 
     """
 
     def filter(self, ids):
         """Filter out known sha1s and return only missing ones."""
         yield from self.idx_storage.content_metadata_missing(
             (
                 {
                     "id": sha1,
                     "indexer_configuration_id": self.tool["id"],
                 }
                 for sha1 in ids
             )
         )
 
     def index(
         self,
         id: Sha1,
         data: Optional[bytes] = None,
         log_suffix="unknown revision",
         **kwargs,
     ) -> List[ContentMetadataRow]:
         """Index sha1s' content and store result.
 
         Args:
             id: content's identifier
             data: raw content in bytes
 
         Returns:
             dict: dictionary representing a content_metadata. If the
             translation wasn't successful the metadata keys will
             be returned as None
 
         """
         assert isinstance(id, bytes)
         assert data is not None
         metadata = None
         try:
             mapping_name = self.tool["tool_configuration"]["context"]
             log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id)
             metadata = MAPPINGS[mapping_name](log_suffix).translate(data)
         except Exception:
             self.log.exception(
                 "Problem during metadata translation "
                 "for content %s" % hashutil.hash_to_hex(id)
             )
             sentry_sdk.capture_exception()
         if metadata is None:
             return []
         return [
             ContentMetadataRow(
                 id=id,
                 indexer_configuration_id=self.tool["id"],
                 metadata=metadata,
             )
         ]
 
     def persist_index_computations(
         self, results: List[ContentMetadataRow]
     ) -> Dict[str, int]:
         """Persist the results in storage.
 
         Args:
             results: list of content_metadata, dict with the
               following keys:
               - id (bytes): content's identifier (sha1)
               - metadata (jsonb): detected metadata
 
         """
         return self.idx_storage.content_metadata_add(results)
 
 
 DEFAULT_CONFIG: Dict[str, Any] = {
     "tools": {
         "name": "swh-metadata-detector",
         "version": "0.0.2",
         "configuration": {},
     },
 }
 
 
 class RevisionMetadataIndexer(RevisionIndexer[RevisionIntrinsicMetadataRow]):
     """Revision-level indexer
 
     This indexer is in charge of:
 
     - filtering revisions already indexed in revision_intrinsic_metadata table
       with defined computation tool
     - retrieve all entry_files in root directory
     - use metadata_detector for file_names containing metadata
     - compute metadata translation if necessary and possible (depends on tool)
     - send sha1s to content indexing if possible
     - store the results for revision
 
     """
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.config = merge_configs(DEFAULT_CONFIG, self.config)
 
     def filter(self, sha1_gits):
         """Filter out known sha1s and return only missing ones."""
         yield from self.idx_storage.revision_intrinsic_metadata_missing(
             (
                 {
                     "id": sha1_git,
                     "indexer_configuration_id": self.tool["id"],
                 }
                 for sha1_git in sha1_gits
             )
         )
 
     def index(
         self, id: Sha1Git, data: Optional[Revision], **kwargs
     ) -> List[RevisionIntrinsicMetadataRow]:
         """Index rev by processing it and organizing result.
 
         use metadata_detector to iterate on filenames
 
         - if one filename detected -> sends file to content indexer
         - if multiple file detected -> translation needed at revision level
 
         Args:
           id: sha1_git of the revision
           data: revision model object from storage
 
         Returns:
             dict: dictionary representing a revision_intrinsic_metadata, with
             keys:
 
             - id (str): rev's identifier (sha1_git)
             - indexer_configuration_id (bytes): tool used
             - metadata: dict of retrieved metadata
 
         """
         rev = data
         assert isinstance(rev, Revision)
 
         try:
             root_dir = rev.directory
             dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
             if [entry["type"] for entry in dir_ls] == ["dir"]:
                 # If the root is just a single directory, recurse into it
                 # eg. PyPI packages, GNU tarballs
                 subdir = dir_ls[0]["target"]
                 dir_ls = list(self.storage.directory_ls(subdir, recursive=False))
             files = [entry for entry in dir_ls if entry["type"] == "file"]
             detected_files = detect_metadata(files)
             (mappings, metadata) = self.translate_revision_intrinsic_metadata(
                 detected_files,
                 log_suffix="revision=%s" % hashutil.hash_to_hex(rev.id),
             )
         except Exception as e:
             self.log.exception("Problem when indexing rev: %r", e)
             sentry_sdk.capture_exception()
         return [
             RevisionIntrinsicMetadataRow(
                 id=rev.id,
                 indexer_configuration_id=self.tool["id"],
                 mappings=mappings,
                 metadata=metadata,
             )
         ]
 
     def persist_index_computations(
         self, results: List[RevisionIntrinsicMetadataRow]
     ) -> Dict[str, int]:
         """Persist the results in storage.
 
         Args:
             results: list of content_mimetype, dict with the
               following keys:
               - id (bytes): content's identifier (sha1)
               - mimetype (bytes): mimetype in bytes
               - encoding (bytes): encoding in bytes
 
         """
         # TODO: add functions in storage to keep data in
         # revision_intrinsic_metadata
         return self.idx_storage.revision_intrinsic_metadata_add(results)
 
     def translate_revision_intrinsic_metadata(
         self, detected_files: Dict[str, List[Any]], log_suffix: str
     ) -> Tuple[List[Any], Any]:
         """
         Determine plan of action to translate metadata when containing
         one or multiple detected files:
 
         Args:
             detected_files: dictionary mapping context names (e.g.,
               "npm", "authors") to list of sha1
 
         Returns:
             (List[str], dict): list of mappings used and dict with
             translated metadata according to the CodeMeta vocabulary
 
         """
         used_mappings = [MAPPINGS[context].name for context in detected_files]
         metadata = []
         tool = {
             "name": "swh-metadata-translator",
             "version": "0.0.2",
             "configuration": {},
         }
         # TODO: iterate on each context, on each file
         # -> get raw_contents
         # -> translate each content
         config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]}
         config["tools"] = [tool]
         for context in detected_files.keys():
             cfg = deepcopy(config)
             cfg["tools"][0]["configuration"]["context"] = context
             c_metadata_indexer = ContentMetadataIndexer(config=cfg)
             # sha1s that are in content_metadata table
             sha1s_in_storage = []
             metadata_generator = self.idx_storage.content_metadata_get(
                 detected_files[context]
             )
             for c in metadata_generator:
                 # extracting metadata
                 sha1 = c.id
                 sha1s_in_storage.append(sha1)
                 local_metadata = c.metadata
                 # local metadata is aggregated
                 if local_metadata:
                     metadata.append(local_metadata)
 
             sha1s_filtered = [
                 item for item in detected_files[context] if item not in sha1s_in_storage
             ]
 
             if sha1s_filtered:
                 # content indexing
                 try:
                     c_metadata_indexer.run(
                         sha1s_filtered,
                         log_suffix=log_suffix,
                     )
                     # on the fly possibility:
                     for result in c_metadata_indexer.results:
                         local_metadata = result.metadata
                         metadata.append(local_metadata)
 
                 except Exception:
                     self.log.exception("Exception while indexing metadata on contents")
                     sentry_sdk.capture_exception()
 
         metadata = merge_documents(metadata)
         return (used_mappings, metadata)
 
 
 class OriginMetadataIndexer(
     OriginIndexer[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]
 ):
     USE_TOOLS = False
 
     def __init__(self, config=None, **kwargs) -> None:
         super().__init__(config=config, **kwargs)
         self.origin_head_indexer = OriginHeadIndexer(config=config)
         self.revision_metadata_indexer = RevisionMetadataIndexer(config=config)
 
     def index_list(
-        self, origin_urls: List[str], **kwargs
+        self, origins: List[Origin], **kwargs
     ) -> List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]]:
         head_rev_ids = []
         origins_with_head = []
-        origins = list(
+
+        # Filter out origins not in the storage
+        known_origins = list(
             call_with_batches(
                 self.storage.origin_get,
-                origin_urls,
+                [origin.url for origin in origins],
                 ORIGIN_GET_BATCH_SIZE,
             )
         )
-        for origin in origins:
+
+        for origin in known_origins:
             if origin is None:
                 continue
             head_results = self.origin_head_indexer.index(origin.url)
             if head_results:
                 (head_result,) = head_results
                 origins_with_head.append(origin)
                 head_rev_ids.append(head_result["revision_id"])
 
         head_revs = list(
             call_with_batches(
                 self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
             )
         )
         assert len(head_revs) == len(head_rev_ids)
 
         results = []
         for (origin, rev) in zip(origins_with_head, head_revs):
             if not rev:
                 self.log.warning("Missing head revision of origin %r", origin.url)
                 continue
 
             for rev_metadata in self.revision_metadata_indexer.index(rev.id, rev):
                 # There is at most one rev_metadata
                 orig_metadata = OriginIntrinsicMetadataRow(
                     from_revision=rev_metadata.id,
                     id=origin.url,
                     metadata=rev_metadata.metadata,
                     mappings=rev_metadata.mappings,
                     indexer_configuration_id=rev_metadata.indexer_configuration_id,
                 )
                 results.append((orig_metadata, rev_metadata))
+
         return results
 
     def persist_index_computations(
         self,
         results: List[Tuple[OriginIntrinsicMetadataRow, RevisionIntrinsicMetadataRow]],
     ) -> Dict[str, int]:
         # Deduplicate revisions
         rev_metadata: List[RevisionIntrinsicMetadataRow] = []
         orig_metadata: List[OriginIntrinsicMetadataRow] = []
         summary: Dict = {}
         for (orig_item, rev_item) in results:
             assert rev_item.metadata == orig_item.metadata
             if rev_item.metadata and not (rev_item.metadata.keys() <= {"@context"}):
                 # Only store non-empty metadata sets
                 if rev_item not in rev_metadata:
                     rev_metadata.append(rev_item)
                 if orig_item not in orig_metadata:
                     orig_metadata.append(orig_item)
 
         if rev_metadata:
             summary_rev = self.idx_storage.revision_intrinsic_metadata_add(rev_metadata)
             summary.update(summary_rev)
         if orig_metadata:
             summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
             summary.update(summary_ori)
 
         return summary
diff --git a/swh/indexer/tests/test_indexer.py b/swh/indexer/tests/test_indexer.py
index 767c63f..85d1a62 100644
--- a/swh/indexer/tests/test_indexer.py
+++ b/swh/indexer/tests/test_indexer.py
@@ -1,155 +1,155 @@
 # Copyright (C) 2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from typing import Any, Dict, Iterable, List, Optional
 from unittest.mock import Mock
 
 import pytest
 
 from swh.indexer.indexer import (
     ContentIndexer,
     ContentPartitionIndexer,
     OriginIndexer,
     RevisionIndexer,
 )
 from swh.indexer.storage import PagedResult, Sha1
 from swh.model.model import Content
 
-from .utils import BASE_TEST_CONFIG
+from .utils import BASE_TEST_CONFIG, REVISION
 
 
 class _TestException(Exception):
     pass
 
 
 class CrashingIndexerMixin:
     USE_TOOLS = False
 
     def index(
         self, id: Any, data: Optional[Any] = None, **kwargs
     ) -> List[Dict[str, Any]]:
         raise _TestException()
 
     def persist_index_computations(self, results) -> Dict[str, int]:
         return {}
 
     def indexed_contents_in_partition(
         self, partition_id: int, nb_partitions: int
     ) -> Iterable[Sha1]:
         raise _TestException()
 
 
 class CrashingContentIndexer(CrashingIndexerMixin, ContentIndexer):
     pass
 
 
 class CrashingContentPartitionIndexer(CrashingIndexerMixin, ContentPartitionIndexer):
     pass
 
 
 class CrashingRevisionIndexer(CrashingIndexerMixin, RevisionIndexer):
     pass
 
 
 class CrashingOriginIndexer(CrashingIndexerMixin, OriginIndexer):
     pass
 
 
 class TrivialContentPartitionIndexer(ContentPartitionIndexer[str]):
     USE_TOOLS = False
 
     def index(self, id: bytes, data: Optional[bytes], **kwargs) -> List[str]:
         return ["indexed " + id.decode()]
 
     def indexed_contents_in_partition(
         self, partition_id: int, nb_partitions: int
     ) -> Iterable[Sha1]:
         return iter([b"excluded hash", b"other excluded hash"])
 
     def persist_index_computations(self, results: List[str]) -> Dict[str, int]:
         self._results.append(results)  # type: ignore
         return {"nb_added": len(results)}
 
 
 def test_content_indexer_catch_exceptions():
     indexer = CrashingContentIndexer(config=BASE_TEST_CONFIG)
     indexer.objstorage = Mock()
     indexer.objstorage.get.return_value = b"content"
 
     assert indexer.run([b"foo"]) == {"status": "failed"}
 
     indexer.catch_exceptions = False
 
     with pytest.raises(_TestException):
         indexer.run([b"foo"])
 
 
 def test_revision_indexer_catch_exceptions():
     indexer = CrashingRevisionIndexer(config=BASE_TEST_CONFIG)
     indexer.storage = Mock()
-    indexer.storage.revision_get.return_value = ["rev"]
+    indexer.storage.revision_get.return_value = [REVISION]
 
     assert indexer.run([b"foo"]) == {"status": "failed"}
 
     indexer.catch_exceptions = False
 
     with pytest.raises(_TestException):
         indexer.run([b"foo"])
 
 
 def test_origin_indexer_catch_exceptions():
     indexer = CrashingOriginIndexer(config=BASE_TEST_CONFIG)
 
     assert indexer.run(["http://example.org"]) == {"status": "failed"}
 
     indexer.catch_exceptions = False
 
     with pytest.raises(_TestException):
         indexer.run(["http://example.org"])
 
 
 def test_content_partition_indexer_catch_exceptions():
     indexer = CrashingContentPartitionIndexer(
         config={**BASE_TEST_CONFIG, "write_batch_size": 42}
     )
 
     assert indexer.run(0, 42) == {"status": "failed"}
 
     indexer.catch_exceptions = False
 
     with pytest.raises(_TestException):
         indexer.run(0, 42)
 
 
 def test_content_partition_indexer():
     # TODO: simplify the mocking in this test
     indexer = TrivialContentPartitionIndexer(
         config={
             **BASE_TEST_CONFIG,
             "write_batch_size": 10,
         }  # doesn't matter
     )
     indexer.catch_exceptions = False
     indexer._results = []
     indexer.storage = Mock()
     indexer.storage.content_get_partition = lambda *args, **kwargs: PagedResult(
         results=[
             Content(sha1=c, sha1_git=c, sha256=c, blake2s256=c, length=42)
             for c in [
                 b"hash1",
                 b"excluded hash",
                 b"hash2",
                 b"other excluded hash",
                 b"hash3",
             ]
         ],
         next_page_token=None,
     )
     indexer.objstorage = Mock()
     indexer.objstorage.get = lambda id: b"foo"
     nb_partitions = 1
     partition_id = 0
     indexer.run(partition_id, nb_partitions)
     assert indexer._results == [["indexed hash1", "indexed hash2", "indexed hash3"]]
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index e05bd08..3afc777 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,256 +1,256 @@
 # Copyright (C) 2018-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import copy
 from unittest.mock import patch
 
 import pytest
 
 from swh.indexer.metadata import OriginMetadataIndexer
 from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.indexer.storage.model import (
     OriginIntrinsicMetadataRow,
     RevisionIntrinsicMetadataRow,
 )
 from swh.model.model import Origin
 from swh.storage.interface import StorageInterface
 
 from .test_metadata import TRANSLATOR_TOOL
 from .utils import REVISION, YARN_PARSER_METADATA
 
 
 @pytest.fixture
 def swh_indexer_config(swh_indexer_config):
     """Override the default configuration to override the tools entry"""
     cfg = copy.deepcopy(swh_indexer_config)
     cfg["tools"] = TRANSLATOR_TOOL
     return cfg
 
 
 def test_origin_metadata_indexer(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
     indexer.run([origin])
 
     tool = swh_indexer_config["tools"]
 
     rev_id = REVISION.id
     rev_metadata = RevisionIntrinsicMetadataRow(
         id=rev_id,
         tool=tool,
         metadata=YARN_PARSER_METADATA,
         mappings=["npm"],
     )
     origin_metadata = OriginIntrinsicMetadataRow(
         id=origin,
         tool=tool,
         from_revision=rev_id,
         metadata=YARN_PARSER_METADATA,
         mappings=["npm"],
     )
 
     rev_results = list(idx_storage.revision_intrinsic_metadata_get([rev_id]))
     for rev_result in rev_results:
         assert rev_result.tool
         del rev_result.tool["id"]
     assert rev_results == [rev_metadata]
 
     orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
     for orig_result in orig_results:
         assert orig_result.tool
         del orig_result.tool["id"]
     assert orig_results == [origin_metadata]
 
 
 def test_origin_metadata_indexer_duplicate_origin(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.storage = storage
     indexer.idx_storage = idx_storage
     indexer.run(["https://github.com/librariesio/yarn-parser"])
     indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
 
     origin = "https://github.com/librariesio/yarn-parser"
     rev_id = REVISION.id
 
     rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
     assert len(rev_results) == 1
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert len(orig_results) == 1
 
 
 def test_origin_metadata_indexer_missing_head(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     storage.origin_add([Origin(url="https://example.com")])
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.run(["https://example.com"])
 
     origin = "https://example.com"
 
     results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert results == []
 
 
 def test_origin_metadata_indexer_partial_missing_head(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     origin1 = "https://example.com"
     origin2 = "https://github.com/librariesio/yarn-parser"
     storage.origin_add([Origin(url=origin1)])
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.run([origin1, origin2])
 
     rev_id = REVISION.id
 
     rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
     assert rev_results == [
         RevisionIntrinsicMetadataRow(
             id=rev_id,
             metadata=YARN_PARSER_METADATA,
             mappings=["npm"],
             tool=rev_results[0].tool,
         )
     ]
 
     orig_results = list(
         indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
     )
     for orig_result in orig_results:
         assert orig_results == [
             OriginIntrinsicMetadataRow(
                 id=origin2,
                 from_revision=rev_id,
                 metadata=YARN_PARSER_METADATA,
                 mappings=["npm"],
                 tool=orig_results[0].tool,
             )
         ]
 
 
 def test_origin_metadata_indexer_duplicate_revision(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.storage = storage
     indexer.idx_storage = idx_storage
     indexer.catch_exceptions = False
     origin1 = "https://github.com/librariesio/yarn-parser"
     origin2 = "https://github.com/librariesio/yarn-parser.git"
     indexer.run([origin1, origin2])
 
     rev_id = REVISION.id
 
     rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
     assert len(rev_results) == 1
 
     orig_results = list(
         indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
     )
     assert len(orig_results) == 2
 
 
 def test_origin_metadata_indexer_no_metadata_file(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
     with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
         indexer.run([origin])
 
     rev_id = REVISION.id
 
     rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
     assert rev_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
 
 
 def test_origin_metadata_indexer_no_metadata(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
     with patch(
         "swh.indexer.metadata.RevisionMetadataIndexer"
         ".translate_revision_intrinsic_metadata",
         return_value=(["npm"], {"@context": "foo"}),
     ):
         indexer.run([origin])
 
     rev_id = REVISION.id
 
     rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
     assert rev_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
 
 
 def test_origin_metadata_indexer_error(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
     with patch(
         "swh.indexer.metadata.RevisionMetadataIndexer"
         ".translate_revision_intrinsic_metadata",
         return_value=None,
     ):
         indexer.run([origin])
 
     rev_id = REVISION.id
 
     rev_results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
     assert rev_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
 
 
 def test_origin_metadata_indexer_unknown_origin(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
-    result = indexer.index_list(["https://unknown.org/foo"])
+    result = indexer.index_list([Origin("https://unknown.org/foo")])
     assert not result