diff --git a/PKG-INFO b/PKG-INFO
index 10c99f5..7c5cbd3 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,71 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 2.7.3
+Version: 2.8.0
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS
 
 swh-indexer
 ============
 
 Tools to compute multiple indexes on SWH's raw contents:
 - content:
   - mimetype
   - ctags
   - language
   - fossology-license
   - metadata
 - revision:
   - metadata
 
 An indexer is in charge of:
 - looking up objects
 - extracting information from those objects
 - store those information in the swh-indexer db
 
 There are multiple indexers working on different object types:
 - content indexer: works with content sha1 hashes
 - revision indexer: works with revision sha1 hashes
 - origin indexer: works with origin identifiers
 
 Indexation procedure:
 - receive batch of ids
 - retrieve the associated data depending on object type
 - compute for that object some index
 - store the result to swh's storage
 
 Current content indexers:
 - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype
 - language (queue swh_indexer_content_language): detect the programming language
 - ctags (queue swh_indexer_content_ctags): compute tags information
 - fossology-license (queue swh_indexer_fossology_license): compute the license
 - metadata: translate file into translated_metadata dict
 
 Current revision indexers:
 - metadata: detects files containing metadata and retrieves translated_metadata
   in content_metadata table in storage or run content indexer to translate files.
diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO
index 10c99f5..7c5cbd3 100644
--- a/swh.indexer.egg-info/PKG-INFO
+++ b/swh.indexer.egg-info/PKG-INFO
@@ -1,71 +1,71 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 2.7.3
+Version: 2.8.0
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS
 
 swh-indexer
 ============
 
 Tools to compute multiple indexes on SWH's raw contents:
 - content:
   - mimetype
   - ctags
   - language
   - fossology-license
   - metadata
 - revision:
   - metadata
 
 An indexer is in charge of:
 - looking up objects
 - extracting information from those objects
 - store those information in the swh-indexer db
 
 There are multiple indexers working on different object types:
 - content indexer: works with content sha1 hashes
 - revision indexer: works with revision sha1 hashes
 - origin indexer: works with origin identifiers
 
 Indexation procedure:
 - receive batch of ids
 - retrieve the associated data depending on object type
 - compute for that object some index
 - store the result to swh's storage
 
 Current content indexers:
 - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype
 - language (queue swh_indexer_content_language): detect the programming language
 - ctags (queue swh_indexer_content_ctags): compute tags information
 - fossology-license (queue swh_indexer_fossology_license): compute the license
 - metadata: translate file into translated_metadata dict
 
 Current revision indexers:
 - metadata: detects files containing metadata and retrieves translated_metadata
   in content_metadata table in storage or run content indexer to translate files.
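One of the main changes in 2.8.0 is in the swh/indexer/metadata.py hunk that follows: ExtrinsicMetadataIndexer now also indexes raw extrinsic metadata whose target is not an origin (for example a deposited directory that carries an origin context), by falling back to sha1 of the origin URL for the storage.origin_get_by_sha1() lookup. Below is a minimal sketch of that selection logic; the helper name and the plain-string target_type argument are hypothetical simplifications, since the real code reads these fields from a RawExtrinsicMetadata object.

```python
import hashlib
from typing import Optional


def resolve_origin_sha1(
    target_type: str, target_object_id: bytes, origin_url: Optional[str]
) -> Optional[bytes]:
    """Pick the sha1 passed to storage.origin_get_by_sha1(), mirroring the
    hunk below (hypothetical helper with simplified arguments)."""
    if target_type == "origin":
        # metadata attached directly to an origin: reuse the target's object id
        return target_object_id
    if origin_url is not None:
        # metadata on another object type (e.g. a directory) with an origin
        # context: index it as if it were attached to the origin itself
        return hashlib.sha1(origin_url.encode()).digest()
    # no origin context: such targets are not supported yet
    return None
```

The updated tests in swh/indexer/tests/test_metadata.py further down assert the concrete digest this produces for the deposit origin https://example.org/jdoe/myrepo.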
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index f1e1e6a..14212a3 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,559 +1,567 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy +import hashlib import itertools import logging import time from typing import ( Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar, cast, ) from urllib.parse import urlparse import pkg_resources import sentry_sdk from swh.core.config import merge_configs from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ( BaseIndexer, ContentIndexer, DirectoryIndexer, ObjectsDict, OriginIndexer, ) from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.origin_head import get_head_swhid from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, OriginExtrinsicMetadataRow, OriginIntrinsicMetadataRow, ) from swh.model import hashutil from swh.model.model import Directory, MetadataAuthorityType from swh.model.model import ObjectType as ModelObjectType from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType REVISION_GET_BATCH_SIZE = 10 RELEASE_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 T1 = TypeVar("T1") T2 = TypeVar("T2") logger = logging.getLogger(__name__) def call_with_batches( f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int, ) -> Iterator[T2]: """Calls a function with batches of args, and concatenates the results.""" groups = grouper(args, batch_size) for group in groups: yield from f(list(group)) class ExtrinsicMetadataIndexer( BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow] ): def process_journal_objects(self, objects: ObjectsDict) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} try: results = {} for item in objects.get("raw_extrinsic_metadata", []): remd = RawExtrinsicMetadata.from_dict(item) sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid())) results[remd.target] = self.index(remd.id, data=remd) except Exception: if not self.catch_exceptions: raise summary["status"] = "failed" return summary self.results = list(itertools.chain.from_iterable(results.values())) summary_persist = self.persist_index_computations(self.results) if summary_persist: for value in summary_persist.values(): if value > 0: summary["status"] = "eventful" summary.update(summary_persist) return summary def index( self, id: Sha1Git, data: Optional[RawExtrinsicMetadata], **kwargs, ) -> List[OriginExtrinsicMetadataRow]: if data is None: raise NotImplementedError( "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data" ) - if data.target.object_type != ExtendedObjectType.ORIGIN: + if data.target.object_type == ExtendedObjectType.ORIGIN: + origin_sha1 = data.target.object_id + elif data.origin is not None: + # HACK: As swh-search does (yet?) 
not support searching on directories + # and traversing back to origins, we index metadata on non-origins with + # an origin context as if they were on the origin itself. + origin_sha1 = hashlib.sha1(data.origin.encode()).digest() + else: # other types are not supported yet return [] if data.authority.type == MetadataAuthorityType.REGISTRY: # metadata provided by a third-party; don't trust it # (technically this could be handled below, but we check it here # to return early; sparing a translation and origin lookup) # TODO: add ways to define trusted authorities return [] metadata_items = [] mappings: List[str] = [] for mapping_cls in EXTRINSIC_MAPPINGS.values(): if data.format in mapping_cls.extrinsic_metadata_formats(): mapping = mapping_cls() metadata_item = mapping.translate(data.metadata) if metadata_item is not None: metadata_items.append(metadata_item) mappings.append(mapping.name) if not metadata_items: # Don't have any mapping to parse it, ignore return [] # TODO: batch requests to origin_get_by_sha1() for _ in range(6): - origins = self.storage.origin_get_by_sha1([data.target.object_id]) + origins = self.storage.origin_get_by_sha1([origin_sha1]) try: (origin,) = origins if origin is not None: break except ValueError: pass # The origin does not exist. This may be due to some replication lag # between the loader's DB/journal and the DB we are consuming from. # Wait a bit and try again logger.debug("Origin %s not found, sleeping for 10s.", data.target) time.sleep(10) else: # Does not exist, or replication lag > 60s. raise ValueError(f"Unknown origin {data.target}") from None if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc: # metadata provided by a third-party; don't trust it # TODO: add ways to define trusted authorities return [] metadata = merge_documents(metadata_items) return [ OriginExtrinsicMetadataRow( id=origin["url"], indexer_configuration_id=self.tool["id"], from_remd_id=data.id, mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[OriginExtrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" return self.idx_storage.origin_extrinsic_metadata_add(results) class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.content_metadata_missing( ( { "id": sha1, "indexer_configuration_id": self.tool["id"], } for sha1 in ids ) ) def index( self, id: Sha1, data: Optional[bytes] = None, log_suffix="unknown directory", **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. Args: id: content's identifier data: raw content in bytes Returns: dict: dictionary representing a content_metadata. 
If the translation wasn't successful the metadata keys will be returned as None """ assert isinstance(id, bytes) assert data is not None metadata = None try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) sentry_sdk.capture_exception() if metadata is None: return [] return [ ContentMetadataRow( id=id, indexer_configuration_id=self.tool["id"], metadata=metadata, ) ] def persist_index_computations( self, results: List[ContentMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" return self.idx_storage.content_metadata_add(results) DEFAULT_CONFIG: Dict[str, Any] = { "tools": { "name": "swh.indexer.metadata", "version": pkg_resources.get_distribution("swh.indexer").version, "configuration": {}, }, } class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): """Directory-level indexer This indexer is in charge of: - filtering directories already indexed in directory_intrinsic_metadata table with defined computation tool - retrieve all entry_files in directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for directory """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = merge_configs(DEFAULT_CONFIG, self.config) def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.directory_intrinsic_metadata_missing( ( { "id": sha1_git, "indexer_configuration_id": self.tool["id"], } for sha1_git in sha1_gits ) ) def index( self, id: Sha1Git, data: Optional[Directory] = None, **kwargs ) -> List[DirectoryIntrinsicMetadataRow]: """Index directory by processing it and organizing result. use metadata_detector to iterate on filenames, passes them to the content indexers, then merges (if more than one) Args: id: sha1_git of the directory data: should always be None Returns: dict: dictionary representing a directory_intrinsic_metadata, with keys: - id: directory's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ dir_: List[DirectoryLsEntry] assert data is None, "Unexpected directory object" dir_ = cast( List[DirectoryLsEntry], list(self.storage.directory_ls(id, recursive=False)), ) try: if [entry["type"] for entry in dir_] == ["dir"]: # If the root is just a single directory, recurse into it # eg. 
PyPI packages, GNU tarballs subdir = dir_[0]["target"] dir_ = cast( List[DirectoryLsEntry], list(self.storage.directory_ls(subdir, recursive=False)), ) files = [entry for entry in dir_ if entry["type"] == "file"] (mappings, metadata) = self.translate_directory_intrinsic_metadata( files, log_suffix="directory=%s" % hashutil.hash_to_hex(id), ) except Exception as e: self.log.exception("Problem when indexing dir: %r", e) sentry_sdk.capture_exception() return [] return [ DirectoryIntrinsicMetadataRow( id=id, indexer_configuration_id=self.tool["id"], mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" # TODO: add functions in storage to keep data in # directory_intrinsic_metadata return self.idx_storage.directory_intrinsic_metadata_add(results) def translate_directory_intrinsic_metadata( self, files: List[DirectoryLsEntry], log_suffix: str ) -> Tuple[List[Any], Any]: """ Determine plan of action to translate metadata in the given root directory Args: files: list of file entries, as returned by :meth:`swh.storage.interface.StorageInterface.directory_ls` Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ metadata = [] # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage", "tools"] } all_detected_files = detect_metadata(files) used_mappings = [ INTRINSIC_MAPPINGS[context].name for context in all_detected_files ] for (mapping_name, detected_files) in all_detected_files.items(): cfg = deepcopy(config) cfg["tools"]["configuration"]["context"] = mapping_name c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get(detected_files) for c in metadata_generator: # extracting metadata sha1 = c.id sha1s_in_storage.append(sha1) local_metadata = c.metadata # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [ item for item in detected_files if item not in sha1s_in_storage ] if sha1s_filtered: # content indexing try: c_metadata_indexer.run( sha1s_filtered, log_suffix=log_suffix, ) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result.metadata metadata.append(local_metadata) except Exception: self.log.exception("Exception while indexing metadata on contents") sentry_sdk.capture_exception() metadata = merge_documents(metadata) return (used_mappings, metadata) class OriginMetadataIndexer( OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]] ): USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config) def index_list( self, origins: List[Origin], *, check_origin_known: bool = True, **kwargs, ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] head_rel_ids = [] origin_heads: Dict[Origin, CoreSWHID] = {} # Filter out origins not in the storage if check_origin_known: known_origins = list( call_with_batches( self.storage.origin_get, [origin.url for origin in origins], ORIGIN_GET_BATCH_SIZE, ) ) else: known_origins = list(origins) for origin in known_origins: if origin is None: continue head_swhid = 
get_head_swhid(self.storage, origin.url) if head_swhid: origin_heads[origin] = head_swhid if head_swhid.object_type == ObjectType.REVISION: head_rev_ids.append(head_swhid.object_id) elif head_swhid.object_type == ObjectType.RELEASE: head_rel_ids.append(head_swhid.object_id) else: assert False, head_swhid head_revs = dict( zip( head_rev_ids, call_with_batches( self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE ), ) ) head_rels = dict( zip( head_rel_ids, call_with_batches( self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE ), ) ) results = [] for (origin, head_swhid) in origin_heads.items(): sentry_sdk.set_tag("swh-indexer-origin-url", origin.url) sentry_sdk.set_tag("swh-indexer-origin-head-swhid", str(head_swhid)) if head_swhid.object_type == ObjectType.REVISION: rev = head_revs[head_swhid.object_id] if not rev: self.log.warning( "Missing head object %s of origin %r", head_swhid, origin.url ) continue directory_id = rev.directory elif head_swhid.object_type == ObjectType.RELEASE: rel = head_rels[head_swhid.object_id] if not rel: self.log.warning( "Missing head object %s of origin %r", head_swhid, origin.url ) continue if rel.target_type != ModelObjectType.DIRECTORY: # TODO self.log.warning( "Head release %s of %r has unexpected target type %s", head_swhid, origin.url, rel.target_type, ) continue assert rel.target, rel directory_id = rel.target else: assert False, head_swhid for dir_metadata in self.directory_metadata_indexer.index(directory_id): # There is at most one dir_metadata orig_metadata = OriginIntrinsicMetadataRow( from_directory=dir_metadata.id, id=origin.url, metadata=dir_metadata.metadata, mappings=dir_metadata.mappings, indexer_configuration_id=dir_metadata.indexer_configuration_id, ) results.append((orig_metadata, dir_metadata)) return results def persist_index_computations( self, results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]], ) -> Dict[str, int]: # Deduplicate directories dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {} orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {} summary: Dict = {} for (orig_item, dir_item) in results: assert dir_item.metadata == orig_item.metadata if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets if dir_item.id not in dir_metadata: dir_metadata[dir_item.id] = dir_item if orig_item.id not in orig_metadata: orig_metadata[orig_item.id] = orig_item if dir_metadata: summary_dir = self.idx_storage.directory_intrinsic_metadata_add( list(dir_metadata.values()) ) summary.update(summary_dir) if orig_metadata: summary_ori = self.idx_storage.origin_intrinsic_metadata_add( list(orig_metadata.values()) ) summary.update(summary_ori) return summary diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py index 2d9ff6d..82ac133 100644 --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -1,120 +1,156 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union -from swh.model.model import SnapshotBranch, TargetType +from swh.model.model import Snapshot, SnapshotBranch, TargetType from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos.origin import origin_get_latest_visit_status from 
swh.storage.algos.snapshot import snapshot_get_all_branches +from swh.storage.interface import PartialBranches, StorageInterface -def get_head_swhid(storage, origin_url: str) -> Optional[CoreSWHID]: +def get_head_swhid(storage: StorageInterface, origin_url: str) -> Optional[CoreSWHID]: """Returns the SWHID of the head revision or release of an origin""" visit_status = origin_get_latest_visit_status( storage, origin_url, allowed_statuses=["full"], require_snapshot=True ) if not visit_status: return None assert visit_status.snapshot is not None - snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) - if snapshot is None: - return None if visit_status.type == "ftp": - return _try_get_ftp_head(dict(snapshot.branches)) + # We need to fetch all branches in order to find the largest one + snapshot = snapshot_get_all_branches(storage, visit_status.snapshot) + if snapshot is None: + return None + return _try_get_ftp_head(storage, snapshot) else: - return _try_get_head_generic(dict(snapshot.branches)) + # Peak into the snapshot, without fetching too many refs. + # If the snapshot is small, this gets all of it in a single request. + # If the snapshot is large, we will query specific branches as we need them. + partial_branches = storage.snapshot_get_branches( + visit_status.snapshot, branches_count=100 + ) + if partial_branches is None: + # Snapshot does not exist + return None + return _try_get_head_generic(storage, partial_branches) _archive_filename_re = re.compile( rb"^" rb"(?P.*)[-_]" rb"(?P[0-9]+(\.[0-9])*)" rb"(?P[-+][a-zA-Z0-9.~]+?)?" rb"(?P(\.[a-zA-Z0-9]+)+)" rb"$" ) def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]: """Extracts the release version from an archive filename, to get an ordering whose maximum is likely to be the last version of the software >>> _parse_version(b'foo') (-inf,) >>> _parse_version(b'foo.tar.gz') (-inf,) >>> _parse_version(b'gnu-hello-0.0.1.tar.gz') (0, 0, 1, 0) >>> _parse_version(b'gnu-hello-0.0.1-beta2.tar.gz') (0, 0, 1, -1, 'beta2') >>> _parse_version(b'gnu-hello-0.0.1+foobar.tar.gz') (0, 0, 1, 1, 'foobar') """ res = _archive_filename_re.match(filename) if res is None: return (float("-infinity"),) version: List[Union[float, int, str]] = [ int(n) for n in res.group("version").decode().split(".") ] if res.group("preversion") is None: version.append(0) else: preversion = res.group("preversion").decode() if preversion.startswith("-"): version.append(-1) version.append(preversion[1:]) elif preversion.startswith("+"): version.append(1) version.append(preversion[1:]) else: assert False, res.group("preversion") return tuple(version) def _try_get_ftp_head( - branches: Dict[bytes, Optional[SnapshotBranch]] + storage: StorageInterface, snapshot: Snapshot ) -> Optional[CoreSWHID]: - archive_names = list(branches) + archive_names = list(snapshot.branches) max_archive_name = max(archive_names, key=_parse_version) - return _try_resolve_target(branches, max_archive_name) + return _try_resolve_target( + storage, + {"id": snapshot.id, "branches": dict(snapshot.branches), "next_branch": None}, + branch_name=max_archive_name, + ) def _try_get_head_generic( - branches: Dict[bytes, Optional[SnapshotBranch]] + storage: StorageInterface, partial_branches: PartialBranches ) -> Optional[CoreSWHID]: # Works on 'deposit', 'pypi', and VCSs. 
- return _try_resolve_target(branches, b"HEAD") or _try_resolve_target( - branches, b"master" - ) + return _try_resolve_target( + storage, partial_branches, branch_name=b"HEAD" + ) or _try_resolve_target(storage, partial_branches, branch_name=b"master") + + +def _get_branch( + storage: StorageInterface, partial_branches: PartialBranches, branch_name: bytes +) -> Optional[SnapshotBranch]: + """Given a ``branch_name``, gets it from ``partial_branches`` if present, + and fetches it from the storage otherwise.""" + if branch_name in partial_branches["branches"]: + return partial_branches["branches"][branch_name] + elif partial_branches["next_branch"] is not None: + # Branch is not in `partial_branches`, and `partial_branches` indeed partial + res = storage.snapshot_get_branches( + partial_branches["id"], branches_from=branch_name, branches_count=1 + ) + assert res is not None, "Snapshot does not exist anymore" + return res["branches"].get(branch_name) + else: + # Branch is not in `partial_branches`, but `partial_branches` is the full + # list of branches, which means it is a dangling reference. + return None def _try_resolve_target( - branches: Dict[bytes, Optional[SnapshotBranch]], branch_name: bytes + storage: StorageInterface, partial_branches: PartialBranches, branch_name: bytes ) -> Optional[CoreSWHID]: try: - branch = branches[branch_name] + branch = _get_branch(storage, partial_branches, branch_name) if branch is None: return None + while branch.target_type == TargetType.ALIAS: - branch = branches[branch.target] + branch = _get_branch(storage, partial_branches, branch.target) if branch is None: return None if branch.target_type == TargetType.REVISION: return CoreSWHID(object_type=ObjectType.REVISION, object_id=branch.target) elif branch.target_type == TargetType.CONTENT: return None # TODO elif branch.target_type == TargetType.DIRECTORY: return None # TODO elif branch.target_type == TargetType.RELEASE: return CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target) else: assert False, branch except KeyError: return None diff --git a/swh/indexer/storage/writer.py b/swh/indexer/storage/writer.py index b4fa365..7c06800 100644 --- a/swh/indexer/storage/writer.py +++ b/swh/indexer/storage/writer.py @@ -1,69 +1,73 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Callable, Dict, Iterable, Optional import attr try: from swh.journal.writer import JournalWriterInterface, get_journal_writer except ImportError: get_journal_writer = None # type: ignore # mypy limitation, see https://github.com/python/mypy/issues/1153 from .model import BaseRow class JournalWriter: """Journal writer storage collaborator. It's in charge of adding objects to the journal. 
""" journal: Optional[JournalWriterInterface] def __init__(self, tool_getter: Callable[[int], Dict[str, Any]], journal_writer): """ Args: tool_getter: a callable that takes a tool_id and return a dict representing a tool object journal_writer: configuration passed to `swh.journal.writer.get_journal_writer` """ self._tool_getter = tool_getter if journal_writer: if get_journal_writer is None: raise EnvironmentError( "You need the swh.journal package to use the " "journal_writer feature" ) self.journal = get_journal_writer( **journal_writer, value_sanitizer=lambda object_type, value_dict: value_dict, ) else: self.journal = None def write_additions(self, obj_type, entries: Iterable[BaseRow]) -> None: if not self.journal: return + translated = [] + # usually, all the additions in a batch are from the same indexer, # so this cache allows doing a single query for all the entries. tool_cache = {} for entry in entries: assert entry.object_type == obj_type # type: ignore # get the tool used to generate this addition tool_id = entry.indexer_configuration_id assert tool_id if tool_id not in tool_cache: tool_cache[tool_id] = self._tool_getter(tool_id) entry = attr.evolve( entry, tool=tool_cache[tool_id], indexer_configuration_id=None ) - # write to kafka - self.journal.write_addition(obj_type, entry) + translated.append(entry) + + # write to kafka + self.journal.write_additions(obj_type, translated) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 37b574c..bb6b883 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,414 +1,419 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from unittest.mock import call import attr from swh.indexer.metadata import ( ContentMetadataIndexer, DirectoryMetadataIndexer, ExtrinsicMetadataIndexer, ) from swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, OriginExtrinsicMetadataRow, ) from swh.indexer.tests.utils import DIRECTORY2 from swh.model.model import ( Directory, DirectoryEntry, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, RawExtrinsicMetadata, ) from swh.model.swhids import ExtendedObjectType, ExtendedSWHID from .utils import ( BASE_TEST_CONFIG, MAPPING_DESCRIPTION_CONTENT_SHA1, MAPPING_DESCRIPTION_CONTENT_SHA1GIT, YARN_PARSER_METADATA, fill_obj_storage, fill_storage, ) TRANSLATOR_TOOL = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {"type": "local", "context": "NpmMapping"}, } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, "should not be called; the dir indexer configures it." 
DIRECTORY_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } DEPOSIT_REMD = RawExtrinsicMetadata( target=ExtendedSWHID( - object_type=ExtendedObjectType.ORIGIN, - object_id=b"\x01" * 20, + object_type=ExtendedObjectType.DIRECTORY, + object_id=b"\x02" * 20, ), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://example.org/", ), fetcher=MetadataFetcher( name="example-fetcher", version="1.0.0", ), format="sword-v2-atom-codemeta-v2", metadata=""" My Software Author 1 foo@example.org Author 2 """.encode(), + origin="https://example.org/jdoe/myrepo", ) GITHUB_REMD = RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.ORIGIN, object_id=b"\x01" * 20, ), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://example.org/", ), fetcher=MetadataFetcher( name="example-fetcher", version="1.0.0", ), format="application/vnd.github.v3+json", metadata=b'{"full_name": "test software", "html_url": "http://example.org/"}', ) class TestMetadata: """ Tests metadata_mock_tool tool for Metadata detection """ def test_directory_metadata_indexer(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None dir_ = DIRECTORY2 assert ( dir_.entries[0].target == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] ) metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([dir_.id]) results = list( metadata_indexer.idx_storage.directory_intrinsic_metadata_get([dir_.id]) ) expected_results = [ DirectoryIntrinsicMetadataRow( id=dir_.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del result.tool["id"] assert results == expected_results def test_directory_metadata_indexer_single_root_dir(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the directory dir_ = DIRECTORY2 assert ( dir_.entries[0].target == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] ) new_dir = Directory( entries=( DirectoryEntry( name=b"foobar-1.0.0", type="dir", target=dir_.id, perms=16384, ), ), ) assert new_dir.id is not None metadata_indexer.storage.directory_add([new_dir]) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([new_dir.id]) results = list( metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id]) ) expected_results = [ DirectoryIntrinsicMetadataRow( id=new_dir.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del 
result.tool["id"] assert results == expected_results def test_extrinsic_metadata_indexer_unknown_format(self, mocker): """Should be ignored when unknown format""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve(GITHUB_REMD, format="unknown format") results = metadata_indexer.index(remd.id, data=remd) assert metadata_indexer.storage.method_calls == [] assert results == [] def test_extrinsic_metadata_indexer_github(self, mocker): """Nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [GITHUB_REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [ OriginExtrinsicMetadataRow( id="https://example.org/jdoe/myrepo", tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "id": "http://example.org/", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, from_remd_id=GITHUB_REMD.id, mappings=["github"], ) ] def test_extrinsic_metadata_indexer_firstparty_deposit(self, mocker): """Also nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ - call.origin_get_by_sha1([b"\x01" * 20]) + call.origin_get_by_sha1( + [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] + ) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [ OriginExtrinsicMetadataRow( id="https://example.org/jdoe/myrepo", tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": [ {"email": "foo@example.org", "name": "Author 1"}, {"name": "Author 2"}, ], "name": "My Software", }, from_remd_id=DEPOSIT_REMD.id, mappings=["sword-codemeta"], ) ] def test_extrinsic_metadata_indexer_thirdparty_deposit(self, mocker): """Metadata-only deposit: currently ignored""" origin = "https://not-from-example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = 
mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [DEPOSIT_REMD.to_dict()]} ) == {"status": "uneventful", "origin_extrinsic_metadata:add": 0} assert metadata_indexer.storage.method_calls == [ - call.origin_get_by_sha1([b"\x01" * 20]) + call.origin_get_by_sha1( + [b"\xb1\x0c\\\xd2w\x1b\xdd\xac\x07\xdb\xdf>\x93O1\xd0\xc9L\x0c\xcf"] + ) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [] def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker): """Early abort on non-forge authorities""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve( GITHUB_REMD, authority=attr.evolve( GITHUB_REMD.authority, type=MetadataAuthorityType.REGISTRY ), ) results = metadata_indexer.index(remd.id, data=remd) assert metadata_indexer.storage.method_calls == [] assert results == [] def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker): """Should be ignored when authority URL does not match the origin""" origin = "https://different-domain.example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None results = metadata_indexer.index(GITHUB_REMD.id, data=GITHUB_REMD) assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] assert results == [] def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker): """Nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( { "raw_extrinsic_metadata": [ GITHUB_REMD.to_dict(), {**GITHUB_REMD.to_dict(), "id": b"\x00" * 20}, ] } ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert len(results) == 1, results assert results[0].from_remd_id == b"\x00" * 20 diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py index 999084b..e44ca71 100644 --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -1,157 +1,271 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone +import 
itertools import pytest from swh.indexer.origin_head import get_head_swhid from swh.indexer.tests.utils import fill_storage from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import CoreSWHID from swh.storage.utils import now + +@pytest.fixture +def swh_storage_backend_config(): + """In-memory storage, to make tests go faster.""" + return {"cls": "memory"} + + SAMPLE_SNAPSHOT = Snapshot( branches={ b"foo": None, b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"foo", ), }, ) +def _add_snapshot_to_origin(storage, origin_url, visit_type, snapshot): + storage.origin_add([Origin(url=origin_url)]) + visit = storage.origin_visit_add( + [ + OriginVisit( + origin=origin_url, + date=datetime(2019, 2, 27, tzinfo=timezone.utc), + type="pypi", + ) + ] + )[0] + storage.snapshot_add([snapshot]) + visit_status = OriginVisitStatus( + origin=origin_url, + visit=visit.visit, + date=now(), + status="full", + snapshot=snapshot.id, + ) + storage.origin_visit_status_add([visit_status]) + + @pytest.fixture def storage(swh_storage): fill_storage(swh_storage) return swh_storage def test_git(storage): origin_url = "https://github.com/SoftwareHeritage/swh-storage" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:384b12006403cce45d6253e38f7bd77dacef726d" ) def test_git_partial_snapshot(storage): """Checks partial snapshots are ignored.""" origin_url = "https://github.com/SoftwareHeritage/swh-core" storage.origin_add([Origin(url=origin_url)]) visit = storage.origin_visit_add( [ OriginVisit( origin=origin_url, date=datetime(2019, 2, 27, tzinfo=timezone.utc), type="git", ) ] )[0] storage.snapshot_add([SAMPLE_SNAPSHOT]) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=now(), status="partial", snapshot=SAMPLE_SNAPSHOT.id, ) storage.origin_visit_status_add([visit_status]) assert get_head_swhid(storage, origin_url) is None def test_vcs_missing_snapshot(storage): origin_url = "https://github.com/SoftwareHeritage/swh-indexer" storage.origin_add([Origin(url=origin_url)]) assert get_head_swhid(storage, origin_url) is None def test_pypi_missing_branch(storage): origin_url = "https://pypi.org/project/abcdef/" - storage.origin_add( - [ - Origin( - url=origin_url, - ) - ] + _add_snapshot_to_origin(storage, origin_url, "pypi", SAMPLE_SNAPSHOT) + assert get_head_swhid(storage, origin_url) is None + + +@pytest.mark.parametrize( + "branches_start,branches_middle,branches_end", + itertools.product([0, 40, 99, 100, 200], [0, 40, 99, 100, 200], [0, 40, 200]), +) +def test_large_snapshot(storage, branches_start, branches_middle, branches_end): + rev_id = "8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(branches_start)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/foo" + ), + ) + ] + + [(f"aaaa{i}".encode(), None) for i in range(branches_middle)] + + [ + ( + b"refs/heads/foo", + SnapshotBranch( + target_type=TargetType.REVISION, + target=bytes.fromhex(rev_id), + ), + ) + ] + + [(f"zzzz{i}".encode(), None) for i in range(branches_end)] + ) ) - visit = storage.origin_visit_add( - [ - OriginVisit( - origin=origin_url, - date=datetime(2019, 2, 27, tzinfo=timezone.utc), - type="pypi", - ) - ] - )[0] - storage.snapshot_add([SAMPLE_SNAPSHOT]) - visit_status = OriginVisitStatus( - origin=origin_url, - visit=visit.visit, - date=now(), - status="full", - 
snapshot=SAMPLE_SNAPSHOT.id, + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" ) - storage.origin_visit_status_add([visit_status]) + + +def test_large_snapshot_chained_aliases(storage): + rev_id = "8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(200)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/alias2" + ), + ) + ] + + [(f"aaaa{i}".encode(), None) for i in range(200)] + + [ + ( + b"refs/heads/alias2", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/branch" + ), + ) + ] + + [(f"refs/heads/bbbb{i}".encode(), None) for i in range(200)] + + [ + ( + b"refs/heads/branch", + SnapshotBranch( + target_type=TargetType.REVISION, + target=bytes.fromhex(rev_id), + ), + ) + ] + ) + ) + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + ) + + +@pytest.mark.parametrize( + "branches_start,branches_end", + itertools.product([0, 40, 99, 100, 200], [0, 40, 200]), +) +def test_large_snapshot_dangling_alias(storage, branches_start, branches_end): + snapshot = Snapshot( + branches=dict( + [(f"AAAA{i}".encode(), None) for i in range(branches_start)] + + [ + ( + b"HEAD", + SnapshotBranch( + target_type=TargetType.ALIAS, target=b"refs/heads/foo" + ), + ) + ] + + [(f"zzzz{i}".encode(), None) for i in range(branches_end)] + ) + ) + + origin_url = "https://example.org/repo.git" + _add_snapshot_to_origin(storage, origin_url, "git", snapshot) + assert get_head_swhid(storage, origin_url) is None def test_ftp(storage): origin_url = "rsync://ftp.gnu.org/gnu/3dldf" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" ) def test_ftp_missing_snapshot(storage): origin_url = "rsync://ftp.gnu.org/gnu/foobar" storage.origin_add([Origin(url=origin_url)]) assert get_head_swhid(storage, origin_url) is None def test_deposit(storage): origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/" storage.origin_add([Origin(url=origin_url)]) assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb" ) def test_deposit_missing_snapshot(storage): origin_url = "https://forge.softwareheritage.org/source/foobar" storage.origin_add( [ Origin( url=origin_url, ) ] ) assert get_head_swhid(storage, origin_url) is None def test_pypi(storage): origin_url = "https://old-pypi.example.org/project/limnoria/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" ) origin_url = "https://pypi.org/project/limnoria/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rel:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" ) def test_svn(storage): origin_url = "http://0-512-md.googlecode.com/svn/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:e43f72e12c88abece79a87b8c9ad232e1b773d18" )
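The swh/indexer/tests/test_origin_head.py additions above exercise the other main change in this release: for non-FTP origins, get_head_swhid() no longer fetches every snapshot branch up front, but peeks at the first 100 branches and resolves further branch names lazily. As a readability aid, here is a minimal sketch of that lookup pattern; it is a simplified restatement of _get_branch() from the origin_head.py hunk, not the exact code.

```python
from typing import Optional

from swh.model.model import SnapshotBranch
from swh.storage.interface import PartialBranches, StorageInterface


def lookup_branch(
    storage: StorageInterface,
    partial_branches: PartialBranches,
    branch_name: bytes,
) -> Optional[SnapshotBranch]:
    if branch_name in partial_branches["branches"]:
        # branch was in the first page (branches_count=100) already fetched
        return partial_branches["branches"][branch_name]
    if partial_branches["next_branch"] is not None:
        # the first page was truncated: query the storage for this one branch
        res = storage.snapshot_get_branches(
            partial_branches["id"], branches_from=branch_name, branches_count=1
        )
        return res["branches"].get(branch_name) if res else None
    # the full branch list was already in hand, so the name is a dangling
    # reference (e.g. an alias pointing at a missing branch)
    return None
```

The parametrized tests above place the HEAD alias and its target at varying positions relative to that 100-branch window, covering the in-page, paged, and dangling-alias cases.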