diff --git a/PKG-INFO b/PKG-INFO
index ee4435e..2c3b3bd 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,71 +1,71 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 2.5.0
+Version: 2.6.0
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS
 
 swh-indexer
 ============
 
 Tools to compute multiple indexes on SWH's raw contents:
 - content:
   - mimetype
   - ctags
   - language
   - fossology-license
   - metadata
 - revision:
   - metadata
 
 An indexer is in charge of:
 - looking up objects
 - extracting information from those objects
 - store those information in the swh-indexer db
 
 There are multiple indexers working on different object types:
   - content indexer: works with content sha1 hashes
   - revision indexer: works with revision sha1 hashes
   - origin indexer: works with origin identifiers
 
 Indexation procedure:
 - receive batch of ids
 - retrieve the associated data depending on object type
 - compute for that object some index
 - store the result to swh's storage
 
 Current content indexers:
 
 - mimetype (queue swh_indexer_content_mimetype): detect the encoding
   and mimetype
 
 - language (queue swh_indexer_content_language): detect the
   programming language
 
 - ctags (queue swh_indexer_content_ctags): compute tags information
 
 - fossology-license (queue swh_indexer_fossology_license): compute the
   license
 
 - metadata: translate file into translated_metadata dict
 
 Current revision indexers:
 
 - metadata: detects files containing metadata and retrieves translated_metadata
   in content_metadata table in storage or run content indexer to translate
   files.
diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO
index ee4435e..2c3b3bd 100644
--- a/swh.indexer.egg-info/PKG-INFO
+++ b/swh.indexer.egg-info/PKG-INFO
@@ -1,71 +1,71 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 2.5.0
+Version: 2.6.0
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 Provides-Extra: testing
 License-File: LICENSE
 License-File: AUTHORS
 
 swh-indexer
 ============
 
 Tools to compute multiple indexes on SWH's raw contents:
 - content:
   - mimetype
   - ctags
   - language
   - fossology-license
   - metadata
 - revision:
   - metadata
 
 An indexer is in charge of:
 - looking up objects
 - extracting information from those objects
 - store those information in the swh-indexer db
 
 There are multiple indexers working on different object types:
   - content indexer: works with content sha1 hashes
   - revision indexer: works with revision sha1 hashes
   - origin indexer: works with origin identifiers
 
 Indexation procedure:
 - receive batch of ids
 - retrieve the associated data depending on object type
 - compute for that object some index
 - store the result to swh's storage
 
 Current content indexers:
 
 - mimetype (queue swh_indexer_content_mimetype): detect the encoding
   and mimetype
 
 - language (queue swh_indexer_content_language): detect the
   programming language
 
 - ctags (queue swh_indexer_content_ctags): compute tags information
 
 - fossology-license (queue swh_indexer_fossology_license): compute the
   license
 
 - metadata: translate file into translated_metadata dict
 
 Current revision indexers:
 
 - metadata: detects files containing metadata and retrieves translated_metadata
   in content_metadata table in storage or run content indexer to translate
   files.
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
index 2c7318f..939b4b1 100644
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -1,407 +1,408 @@
 # Copyright (C) 2019-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from typing import Callable, Dict, Iterator, List, Optional
 
 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
 import click
 
 from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup
 from swh.core.cli import swh as swh_cli_group
 
 
 @swh_cli_group.group(
     name="indexer", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup
 )
 @click.option(
     "--config-file",
     "-C",
     default=None,
     type=click.Path(
         exists=True,
         dir_okay=False,
     ),
     help="Configuration file.",
 )
 @click.pass_context
 def indexer_cli_group(ctx, config_file):
     """Software Heritage Indexer tools.
 
     The Indexer is used to mine the content of the archive and extract derived
     information from archive source code artifacts.
 
     """
     from swh.core import config
 
     ctx.ensure_object(dict)
     conf = config.read(config_file)
     ctx.obj["config"] = conf
 
 
 def _get_api(getter, config, config_key, url):
     if url:
         config[config_key] = {"cls": "remote", "url": url}
     elif config_key not in config:
         raise click.ClickException("Missing configuration for {}".format(config_key))
     return getter(**config[config_key])
 
 
 @indexer_cli_group.group("mapping")
 def mapping():
     """Manage Software Heritage Indexer mappings."""
     pass
 
 
 @mapping.command("list")
 def mapping_list():
     """Prints the list of known mappings."""
     from swh.indexer import metadata_dictionary
 
     mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()]
     mapping_names.sort()
     for mapping_name in mapping_names:
         click.echo(mapping_name)
 
 
 @mapping.command("list-terms")
 @click.option(
     "--exclude-mapping", multiple=True, help="Exclude the given mapping from the output"
 )
 @click.option(
     "--concise",
     is_flag=True,
     default=False,
     help="Don't print the list of mappings supporting each term.",
 )
 def mapping_list_terms(concise, exclude_mapping):
     """Prints the list of known CodeMeta terms, and which mappings
     support them."""
     from swh.indexer import metadata_dictionary
 
     properties = metadata_dictionary.list_terms()
     for (property_name, supported_mappings) in sorted(properties.items()):
         supported_mappings = {m.name for m in supported_mappings}
         supported_mappings -= set(exclude_mapping)
         if supported_mappings:
             if concise:
                 click.echo(property_name)
             else:
                 click.echo("{}:".format(property_name))
                 click.echo("\t" + ", ".join(sorted(supported_mappings)))
 
 
 @mapping.command("translate")
 @click.argument("mapping-name")
 @click.argument("file", type=click.File("rb"))
 def mapping_translate(mapping_name, file):
     """Translates file from mapping-name to codemeta format."""
     import json
 
     from swh.indexer import metadata_dictionary
 
     mapping_cls = [
         cls for cls in metadata_dictionary.MAPPINGS.values() if cls.name == mapping_name
     ]
     if not mapping_cls:
         raise click.ClickException("Unknown mapping {}".format(mapping_name))
     assert len(mapping_cls) == 1
     mapping_cls = mapping_cls[0]
     mapping = mapping_cls()
     codemeta_doc = mapping.translate(file.read())
     click.echo(json.dumps(codemeta_doc, indent=4))
 
 
 @indexer_cli_group.group("schedule")
 @click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API")
 @click.option(
     "--indexer-storage-url", "-i", default=None, help="URL of the indexer storage API"
 )
 @click.option(
     "--storage-url", "-g", default=None, help="URL of the (graph) storage API"
 )
 @click.option(
     "--dry-run/--no-dry-run",
     is_flag=True,
     default=False,
     help="List only what would be scheduled.",
 )
 @click.pass_context
 def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run):
     """Manipulate Software Heritage Indexer tasks.
 
     Via SWH Scheduler's API."""
     from swh.indexer.storage import get_indexer_storage
     from swh.scheduler import get_scheduler
     from swh.storage import get_storage
 
     ctx.obj["indexer_storage"] = _get_api(
         get_indexer_storage, ctx.obj["config"], "indexer_storage", indexer_storage_url
     )
     ctx.obj["storage"] = _get_api(
         get_storage, ctx.obj["config"], "storage", storage_url
     )
     ctx.obj["scheduler"] = _get_api(
         get_scheduler, ctx.obj["config"], "scheduler", scheduler_url
     )
     if dry_run:
         ctx.obj["scheduler"] = None
 
 
 def list_origins_by_producer(idx_storage, mappings, tool_ids) -> Iterator[str]:
     next_page_token = ""
     limit = 10000
     while next_page_token is not None:
         result = idx_storage.origin_intrinsic_metadata_search_by_producer(
             page_token=next_page_token,
             limit=limit,
             ids_only=True,
             mappings=mappings or None,
             tool_ids=tool_ids or None,
         )
         next_page_token = result.next_page_token
         yield from result.results
 
 
 @schedule.command("reindex_origin_metadata")
 @click.option(
     "--batch-size",
     "-b",
     "origin_batch_size",
     default=10,
     show_default=True,
     type=int,
     help="Number of origins per task",
 )
 @click.option(
     "--tool-id",
     "-t",
     "tool_ids",
     type=int,
     multiple=True,
     help="Restrict search of old metadata to this/these tool ids.",
 )
 @click.option(
     "--mapping",
     "-m",
     "mappings",
     multiple=True,
     help="Mapping(s) that should be re-scheduled (eg. 'npm', 'gemspec', 'maven')",
 )
 @click.option(
     "--task-type",
     default="index-origin-metadata",
     show_default=True,
     help="Name of the task type to schedule.",
 )
 @click.pass_context
 def schedule_origin_metadata_reindex(
     ctx, origin_batch_size, tool_ids, mappings, task_type
 ):
     """Schedules indexing tasks for origins that were already indexed."""
     from swh.scheduler.cli_utils import schedule_origin_batches
 
     idx_storage = ctx.obj["indexer_storage"]
     scheduler = ctx.obj["scheduler"]
 
     origins = list_origins_by_producer(idx_storage, mappings, tool_ids)
 
     kwargs = {"retries_left": 1}
     schedule_origin_batches(scheduler, task_type, origins, origin_batch_size, kwargs)
 
 
 @indexer_cli_group.command("journal-client")
 @click.argument(
     "indexer",
     type=click.Choice(
         [
             "origin_intrinsic_metadata",
             "extrinsic_metadata",
             "content_mimetype",
             "content_fossology_license",
             "*",
         ]
     ),
     required=False
     # TODO: remove required=False after we stop using it
 )
 @click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API")
 @click.option(
     "--origin-metadata-task-type",
     default="index-origin-metadata",
     help="Name of the task running the origin metadata indexer.",
 )
 @click.option(
     "--broker", "brokers", type=str, multiple=True, help="Kafka broker to connect to."
 )
 @click.option(
     "--prefix", type=str, default=None, help="Prefix of Kafka topic names to read from."
 )
 @click.option("--group-id", type=str, help="Consumer/group id for reading from Kafka.")
 @click.option(
     "--stop-after-objects",
     "-m",
     default=None,
     type=int,
     help="Maximum number of objects to replay. Default is to run forever.",
 )
 @click.option(
     "--batch-size",
     "-b",
     default=None,
     type=int,
     help="Batch size. Default is 200.",
 )
 @click.pass_context
 def journal_client(
     ctx,
     indexer: Optional[str],
     scheduler_url: str,
     origin_metadata_task_type: str,
     brokers: List[str],
     prefix: str,
     group_id: str,
     stop_after_objects: Optional[int],
     batch_size: Optional[int],
 ):
     """
     Listens for new objects from the SWH Journal, and either:
 
     * runs the indexer with the name passed as argument, if any
     * schedules tasks to run relevant indexers (currently, only
       origin_intrinsic_metadata) on these new objects otherwise.
 
     Passing '*' as indexer name runs all indexers.
     """
     import functools
     import warnings
 
     from swh.indexer.indexer import BaseIndexer, ObjectsDict
     from swh.indexer.journal_client import process_journal_objects
     from swh.journal.client import get_journal_client
     from swh.scheduler import get_scheduler
 
     cfg = ctx.obj["config"]
     journal_cfg = cfg.get("journal", {})
 
     scheduler = _get_api(get_scheduler, cfg, "scheduler", scheduler_url)
 
-    brokers = brokers or journal_cfg.get("brokers")
-    if not brokers:
+    if brokers:
+        journal_cfg["brokers"] = brokers
+    if not journal_cfg.get("brokers"):
         raise ValueError("The brokers configuration is mandatory.")
 
-    prefix = prefix or journal_cfg.get("prefix")
-    group_id = group_id or journal_cfg.get("group_id")
+    if prefix:
+        journal_cfg["prefix"] = prefix
+    if group_id:
+        journal_cfg["group_id"] = group_id
     origin_metadata_task_type = origin_metadata_task_type or journal_cfg.get(
         "origin_metadata_task_type"
     )
-    stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects")
-    batch_size = batch_size or journal_cfg.get("batch_size", 200)
+    if stop_after_objects:
+        journal_cfg["stop_after_objects"] = stop_after_objects
+    if batch_size:
+        journal_cfg["batch_size"] = batch_size
 
     object_types = set()
     worker_fns: List[Callable[[ObjectsDict], Dict]] = []
 
     if indexer is None:
         warnings.warn(
             "'swh indexer journal-client' with no argument creates scheduler tasks "
             "to index, rather than index directly.",
             DeprecationWarning,
         )
         object_types.add("origin_visit_status")
         worker_fns.append(
             functools.partial(
                 process_journal_objects,
                 scheduler=scheduler,
                 task_names={
                     "origin_metadata": origin_metadata_task_type,
                 },
             )
         )
 
     idx: Optional[BaseIndexer] = None
 
     if indexer in ("origin_intrinsic_metadata", "*"):
         from swh.indexer.metadata import OriginMetadataIndexer
 
         object_types.add("origin_visit_status")
         idx = OriginMetadataIndexer()
         idx.catch_exceptions = False  # don't commit offsets if indexation failed
         worker_fns.append(idx.process_journal_objects)
 
     if indexer in ("extrinsic_metadata", "*"):
         from swh.indexer.metadata import ExtrinsicMetadataIndexer
 
         object_types.add("raw_extrinsic_metadata")
         idx = ExtrinsicMetadataIndexer()
         idx.catch_exceptions = False  # don't commit offsets if indexation failed
         worker_fns.append(idx.process_journal_objects)
 
     if indexer in ("content_mimetype", "*"):
         from swh.indexer.mimetype import MimetypeIndexer
 
         object_types.add("content")
         idx = MimetypeIndexer()
         idx.catch_exceptions = False  # don't commit offsets if indexation failed
         worker_fns.append(idx.process_journal_objects)
 
     if indexer in ("content_fossology_license", "*"):
         from swh.indexer.fossology_license import FossologyLicenseIndexer
 
         object_types.add("content")
         idx = FossologyLicenseIndexer()
         idx.catch_exceptions = False  # don't commit offsets if indexation failed
         worker_fns.append(idx.process_journal_objects)
 
     if not worker_fns:
         raise click.ClickException(f"Unknown indexer: {indexer}")
 
     client = get_journal_client(
         cls="kafka",
-        brokers=brokers,
-        prefix=prefix,
-        group_id=group_id,
         object_types=list(object_types),
-        stop_after_objects=stop_after_objects,
-        batch_size=batch_size,
+        **journal_cfg,
     )
 
     def worker_fn(objects: ObjectsDict):
         for fn in worker_fns:
             fn(objects)
 
     try:
         client.process(worker_fn)
     except KeyboardInterrupt:
         ctx.exit(0)
     else:
         print("Done.")
     finally:
         client.close()
 
 
 @indexer_cli_group.command("rpc-serve")
 @click.argument("config-path", required=True)
 @click.option("--host", default="0.0.0.0", help="Host to run the server")
 @click.option("--port", default=5007, type=click.INT, help="Binding port of the server")
 @click.option(
     "--debug/--nodebug",
     default=True,
     help="Indicates if the server should run in debug mode",
 )
 def rpc_server(config_path, host, port, debug):
     """Starts a Software Heritage Indexer RPC HTTP server."""
     from swh.indexer.storage.api.server import app, load_and_check_config
 
     api_cfg = load_and_check_config(config_path, type="any")
     app.config.update(api_cfg)
     app.run(host, port=int(port), debug=bool(debug))
 
 
 def main():
     return indexer_cli_group(auto_envvar_prefix="SWH_INDEXER")
 
 
 if __name__ == "__main__":
     main()
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 566ab98..d9b3eb3 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,545 +1,561 @@
 # Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from copy import deepcopy
+import itertools
+import logging
+import time
 from typing import (
     Any,
     Callable,
     Dict,
     Iterable,
     Iterator,
     List,
     Optional,
     Tuple,
     TypeVar,
     cast,
 )
 from urllib.parse import urlparse
 
 import sentry_sdk
 
 from swh.core.config import merge_configs
 from swh.core.utils import grouper
 from swh.indexer.codemeta import merge_documents
 from swh.indexer.indexer import (
     BaseIndexer,
     ContentIndexer,
     DirectoryIndexer,
     ObjectsDict,
     OriginIndexer,
 )
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS
 from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
 from swh.indexer.origin_head import get_head_swhid
 from swh.indexer.storage import INDEXER_CFG_KEY, Sha1
 from swh.indexer.storage.model import (
     ContentMetadataRow,
     DirectoryIntrinsicMetadataRow,
     OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from swh.model import hashutil
 from swh.model.model import Directory, MetadataAuthorityType
 from swh.model.model import ObjectType as ModelObjectType
 from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git
 from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType
 
 REVISION_GET_BATCH_SIZE = 10
 RELEASE_GET_BATCH_SIZE = 10
 ORIGIN_GET_BATCH_SIZE = 10
 
 
 T1 = TypeVar("T1")
 T2 = TypeVar("T2")
 
+logger = logging.getLogger(__name__)
+
 
 def call_with_batches(
     f: Callable[[List[T1]], Iterable[T2]],
     args: List[T1],
     batch_size: int,
 ) -> Iterator[T2]:
     """Calls a function with batches of args, and concatenates the results."""
     groups = grouper(args, batch_size)
     for group in groups:
         yield from f(list(group))
 
 
 class ExtrinsicMetadataIndexer(
     BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow]
 ):
     def process_journal_objects(self, objects: ObjectsDict) -> Dict:
         summary: Dict[str, Any] = {"status": "uneventful"}
         try:
-            results = []
+            results = {}
             for item in objects.get("raw_extrinsic_metadata", []):
                 remd = RawExtrinsicMetadata.from_dict(item)
-                sentry_sdk.set_tag("swh-indexer-remd-swhid", remd.swhid())
-                results.extend(self.index(remd.id, data=remd))
+                sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid()))
+                results[remd.target] = self.index(remd.id, data=remd)
         except Exception:
             if not self.catch_exceptions:
                 raise
             summary["status"] = "failed"
             return summary
 
-        summary_persist = self.persist_index_computations(results)
-        self.results = results
+        self.results = list(itertools.chain.from_iterable(results.values()))
+        summary_persist = self.persist_index_computations(self.results)
         if summary_persist:
             for value in summary_persist.values():
                 if value > 0:
                     summary["status"] = "eventful"
             summary.update(summary_persist)
         return summary
 
     def index(
         self,
         id: Sha1Git,
         data: Optional[RawExtrinsicMetadata],
         **kwargs,
     ) -> List[OriginExtrinsicMetadataRow]:
         if data is None:
             raise NotImplementedError(
                 "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data"
             )
         if data.target.object_type != ExtendedObjectType.ORIGIN:
             # other types are not supported yet
             return []
 
         if data.authority.type != MetadataAuthorityType.FORGE:
             # metadata provided by a third-party; don't trust it
             # (technically this could be handled below, but we check it here
             # to return early; sparing a translation and origin lookup)
             # TODO: add ways to define trusted authorities
             return []
 
         metadata_items = []
         mappings: List[str] = []
         for mapping_cls in EXTRINSIC_MAPPINGS.values():
             if data.format in mapping_cls.extrinsic_metadata_formats():
                 mapping = mapping_cls()
                 metadata_item = mapping.translate(data.metadata)
                 if metadata_item is not None:
                     metadata_items.append(metadata_item)
                     mappings.append(mapping.name)
 
         if not metadata_items:
             # Don't have any mapping to parse it, ignore
             return []
 
         # TODO: batch requests to origin_get_by_sha1()
-        origins = self.storage.origin_get_by_sha1([data.target.object_id])
-        try:
-            (origin,) = origins
-            if origin is None:
-                raise ValueError()
-        except ValueError:
+        for _ in range(6):
+            origins = self.storage.origin_get_by_sha1([data.target.object_id])
+            try:
+                (origin,) = origins
+                if origin is not None:
+                    break
+            except ValueError:
+                pass
+            # The origin does not exist. This may be due to some replication lag
+            # between the loader's DB/journal and the DB we are consuming from.
+            # Wait a bit and try again
+            logger.debug("Origin %s not found, sleeping for 10s.", data.target)
+            time.sleep(10)
+        else:
+            # Does not exist, or replication lag > 60s.
             raise ValueError(f"Unknown origin {data.target}") from None
 
         if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc:
             # metadata provided by a third-party; don't trust it
             # TODO: add ways to define trusted authorities
             return []
 
         metadata = merge_documents(metadata_items)
 
         return [
             OriginExtrinsicMetadataRow(
                 id=origin["url"],
                 indexer_configuration_id=self.tool["id"],
                 from_remd_id=data.id,
                 mappings=mappings,
                 metadata=metadata,
             )
         ]
 
     def persist_index_computations(
         self, results: List[OriginExtrinsicMetadataRow]
     ) -> Dict[str, int]:
         """Persist the results in storage."""
         return self.idx_storage.origin_extrinsic_metadata_add(results)
 
 
 class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
     """Content-level indexer
 
     This indexer is in charge of:
 
     - filtering out content already indexed in content_metadata
     - reading content from objstorage with the content's id sha1
     - computing metadata by given context
     - using the metadata_dictionary as the 'swh-metadata-translator' tool
     - store result in content_metadata table
 
     """
 
     def filter(self, ids):
         """Filter out known sha1s and return only missing ones."""
         yield from self.idx_storage.content_metadata_missing(
             (
                 {
                     "id": sha1,
                     "indexer_configuration_id": self.tool["id"],
                 }
                 for sha1 in ids
             )
         )
 
     def index(
         self,
         id: Sha1,
         data: Optional[bytes] = None,
         log_suffix="unknown directory",
         **kwargs,
     ) -> List[ContentMetadataRow]:
         """Index sha1s' content and store result.
 
         Args:
             id: content's identifier
             data: raw content in bytes
 
         Returns:
             dict: dictionary representing a content_metadata. If the
             translation wasn't successful the metadata keys will
             be returned as None
 
         """
         assert isinstance(id, bytes)
         assert data is not None
         metadata = None
         try:
             mapping_name = self.tool["tool_configuration"]["context"]
             log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id)
             metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data)
         except Exception:
             self.log.exception(
                 "Problem during metadata translation "
                 "for content %s" % hashutil.hash_to_hex(id)
             )
             sentry_sdk.capture_exception()
         if metadata is None:
             return []
         return [
             ContentMetadataRow(
                 id=id,
                 indexer_configuration_id=self.tool["id"],
                 metadata=metadata,
             )
         ]
 
     def persist_index_computations(
         self, results: List[ContentMetadataRow]
     ) -> Dict[str, int]:
         """Persist the results in storage."""
         return self.idx_storage.content_metadata_add(results)
 
 
 DEFAULT_CONFIG: Dict[str, Any] = {
     "tools": {
         "name": "swh-metadata-detector",
         "version": "0.0.2",
         "configuration": {},
     },
 }
 
 
 class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]):
     """Directory-level indexer
 
     This indexer is in charge of:
 
     - filtering directories already indexed in directory_intrinsic_metadata table
       with defined computation tool
     - retrieve all entry_files in directory
     - use metadata_detector for file_names containing metadata
     - compute metadata translation if necessary and possible (depends on tool)
     - send sha1s to content indexing if possible
     - store the results for directory
 
     """
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.config = merge_configs(DEFAULT_CONFIG, self.config)
 
     def filter(self, sha1_gits):
         """Filter out known sha1s and return only missing ones."""
         yield from self.idx_storage.directory_intrinsic_metadata_missing(
             (
                 {
                     "id": sha1_git,
                     "indexer_configuration_id": self.tool["id"],
                 }
                 for sha1_git in sha1_gits
             )
         )
 
     def index(
         self, id: Sha1Git, data: Optional[Directory] = None, **kwargs
     ) -> List[DirectoryIntrinsicMetadataRow]:
         """Index directory by processing it and organizing result.
 
         use metadata_detector to iterate on filenames, passes them to the content
         indexers, then merges (if more than one)
 
         Args:
           id: sha1_git of the directory
           data: should always be None
 
         Returns:
             dict: dictionary representing a directory_intrinsic_metadata, with
             keys:
 
             - id: directory's identifier (sha1_git)
             - indexer_configuration_id (bytes): tool used
             - metadata: dict of retrieved metadata
 
         """
         dir_: List[DirectoryLsEntry]
         assert data is None, "Unexpected directory object"
         dir_ = cast(
             List[DirectoryLsEntry],
             list(self.storage.directory_ls(id, recursive=False)),
         )
 
         try:
             if [entry["type"] for entry in dir_] == ["dir"]:
                 # If the root is just a single directory, recurse into it
                 # eg. PyPI packages, GNU tarballs
                 subdir = dir_[0]["target"]
                 dir_ = cast(
                     List[DirectoryLsEntry],
                     list(self.storage.directory_ls(subdir, recursive=False)),
                 )
             files = [entry for entry in dir_ if entry["type"] == "file"]
             (mappings, metadata) = self.translate_directory_intrinsic_metadata(
                 files,
                 log_suffix="directory=%s" % hashutil.hash_to_hex(id),
             )
         except Exception as e:
             self.log.exception("Problem when indexing dir: %r", e)
             sentry_sdk.capture_exception()
             return []
         return [
             DirectoryIntrinsicMetadataRow(
                 id=id,
                 indexer_configuration_id=self.tool["id"],
                 mappings=mappings,
                 metadata=metadata,
             )
         ]
 
     def persist_index_computations(
         self, results: List[DirectoryIntrinsicMetadataRow]
     ) -> Dict[str, int]:
         """Persist the results in storage."""
         # TODO: add functions in storage to keep data in
         # directory_intrinsic_metadata
         return self.idx_storage.directory_intrinsic_metadata_add(results)
 
     def translate_directory_intrinsic_metadata(
         self, files: List[DirectoryLsEntry], log_suffix: str
     ) -> Tuple[List[Any], Any]:
         """
         Determine plan of action to translate metadata in the given root directory
 
         Args:
             files: list of file entries, as returned by
               :meth:`swh.storage.interface.StorageInterface.directory_ls`
 
         Returns:
             (List[str], dict): list of mappings used and dict with
             translated metadata according to the CodeMeta vocabulary
 
         """
         metadata = []
         tool = {
             "name": "swh-metadata-translator",
             "version": "0.0.2",
             "configuration": {},
         }
         # TODO: iterate on each context, on each file
         # -> get raw_contents
         # -> translate each content
         config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]}
         config["tools"] = [tool]
         all_detected_files = detect_metadata(files)
         used_mappings = [
             INTRINSIC_MAPPINGS[context].name for context in all_detected_files
         ]
         for (mapping_name, detected_files) in all_detected_files.items():
             cfg = deepcopy(config)
             cfg["tools"][0]["configuration"]["context"] = mapping_name
             c_metadata_indexer = ContentMetadataIndexer(config=cfg)
             # sha1s that are in content_metadata table
             sha1s_in_storage = []
             metadata_generator = self.idx_storage.content_metadata_get(detected_files)
             for c in metadata_generator:
                 # extracting metadata
                 sha1 = c.id
                 sha1s_in_storage.append(sha1)
                 local_metadata = c.metadata
                 # local metadata is aggregated
                 if local_metadata:
                     metadata.append(local_metadata)
 
             sha1s_filtered = [
                 item for item in detected_files if item not in sha1s_in_storage
             ]
 
             if sha1s_filtered:
                 # content indexing
                 try:
                     c_metadata_indexer.run(
                         sha1s_filtered,
                         log_suffix=log_suffix,
                     )
                     # on the fly possibility:
                     for result in c_metadata_indexer.results:
                         local_metadata = result.metadata
                         metadata.append(local_metadata)
 
                 except Exception:
                     self.log.exception("Exception while indexing metadata on contents")
                     sentry_sdk.capture_exception()
 
         metadata = merge_documents(metadata)
         return (used_mappings, metadata)
 
 
 class OriginMetadataIndexer(
     OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]
 ):
     USE_TOOLS = False
 
     def __init__(self, config=None, **kwargs) -> None:
         super().__init__(config=config, **kwargs)
         self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config)
 
     def index_list(
         self,
         origins: List[Origin],
         *,
         check_origin_known: bool = True,
         **kwargs,
     ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]:
         head_rev_ids = []
         head_rel_ids = []
         origin_heads: Dict[Origin, CoreSWHID] = {}
 
         # Filter out origins not in the storage
         if check_origin_known:
             known_origins = list(
                 call_with_batches(
                     self.storage.origin_get,
                     [origin.url for origin in origins],
                     ORIGIN_GET_BATCH_SIZE,
                 )
             )
         else:
             known_origins = list(origins)
 
         for origin in known_origins:
             if origin is None:
                 continue
             head_swhid = get_head_swhid(self.storage, origin.url)
             if head_swhid:
                 origin_heads[origin] = head_swhid
                 if head_swhid.object_type == ObjectType.REVISION:
                     head_rev_ids.append(head_swhid.object_id)
                 elif head_swhid.object_type == ObjectType.RELEASE:
                     head_rel_ids.append(head_swhid.object_id)
                 else:
                     assert False, head_swhid
 
         head_revs = dict(
             zip(
                 head_rev_ids,
                 call_with_batches(
                     self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE
                 ),
             )
         )
         head_rels = dict(
             zip(
                 head_rel_ids,
                 call_with_batches(
                     self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE
                 ),
             )
         )
 
         results = []
         for (origin, head_swhid) in origin_heads.items():
             sentry_sdk.set_tag("swh-indexer-origin-url", origin.url)
             sentry_sdk.set_tag("swh-indexer-origin-head-swhid", str(head_swhid))
             if head_swhid.object_type == ObjectType.REVISION:
                 rev = head_revs[head_swhid.object_id]
                 if not rev:
                     self.log.warning(
                         "Missing head object %s of origin %r", head_swhid, origin.url
                     )
                     continue
                 directory_id = rev.directory
             elif head_swhid.object_type == ObjectType.RELEASE:
                 rel = head_rels[head_swhid.object_id]
                 if not rel:
                     self.log.warning(
                         "Missing head object %s of origin %r", head_swhid, origin.url
                     )
                     continue
                 if rel.target_type != ModelObjectType.DIRECTORY:
                     # TODO
                     self.log.warning(
                         "Head release %s of %r has unexpected target type %s",
                         head_swhid,
                         origin.url,
                         rel.target_type,
                     )
                     continue
                 assert rel.target, rel
                 directory_id = rel.target
             else:
                 assert False, head_swhid
 
             for dir_metadata in self.directory_metadata_indexer.index(directory_id):
                 # There is at most one dir_metadata
                 orig_metadata = OriginIntrinsicMetadataRow(
                     from_directory=dir_metadata.id,
                     id=origin.url,
                     metadata=dir_metadata.metadata,
                     mappings=dir_metadata.mappings,
                     indexer_configuration_id=dir_metadata.indexer_configuration_id,
                 )
                 results.append((orig_metadata, dir_metadata))
 
         return results
 
     def persist_index_computations(
         self,
         results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
     ) -> Dict[str, int]:
         # Deduplicate directories
-        dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
-        orig_metadata: List[OriginIntrinsicMetadataRow] = []
+        dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {}
+        orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {}
         summary: Dict = {}
         for (orig_item, dir_item) in results:
             assert dir_item.metadata == orig_item.metadata
             if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
                 # Only store non-empty metadata sets
-                if dir_item not in dir_metadata:
-                    dir_metadata.append(dir_item)
-                if orig_item not in orig_metadata:
-                    orig_metadata.append(orig_item)
+                if dir_item.id not in dir_metadata:
+                    dir_metadata[dir_item.id] = dir_item
+                if orig_item.id not in orig_metadata:
+                    orig_metadata[orig_item.id] = orig_item
 
         if dir_metadata:
             summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
-                dir_metadata
+                list(dir_metadata.values())
             )
             summary.update(summary_dir)
         if orig_metadata:
-            summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
+            summary_ori = self.idx_storage.origin_intrinsic_metadata_add(
+                list(orig_metadata.values())
+            )
             summary.update(summary_ori)
 
         return summary
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
index 418c2ec..f6253d7 100644
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -1,348 +1,371 @@
 # Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import logging
 from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
+import urllib.parse
 import uuid
 import xml.parsers.expat
 
 from pyld import jsonld
 import rdflib
 from typing_extensions import TypedDict
 import xmltodict
 import yaml
 
 from swh.indexer.codemeta import _document_loader, compact
 from swh.indexer.namespaces import RDF, SCHEMA
 from swh.indexer.storage.interface import Sha1
 
 
 class DirectoryLsEntry(TypedDict):
     target: Sha1
     sha1: Sha1
     name: bytes
     type: str
 
 
 TTranslateCallable = TypeVar(
     "TTranslateCallable",
     bound=Callable[[Any, rdflib.Graph, rdflib.term.BNode, Any], None],
 )
 
 
 def produce_terms(*uris: str) -> Callable[[TTranslateCallable], TTranslateCallable]:
     """Returns a decorator that marks the decorated function as adding
     the given terms to the ``translated_metadata`` dict"""
 
     def decorator(f: TTranslateCallable) -> TTranslateCallable:
         if not hasattr(f, "produced_terms"):
             f.produced_terms = []  # type: ignore
         f.produced_terms.extend(uris)  # type: ignore
         return f
 
     return decorator
 
 
 class BaseMapping:
     """Base class for :class:`BaseExtrinsicMapping` and :class:`BaseIntrinsicMapping`,
     not to be inherited directly."""
 
     def __init__(self, log_suffix=""):
         self.log_suffix = log_suffix
         self.log = logging.getLogger(
             "%s.%s" % (self.__class__.__module__, self.__class__.__name__)
         )
 
     @property
     def name(self):
         """A name of this mapping, used as an identifier in the
         indexer storage."""
         raise NotImplementedError(f"{self.__class__.__name__}.name")
 
     def translate(self, raw_content: bytes) -> Optional[Dict]:
         """
         Translates content by parsing content from a bytestring containing
         mapping-specific data and translating with the appropriate mapping
         to JSON-LD using the Codemeta and ForgeFed vocabularies.
 
         Args:
             raw_content: raw content to translate
 
         Returns:
             translated metadata in JSON friendly form needed for the content
             if parseable, :const:`None` otherwise.
 
         """
         raise NotImplementedError(f"{self.__class__.__name__}.translate")
 
     def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
         raise NotImplementedError(f"{self.__class__.__name__}.normalize_translation")
 
 
 class BaseExtrinsicMapping(BaseMapping):
     """Base class for extrinsic_metadata mappings to inherit from
 
     To implement a new mapping:
 
     - inherit this class
     - override translate function
     """
 
     @classmethod
     def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
         """
         Returns the list of extrinsic metadata formats which can be translated
         by this mapping
         """
         raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats")
 
     def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
         return compact(metadata, forgefed=True)
 
 
 class BaseIntrinsicMapping(BaseMapping):
     """Base class for intrinsic-metadata mappings to inherit from
 
     To implement a new mapping:
 
     - inherit this class
     - override translate function
     """
 
     @classmethod
     def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
         """
         Returns the sha1 hashes of files which can be translated by this mapping
         """
         raise NotImplementedError(f"{cls.__name__}.detect_metadata_files")
 
     def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
         return compact(metadata, forgefed=False)
 
 
 class SingleFileIntrinsicMapping(BaseIntrinsicMapping):
     """Base class for all intrinsic metadata mappings that use a single file as input."""
 
     @property
     def filename(self):
         """The .json file to extract metadata from."""
         raise NotImplementedError(f"{self.__class__.__name__}.filename")
 
     @classmethod
     def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
         for entry in file_entries:
             if entry["name"].lower() == cls.filename:
                 return [entry["sha1"]]
         return []
 
 
 class DictMapping(BaseMapping):
     """Base class for mappings that take as input a file that is mostly
     a key-value store (eg. a shallow JSON dict)."""
 
     string_fields: List[str] = []
     """List of fields that are simple strings, and don't need any
     normalization."""
 
     uri_fields: List[str] = []
     """List of fields that are simple URIs, and don't need any
     normalization."""
 
     @property
     def mapping(self):
         """A translation dict to map dict keys into a canonical name."""
         raise NotImplementedError(f"{self.__class__.__name__}.mapping")
 
     @staticmethod
     def _normalize_method_name(name: str) -> str:
         return name.replace("-", "_")
 
     @classmethod
     def supported_terms(cls):
         # one-to-one mapping from the original key to a CodeMeta term
         simple_terms = {
             str(term)
             for (key, term) in cls.mapping.items()
             if key in cls.string_fields + cls.uri_fields
             or hasattr(cls, "normalize_" + cls._normalize_method_name(key))
         }
 
         # more complex mapping from the original key to JSON-LD
         complex_terms = {
             str(term)
             for meth_name in dir(cls)
             if meth_name.startswith("translate_")
             for term in getattr(getattr(cls, meth_name), "produced_terms", [])
         }
 
         return simple_terms | complex_terms
 
     def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]:
         """
         Translates content  by parsing content from a dict object
         and translating with the appropriate mapping
 
         Args:
             content_dict (dict): content dict to translate
 
         Returns:
             dict: translated metadata in json-friendly form needed for
             the indexer
 
         """
         graph = rdflib.Graph()
 
         # The main object being described (the SoftwareSourceCode) does not necessarily
         # may or may not have an id.
         # Either way, we temporarily use this URI to identify it. Unfortunately,
         # we cannot use a blank node as we need to use it for JSON-LD framing later,
         # and blank nodes cannot be used for framing in JSON-LD >= 1.1
         root_id = (
             "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/"
             + str(uuid.uuid4())
         )
         root = rdflib.URIRef(root_id)
         graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode))
 
         for k, v in content_dict.items():
             # First, check if there is a specific translation
             # method for this key
             translation_method = getattr(
                 self, "translate_" + self._normalize_method_name(k), None
             )
             if translation_method:
                 translation_method(graph, root, v)
             elif k in self.mapping:
                 # if there is no method, but the key is known from the
                 # crosswalk table
                 codemeta_key = self.mapping[k]
 
                 # if there is a normalization method, use it on the value,
                 # and add its results to the triples
                 normalization_method = getattr(
                     self, "normalize_" + self._normalize_method_name(k), None
                 )
                 if normalization_method:
                     v = normalization_method(v)
                     if v is None:
                         pass
                     elif isinstance(v, list):
                         for item in reversed(v):
                             graph.add((root, codemeta_key, item))
                     else:
                         graph.add((root, codemeta_key, v))
                 elif k in self.string_fields and isinstance(v, str):
                     graph.add((root, codemeta_key, rdflib.Literal(v)))
                 elif k in self.string_fields and isinstance(v, list):
                     for item in v:
                         graph.add((root, codemeta_key, rdflib.Literal(item)))
                 elif k in self.uri_fields and isinstance(v, str):
-                    graph.add((root, codemeta_key, rdflib.URIRef(v)))
+                    # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
+                    # URLs that are blatantly invalid early, so PyLD does not crash.
+                    parsed_url = urllib.parse.urlparse(v)
+                    if parsed_url.netloc:
+                        graph.add((root, codemeta_key, rdflib.URIRef(v)))
                 elif k in self.uri_fields and isinstance(v, list):
                     for item in v:
                         if isinstance(item, str):
-                            graph.add((root, codemeta_key, rdflib.URIRef(item)))
+                            # ditto
+                            parsed_url = urllib.parse.urlparse(item)
+                            if parsed_url.netloc:
+                                graph.add((root, codemeta_key, rdflib.URIRef(item)))
                 else:
                     continue
 
         self.extra_translation(graph, root, content_dict)
 
+        self.sanitize(graph)
+
         # Convert from rdflib's internal graph representation to JSON
         s = graph.serialize(format="application/ld+json")
 
         # Load from JSON to a list of Python objects
         jsonld_graph = json.loads(s)
 
         # Use JSON-LD framing to turn the graph into a rooted tree
         # frame = {"@type": str(SCHEMA.SoftwareSourceCode)}
         translated_metadata = jsonld.frame(
             jsonld_graph,
             {"@id": root_id},
             options={
                 "documentLoader": _document_loader,
                 "processingMode": "json-ld-1.1",
             },
         )
 
         # Remove the temporary id we added at the beginning
         if isinstance(translated_metadata["@id"], list):
             translated_metadata["@id"].remove(root_id)
         else:
             del translated_metadata["@id"]
 
         return self.normalize_translation(translated_metadata)
 
+    def sanitize(self, graph: rdflib.Graph) -> None:
+        # Remove triples that make PyLD crash
+        for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))):
+            graph.remove((subject, predicate, rdflib.URIRef("")))
+
+        # Should not happen, but we's better check as this may lead to incorrect data
+        invalid = False
+        for triple in graph.triples((rdflib.URIRef(""), None, None)):
+            invalid = True
+            logging.error("Empty triple subject URI: %r", triple)
+        if invalid:
+            raise ValueError("Empty triple subject(s)")
+
     def extra_translation(
         self, graph: rdflib.Graph, root: rdflib.term.Node, d: Dict[str, Any]
-    ):
+    ) -> None:
         """Called at the end of the translation process, and may add arbitrary triples
         to ``graph`` based on the input dictionary (passed as ``d``).
         """
         pass
 
 
 class JsonMapping(DictMapping):
     """Base class for all mappings that use JSON data as input."""
 
     def translate(self, raw_content: bytes) -> Optional[Dict]:
         try:
             raw_content_string: str = raw_content.decode()
         except UnicodeDecodeError:
             self.log.warning("Error unidecoding from %s", self.log_suffix)
             return None
         try:
             content_dict = json.loads(raw_content_string)
         except json.JSONDecodeError:
             self.log.warning("Error unjsoning from %s", self.log_suffix)
             return None
         if isinstance(content_dict, dict):
             return self._translate_dict(content_dict)
         return None
 
 
 class XmlMapping(DictMapping):
     """Base class for all mappings that use XML data as input."""
 
     def translate(self, raw_content: bytes) -> Optional[Dict]:
         try:
             d = xmltodict.parse(raw_content)
         except xml.parsers.expat.ExpatError:
             self.log.warning("Error parsing XML from %s", self.log_suffix)
             return None
         except UnicodeDecodeError:
             self.log.warning("Error unidecoding XML from %s", self.log_suffix)
             return None
         except (LookupError, ValueError):
             # unknown encoding or multi-byte encoding
             self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
             return None
         if not isinstance(d, dict):
             self.log.warning("Skipping ill-formed XML content: %s", raw_content)
             return None
         return self._translate_dict(d)
 
 
 class SafeLoader(yaml.SafeLoader):
     yaml_implicit_resolvers = {
         k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
         for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
     }
 
 
 class YamlMapping(DictMapping, SingleFileIntrinsicMapping):
     """Base class for all mappings that use Yaml data as input."""
 
     def translate(self, raw_content: bytes) -> Optional[Dict[str, str]]:
         raw_content_string: str = raw_content.decode()
         try:
             content_dict = yaml.load(raw_content_string, Loader=SafeLoader)
         except yaml.scanner.ScannerError:
             return None
 
         if isinstance(content_dict, dict):
             return self._translate_dict(content_dict)
 
         return None
diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
index fe3b87e..d8d8702 100644
--- a/swh/indexer/metadata_dictionary/github.py
+++ b/swh/indexer/metadata_dictionary/github.py
@@ -1,113 +1,117 @@
 # Copyright (C) 2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from typing import Any, Tuple
 
 from rdflib import RDF, BNode, Graph, Literal, URIRef
 
 from swh.indexer.codemeta import CROSSWALK_TABLE
 from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA
 
 from .base import BaseExtrinsicMapping, JsonMapping, produce_terms
 from .utils import prettyprint_graph  # noqa
 
 SPDX = URIRef("https://spdx.org/licenses/")
 
 
 class GitHubMapping(BaseExtrinsicMapping, JsonMapping):
     name = "github"
-    mapping = CROSSWALK_TABLE["GitHub"]
+    mapping = {
+        **CROSSWALK_TABLE["GitHub"],
+        "topics": SCHEMA.keywords,  # TODO: submit this to the official crosswalk
+    }
     string_fields = [
         "archive_url",
         "created_at",
         "updated_at",
         "description",
         "full_name",
         "html_url",
         "issues_url",
+        "topics",
     ]
 
     @classmethod
     def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
         return ("application/vnd.github.v3+json",)
 
     def extra_translation(self, graph, root, content_dict):
         graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode))
         graph.add((root, RDF.type, FORGEFED.Repository))
 
     @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems)
     def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None:
         """
 
         >>> graph = Graph()
         >>> root = URIRef("http://example.org/test-software")
         >>> GitHubMapping().translate_forks_count(graph, root, 42)
         >>> prettyprint_graph(graph, root)
         {
             "@id": ...,
             "https://forgefed.org/ns#forks": {
                 "@type": "https://www.w3.org/ns/activitystreams#OrderedCollection",
                 "https://www.w3.org/ns/activitystreams#totalItems": 42
             }
         }
         """
         if isinstance(v, int):
             collection = BNode()
             graph.add((root, FORGEFED.forks, collection))
             graph.add((collection, RDF.type, ACTIVITYSTREAMS.OrderedCollection))
             graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
 
     @produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems)
     def translate_stargazers_count(self, graph: Graph, root: BNode, v: Any) -> None:
         """
 
         >>> graph = Graph()
         >>> root = URIRef("http://example.org/test-software")
         >>> GitHubMapping().translate_stargazers_count(graph, root, 42)
         >>> prettyprint_graph(graph, root)
         {
             "@id": ...,
             "https://www.w3.org/ns/activitystreams#likes": {
                 "@type": "https://www.w3.org/ns/activitystreams#Collection",
                 "https://www.w3.org/ns/activitystreams#totalItems": 42
             }
         }
         """
         if isinstance(v, int):
             collection = BNode()
             graph.add((root, ACTIVITYSTREAMS.likes, collection))
             graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection))
             graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
 
     @produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems)
     def translate_watchers_count(self, graph: Graph, root: BNode, v: Any) -> None:
         """
 
         >>> graph = Graph()
         >>> root = URIRef("http://example.org/test-software")
         >>> GitHubMapping().translate_watchers_count(graph, root, 42)
         >>> prettyprint_graph(graph, root)
         {
             "@id": ...,
             "https://www.w3.org/ns/activitystreams#followers": {
                 "@type": "https://www.w3.org/ns/activitystreams#Collection",
                 "https://www.w3.org/ns/activitystreams#totalItems": 42
             }
         }
         """
         if isinstance(v, int):
             collection = BNode()
             graph.add((root, ACTIVITYSTREAMS.followers, collection))
             graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection))
             graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v)))
 
     def normalize_license(self, d):
         """
 
         >>> GitHubMapping().normalize_license({'spdx_id': 'MIT'})
         rdflib.term.URIRef('https://spdx.org/licenses/MIT')
         """
         if isinstance(d, dict) and isinstance(d.get("spdx_id"), str):
             return SPDX + d["spdx_id"]
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
index a374a5e..8b3e48d 100644
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -1,159 +1,162 @@
 # Copyright (C) 2018-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 from typing import Any, Dict
 
 from rdflib import Graph, Literal, URIRef
 
 from swh.indexer.codemeta import CROSSWALK_TABLE
 from swh.indexer.namespaces import SCHEMA
 
 from .base import SingleFileIntrinsicMapping, XmlMapping
 from .utils import prettyprint_graph  # noqa
 
 
 class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
     """
     dedicated class for Maven (pom.xml) mapping and translation
     """
 
     name = "maven"
     filename = b"pom.xml"
     mapping = CROSSWALK_TABLE["Java (Maven)"]
     string_fields = ["name", "version", "description", "email"]
 
     _default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
 
     def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
         return super()._translate_dict(d.get("project") or {})
 
     def extra_translation(self, graph: Graph, root, d):
         self.parse_repositories(graph, root, d)
 
     def parse_repositories(self, graph: Graph, root, d):
         """https://maven.apache.org/pom.html#Repositories
 
         >>> import rdflib
         >>> import xmltodict
         >>> from pprint import pprint
         >>> d = xmltodict.parse('''
         ... <repositories>
         ...   <repository>
         ...     <id>codehausSnapshots</id>
         ...     <name>Codehaus Snapshots</name>
         ...     <url>http://snapshots.maven.codehaus.org/maven2</url>
         ...     <layout>default</layout>
         ...   </repository>
         ... </repositories>
         ... ''')
         >>> MavenMapping().parse_repositories(rdflib.Graph(), rdflib.BNode(), d)
         """
         repositories = d.get("repositories")
         if not repositories:
             self.parse_repository(graph, root, d, self._default_repository)
         elif isinstance(repositories, dict):
             repositories = repositories.get("repository") or []
             if not isinstance(repositories, list):
                 repositories = [repositories]
             for repo in repositories:
                 self.parse_repository(graph, root, d, repo)
 
     def parse_repository(self, graph: Graph, root, d, repo):
         if not isinstance(repo, dict):
             return
         if repo.get("layout", "default") != "default":
             return  # TODO ?
         url = repo.get("url")
         group_id = d.get("groupId")
         artifact_id = d.get("artifactId")
         if (
             isinstance(url, str)
             and isinstance(group_id, str)
             and isinstance(artifact_id, str)
         ):
             repo = os.path.join(url, *group_id.split("."), artifact_id)
+            if "${" in repo:
+                # Often use as templating in pom.xml files collected from VCSs
+                return
             graph.add((root, SCHEMA.codeRepository, URIRef(repo)))
 
     def normalize_groupId(self, id_):
         """https://maven.apache.org/pom.html#Maven_Coordinates
 
         >>> MavenMapping().normalize_groupId('org.example')
         rdflib.term.Literal('org.example')
         """
         if isinstance(id_, str):
             return Literal(id_)
 
     def translate_licenses(self, graph, root, licenses):
         """https://maven.apache.org/pom.html#Licenses
 
         >>> import xmltodict
         >>> import json
         >>> d = xmltodict.parse('''
         ... <licenses>
         ...   <license>
         ...     <name>Apache License, Version 2.0</name>
         ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
         ...   </license>
         ... </licenses>
         ... ''')
         >>> print(json.dumps(d, indent=4))
         {
             "licenses": {
                 "license": {
                     "name": "Apache License, Version 2.0",
                     "url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
                 }
             }
         }
         >>> graph = Graph()
         >>> root = URIRef("http://example.org/test-software")
         >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
         >>> prettyprint_graph(graph, root)
         {
             "@id": ...,
             "http://schema.org/license": {
                 "@id": "https://www.apache.org/licenses/LICENSE-2.0.txt"
             }
         }
 
         or, if there are more than one license:
 
         >>> import xmltodict
         >>> from pprint import pprint
         >>> d = xmltodict.parse('''
         ... <licenses>
         ...   <license>
         ...     <name>Apache License, Version 2.0</name>
         ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
         ...   </license>
         ...   <license>
         ...     <name>MIT License</name>
         ...     <url>https://opensource.org/licenses/MIT</url>
         ...   </license>
         ... </licenses>
         ... ''')
         >>> graph = Graph()
         >>> root = URIRef("http://example.org/test-software")
         >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
         >>> pprint(set(graph.triples((root, URIRef("http://schema.org/license"), None))))
         {(rdflib.term.URIRef('http://example.org/test-software'),
           rdflib.term.URIRef('http://schema.org/license'),
           rdflib.term.URIRef('https://opensource.org/licenses/MIT')),
          (rdflib.term.URIRef('http://example.org/test-software'),
           rdflib.term.URIRef('http://schema.org/license'),
           rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
         """
 
         if not isinstance(licenses, dict):
             return
         licenses = licenses.get("license")
         if isinstance(licenses, dict):
             licenses = [licenses]
         elif not isinstance(licenses, list):
             return
         for license in licenses:
             if isinstance(license, dict) and isinstance(license.get("url"), str):
                 graph.add((root, SCHEMA.license, URIRef(license["url"])))
diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py
index 1540ef6..f2eaa64 100644
--- a/swh/indexer/metadata_dictionary/npm.py
+++ b/swh/indexer/metadata_dictionary/npm.py
@@ -1,282 +1,292 @@
 # Copyright (C) 2018-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import re
 import urllib.parse
 
 from rdflib import RDF, BNode, Graph, Literal, URIRef
 
 from swh.indexer.codemeta import CROSSWALK_TABLE
 from swh.indexer.namespaces import SCHEMA
 
 from .base import JsonMapping, SingleFileIntrinsicMapping
 from .utils import add_list, prettyprint_graph  # noqa
 
 SPDX = URIRef("https://spdx.org/licenses/")
 
 
 class NpmMapping(JsonMapping, SingleFileIntrinsicMapping):
     """
     dedicated class for NPM (package.json) mapping and translation
     """
 
     name = "npm"
     mapping = CROSSWALK_TABLE["NodeJS"]
     filename = b"package.json"
     string_fields = ["name", "version", "description", "email"]
     uri_fields = ["homepage"]
 
     _schema_shortcuts = {
         "github": "git+https://github.com/%s.git",
         "gist": "git+https://gist.github.com/%s.git",
         "gitlab": "git+https://gitlab.com/%s.git",
         # Bitbucket supports both hg and git, and the shortcut does not
         # tell which one to use.
         # 'bitbucket': 'https://bitbucket.org/',
     }
 
     def normalize_repository(self, d):
         """https://docs.npmjs.com/files/package.json#repository
 
         >>> NpmMapping().normalize_repository({
         ...     'type': 'git',
         ...     'url': 'https://example.org/foo.git'
         ... })
         rdflib.term.URIRef('git+https://example.org/foo.git')
         >>> NpmMapping().normalize_repository(
         ...     'gitlab:foo/bar')
         rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git')
         >>> NpmMapping().normalize_repository(
         ...     'foo/bar')
         rdflib.term.URIRef('git+https://github.com/foo/bar.git')
         """
         if (
             isinstance(d, dict)
             and isinstance(d.get("type"), str)
             and isinstance(d.get("url"), str)
         ):
             url = "{type}+{url}".format(**d)
         elif isinstance(d, str):
             if "://" in d:
                 url = d
             elif ":" in d:
                 (schema, rest) = d.split(":", 1)
                 if schema in self._schema_shortcuts:
                     url = self._schema_shortcuts[schema] % rest
                 else:
                     return None
             else:
                 url = self._schema_shortcuts["github"] % d
 
         else:
             return None
 
         return URIRef(url)
 
     def normalize_bugs(self, d):
         """https://docs.npmjs.com/files/package.json#bugs
 
         >>> NpmMapping().normalize_bugs({
         ...     'url': 'https://example.org/bugs/',
         ...     'email': 'bugs@example.org'
         ... })
         rdflib.term.URIRef('https://example.org/bugs/')
         >>> NpmMapping().normalize_bugs(
         ...     'https://example.org/bugs/')
         rdflib.term.URIRef('https://example.org/bugs/')
         """
         if isinstance(d, dict) and isinstance(d.get("url"), str):
             return URIRef(d["url"])
         elif isinstance(d, str):
             return URIRef(d)
         else:
             return None
 
     _parse_author = re.compile(
         r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
     )
 
     def translate_author(self, graph: Graph, root, d):
         r"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors'
 
         >>> from pprint import pprint
         >>> root = URIRef("http://example.org/test-software")
         >>> graph = Graph()
         >>> NpmMapping().translate_author(graph, root, {
         ...     'name': 'John Doe',
         ...     'email': 'john.doe@example.org',
         ...     'url': 'https://example.org/~john.doe',
         ... })
         >>> prettyprint_graph(graph, root)
         {
             "@id": ...,
             "http://schema.org/author": {
                 "@list": [
                     {
                         "@type": "http://schema.org/Person",
                         "http://schema.org/email": "john.doe@example.org",
                         "http://schema.org/name": "John Doe",
                         "http://schema.org/url": {
                             "@id": "https://example.org/~john.doe"
                         }
                     }
                 ]
             }
         }
         >>> graph = Graph()
         >>> NpmMapping().translate_author(graph, root,
         ...     'John Doe <john.doe@example.org> (https://example.org/~john.doe)'
         ... )
         >>> prettyprint_graph(graph, root)
         {
             "@id": ...,
             "http://schema.org/author": {
                 "@list": [
                     {
                         "@type": "http://schema.org/Person",
                         "http://schema.org/email": "john.doe@example.org",
                         "http://schema.org/name": "John Doe",
                         "http://schema.org/url": {
                             "@id": "https://example.org/~john.doe"
                         }
                     }
                 ]
             }
         }
         >>> graph = Graph()
         >>> NpmMapping().translate_author(graph, root, {
         ...     'name': 'John Doe',
         ...     'email': 'john.doe@example.org',
         ...     'url': 'https:\\\\example.invalid/~john.doe',
         ... })
         >>> prettyprint_graph(graph, root)
         {
             "@id": ...,
             "http://schema.org/author": {
                 "@list": [
                     {
                         "@type": "http://schema.org/Person",
                         "http://schema.org/email": "john.doe@example.org",
                         "http://schema.org/name": "John Doe"
                     }
                 ]
             }
         }
         """  # noqa
         author = BNode()
         graph.add((author, RDF.type, SCHEMA.Person))
         if isinstance(d, dict):
             name = d.get("name", None)
             email = d.get("email", None)
             url = d.get("url", None)
         elif isinstance(d, str):
             match = self._parse_author.match(d)
             if not match:
                 return None
             name = match.group("name")
             email = match.group("email")
             url = match.group("url")
         else:
             return None
 
         if name and isinstance(name, str):
             graph.add((author, SCHEMA.name, Literal(name)))
         if email and isinstance(email, str):
             graph.add((author, SCHEMA.email, Literal(email)))
         if url and isinstance(url, str):
             # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
             # URLs that are blatantly invalid early, so PyLD does not crash.
             parsed_url = urllib.parse.urlparse(url)
             if parsed_url.netloc:
                 graph.add((author, SCHEMA.url, URIRef(url)))
 
         add_list(graph, root, SCHEMA.author, [author])
 
     def normalize_description(self, description):
         r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
         mistake that causes issues in the database because of null bytes in JSON.
 
         >>> NpmMapping().normalize_description("foo bar")
         rdflib.term.Literal('foo bar')
         >>> NpmMapping().normalize_description(
         ...     "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
         ... )
         rdflib.term.Literal('foo bar')
         >>> NpmMapping().normalize_description(
         ...     "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
         ... )
         rdflib.term.Literal('foo bar')
         >>> NpmMapping().normalize_description(
         ...     # invalid UTF-16 and meaningless UTF-8:
         ...     "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
         ... ) is None
         True
         >>> NpmMapping().normalize_description(
         ...     # ditto (ut looks like little-endian at first)
         ...     "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00"
         ... ) is None
         True
         >>> NpmMapping().normalize_description(None) is None
         True
         """
         if not isinstance(description, str):
             return None
         # XXX: if this function ever need to support more cases, consider
         # switching to https://pypi.org/project/ftfy/ instead of adding more hacks
         if description.startswith("\ufffd\ufffd") and "\x00" in description:
             # 2 unicode replacement characters followed by '# ' encoded as UTF-16
             # is a common mistake, which indicates a README.md was saved as UTF-16,
             # and some NPM tool opened it as UTF-8 and used the first line as
             # description.
 
             description_bytes = description.encode()
 
             # Strip the the two unicode replacement characters
             assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd")
             description_bytes = description_bytes[6:]
 
             # If the following attempts fail to recover the description, discard it
             # entirely because the current indexer storage backend (postgresql) cannot
             # store zero bytes in JSON columns.
             description = None
 
             if not description_bytes.startswith(b"\x00"):
                 # try UTF-16 little-endian (the most common) first
                 try:
                     description = description_bytes.decode("utf-16le")
                 except UnicodeDecodeError:
                     pass
             if description is None:
                 # if it fails, try UTF-16 big-endian
                 try:
                     description = description_bytes.decode("utf-16be")
                 except UnicodeDecodeError:
                     pass
 
             if description:
                 if description.startswith("# "):
                     description = description[2:]
                 return Literal(description.rstrip())
             else:
                 return None
         return Literal(description)
 
     def normalize_license(self, s):
         """https://docs.npmjs.com/files/package.json#license
 
         >>> NpmMapping().normalize_license('MIT')
         rdflib.term.URIRef('https://spdx.org/licenses/MIT')
         """
         if isinstance(s, str):
+            if s.startswith("SEE LICENSE IN "):
+                # Very common pattern, because it is an example in the specification.
+                # It is followed by the filename; and the indexer architecture currently
+                # does not allow accessing that from metadata mappings.
+                # (Plus, an hypothetical license mapping would eventually pick it up)
+                return
+            if " " in s:
+                # Either an SPDX expression, or unusable data
+                # TODO: handle it
+                return
             return SPDX + s
 
     def normalize_keywords(self, lst):
         """https://docs.npmjs.com/files/package.json#homepage
 
         >>> NpmMapping().normalize_keywords(['foo', 'bar'])
         [rdflib.term.Literal('foo'), rdflib.term.Literal('bar')]
         """
         if isinstance(lst, list):
             return [Literal(x) for x in lst if isinstance(x, str)]
diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py
index c0592dc..3085bcc 100644
--- a/swh/indexer/tests/metadata_dictionary/test_github.py
+++ b/swh/indexer/tests/metadata_dictionary/test_github.py
@@ -1,142 +1,156 @@
 # Copyright (C) 2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.indexer.metadata_dictionary import MAPPINGS
 
 CONTEXT = [
     "https://doi.org/10.5063/schema/codemeta-2.0",
     {
         "as": "https://www.w3.org/ns/activitystreams#",
         "forge": "https://forgefed.org/ns#",
     },
 ]
 
 
 def test_compute_metadata_none():
     """
     testing content empty content is empty
     should return None
     """
     content = b""
 
     # None if no metadata was found or an error occurred
     declared_metadata = None
     result = MAPPINGS["GitHubMapping"]().translate(content)
     assert declared_metadata == result
 
 
 def test_supported_terms():
     terms = MAPPINGS["GitHubMapping"].supported_terms()
     assert {
         "http://schema.org/name",
         "http://schema.org/license",
         "https://forgefed.org/ns#forks",
         "https://www.w3.org/ns/activitystreams#totalItems",
     } <= terms
 
 
 def test_compute_metadata_github():
-    """
-    testing only computation of metadata with hard_mapping_npm
-    """
     content = b"""
 {
   "id": 80521091,
   "node_id": "MDEwOlJlcG9zaXRvcnk4MDUyMTA5MQ==",
   "name": "swh-indexer",
   "full_name": "SoftwareHeritage/swh-indexer",
   "private": false,
   "owner": {
     "login": "SoftwareHeritage",
     "id": 18555939,
     "node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NTU1OTM5",
     "avatar_url": "https://avatars.githubusercontent.com/u/18555939?v=4",
     "gravatar_id": "",
     "url": "https://api.github.com/users/SoftwareHeritage",
     "type": "Organization",
     "site_admin": false
   },
   "html_url": "https://github.com/SoftwareHeritage/swh-indexer",
   "description": "GitHub mirror of Metadata indexer",
   "fork": false,
   "url": "https://api.github.com/repos/SoftwareHeritage/swh-indexer",
   "created_at": "2017-01-31T13:05:39Z",
   "updated_at": "2022-06-22T08:02:20Z",
   "pushed_at": "2022-06-29T09:01:08Z",
   "git_url": "git://github.com/SoftwareHeritage/swh-indexer.git",
   "ssh_url": "git@github.com:SoftwareHeritage/swh-indexer.git",
   "clone_url": "https://github.com/SoftwareHeritage/swh-indexer.git",
   "svn_url": "https://github.com/SoftwareHeritage/swh-indexer",
   "homepage": "https://forge.softwareheritage.org/source/swh-indexer/",
   "size": 2713,
   "stargazers_count": 13,
   "watchers_count": 12,
   "language": "Python",
   "has_issues": false,
   "has_projects": false,
   "has_downloads": true,
   "has_wiki": false,
   "has_pages": false,
   "forks_count": 1,
   "mirror_url": null,
   "archived": false,
   "disabled": false,
   "open_issues_count": 0,
   "license": {
     "key": "gpl-3.0",
     "name": "GNU General Public License v3.0",
     "spdx_id": "GPL-3.0",
     "url": "https://api.github.com/licenses/gpl-3.0",
     "node_id": "MDc6TGljZW5zZTk="
   },
   "allow_forking": true,
   "is_template": false,
   "web_commit_signoff_required": false,
   "topics": [
 
   ],
   "visibility": "public",
   "forks": 1,
   "open_issues": 0,
   "watchers": 13,
   "default_branch": "master",
   "temp_clone_token": null,
   "organization": {
     "login": "SoftwareHeritage",
     "id": 18555939,
     "node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NTU1OTM5",
     "avatar_url": "https://avatars.githubusercontent.com/u/18555939?v=4",
     "gravatar_id": "",
     "type": "Organization",
     "site_admin": false
   },
   "network_count": 1,
   "subscribers_count": 6
 }
 
     """
     result = MAPPINGS["GitHubMapping"]().translate(content)
     assert result == {
         "@context": CONTEXT,
         "type": "forge:Repository",
         "forge:forks": {
             "as:totalItems": 1,
             "type": "as:OrderedCollection",
         },
         "as:likes": {
             "as:totalItems": 13,
             "type": "as:Collection",
         },
         "as:followers": {
             "as:totalItems": 12,
             "type": "as:Collection",
         },
         "license": "https://spdx.org/licenses/GPL-3.0",
         "name": "SoftwareHeritage/swh-indexer",
         "description": "GitHub mirror of Metadata indexer",
         "schema:codeRepository": "https://github.com/SoftwareHeritage/swh-indexer",
         "schema:dateCreated": "2017-01-31T13:05:39Z",
         "schema:dateModified": "2022-06-22T08:02:20Z",
     }
+
+
+def test_github_topics():
+    content = b"""
+{
+  "topics": [
+    "foo",
+    "bar"
+  ]
+}
+    """
+    result = MAPPINGS["GitHubMapping"]().translate(content)
+    assert set(result.pop("keywords", [])) == {"foo", "bar"}, result
+    assert result == {
+        "@context": CONTEXT,
+        "type": "forge:Repository",
+    }
diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py
index 0267e95..afde286 100644
--- a/swh/indexer/tests/metadata_dictionary/test_maven.py
+++ b/swh/indexer/tests/metadata_dictionary/test_maven.py
@@ -1,365 +1,406 @@
 # Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 
 from hypothesis import HealthCheck, given, settings
 
 from swh.indexer.metadata_dictionary import MAPPINGS
 
 from ..utils import xml_document_strategy
 
 
 def test_compute_metadata_maven():
     raw_content = b"""
     <project>
       <name>Maven Default Project</name>
       <modelVersion>4.0.0</modelVersion>
       <groupId>com.mycompany.app</groupId>
       <artifactId>my-app</artifactId>
       <version>1.2.3</version>
       <repositories>
         <repository>
           <id>central</id>
           <name>Maven Repository Switchboard</name>
           <layout>default</layout>
           <url>http://repo1.maven.org/maven2</url>
           <snapshots>
             <enabled>false</enabled>
           </snapshots>
         </repository>
       </repositories>
       <licenses>
         <license>
           <name>Apache License, Version 2.0</name>
           <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
           <distribution>repo</distribution>
           <comments>A business-friendly OSS license</comments>
         </license>
       </licenses>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "Maven Default Project",
         "schema:identifier": "com.mycompany.app",
         "version": "1.2.3",
         "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
         "codeRepository": ("http://repo1.maven.org/maven2/com/mycompany/app/my-app"),
     }
 
 
 def test_compute_metadata_maven_empty():
     raw_content = b"""
     <project>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
     }
 
 
 def test_compute_metadata_maven_almost_empty():
     raw_content = b"""
     <project>
       <foo/>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
     }
 
 
 def test_compute_metadata_maven_invalid_xml(caplog):
     expected_warning = (
         "swh.indexer.metadata_dictionary.maven.MavenMapping",
         logging.WARNING,
         "Error parsing XML from foo",
     )
     caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
 
     raw_content = b"""
     <project>"""
     caplog.clear()
     result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
     assert caplog.record_tuples == [expected_warning], result
     assert result is None
 
     raw_content = b"""
     """
     caplog.clear()
     result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
     assert caplog.record_tuples == [expected_warning], result
     assert result is None
 
 
 def test_compute_metadata_maven_unknown_encoding(caplog):
     expected_warning = (
         "swh.indexer.metadata_dictionary.maven.MavenMapping",
         logging.WARNING,
         "Error detecting XML encoding from foo",
     )
     caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
 
     raw_content = b"""<?xml version="1.0" encoding="foo"?>
     <project>
     </project>"""
     caplog.clear()
     result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
     assert caplog.record_tuples == [expected_warning], result
     assert result is None
 
     raw_content = b"""<?xml version="1.0" encoding="UTF-7"?>
     <project>
     </project>"""
     caplog.clear()
     result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
     assert caplog.record_tuples == [expected_warning], result
     assert result is None
 
 
 def test_compute_metadata_maven_invalid_encoding(caplog):
     expected_warning = [
         # libexpat1 <= 2.2.10-2+deb11u1
         [
             (
                 "swh.indexer.metadata_dictionary.maven.MavenMapping",
                 logging.WARNING,
                 "Error unidecoding XML from foo",
             )
         ],
         # libexpat1 >= 2.2.10-2+deb11u2
         [
             (
                 "swh.indexer.metadata_dictionary.maven.MavenMapping",
                 logging.WARNING,
                 "Error parsing XML from foo",
             )
         ],
     ]
     caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
 
     raw_content = b"""<?xml version="1.0" encoding="UTF-8"?>
     <foo\xe5ct>
     </foo>"""
     caplog.clear()
     result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
     assert caplog.record_tuples in expected_warning, result
     assert result is None
 
 
 def test_compute_metadata_maven_minimal():
     raw_content = b"""
     <project>
       <name>Maven Default Project</name>
       <modelVersion>4.0.0</modelVersion>
       <groupId>com.mycompany.app</groupId>
       <artifactId>my-app</artifactId>
       <version>1.2.3</version>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "Maven Default Project",
         "schema:identifier": "com.mycompany.app",
         "version": "1.2.3",
         "codeRepository": (
             "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
         ),
     }
 
 
 def test_compute_metadata_maven_empty_nodes():
     raw_content = b"""
     <project>
       <name>Maven Default Project</name>
       <modelVersion>4.0.0</modelVersion>
       <groupId>com.mycompany.app</groupId>
       <artifactId>my-app</artifactId>
       <version>1.2.3</version>
       <repositories>
       </repositories>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "Maven Default Project",
         "schema:identifier": "com.mycompany.app",
         "version": "1.2.3",
         "codeRepository": (
             "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
         ),
     }
 
     raw_content = b"""
     <project>
       <name>Maven Default Project</name>
       <modelVersion>4.0.0</modelVersion>
       <groupId>com.mycompany.app</groupId>
       <artifactId>my-app</artifactId>
       <version></version>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "Maven Default Project",
         "schema:identifier": "com.mycompany.app",
         "codeRepository": (
             "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
         ),
     }
 
     raw_content = b"""
     <project>
       <name></name>
       <modelVersion>4.0.0</modelVersion>
       <groupId>com.mycompany.app</groupId>
       <artifactId>my-app</artifactId>
       <version>1.2.3</version>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "schema:identifier": "com.mycompany.app",
         "version": "1.2.3",
         "codeRepository": (
             "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
         ),
     }
 
     raw_content = b"""
     <project>
       <name>Maven Default Project</name>
       <modelVersion>4.0.0</modelVersion>
       <groupId>com.mycompany.app</groupId>
       <artifactId>my-app</artifactId>
       <version>1.2.3</version>
       <licenses>
       </licenses>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "Maven Default Project",
         "schema:identifier": "com.mycompany.app",
         "version": "1.2.3",
         "codeRepository": (
             "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
         ),
     }
 
     raw_content = b"""
     <project>
       <groupId></groupId>
       <version>1.2.3</version>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "version": "1.2.3",
     }
 
 
 def test_compute_metadata_maven_invalid_licenses():
     raw_content = b"""
     <project>
       <name>Maven Default Project</name>
       <modelVersion>4.0.0</modelVersion>
       <groupId>com.mycompany.app</groupId>
       <artifactId>my-app</artifactId>
       <version>1.2.3</version>
       <licenses>
         foo
       </licenses>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "Maven Default Project",
         "schema:identifier": "com.mycompany.app",
         "version": "1.2.3",
         "codeRepository": (
             "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
         ),
     }
 
 
 def test_compute_metadata_maven_multiple():
     """Tests when there are multiple code repos and licenses."""
     raw_content = b"""
     <project>
       <name>Maven Default Project</name>
       <modelVersion>4.0.0</modelVersion>
       <groupId>com.mycompany.app</groupId>
       <artifactId>my-app</artifactId>
       <version>1.2.3</version>
       <repositories>
         <repository>
           <id>central</id>
           <name>Maven Repository Switchboard</name>
           <layout>default</layout>
           <url>http://repo1.maven.org/maven2</url>
           <snapshots>
             <enabled>false</enabled>
           </snapshots>
         </repository>
         <repository>
           <id>example</id>
           <name>Example Maven Repo</name>
           <layout>default</layout>
           <url>http://example.org/maven2</url>
         </repository>
       </repositories>
       <licenses>
         <license>
           <name>Apache License, Version 2.0</name>
           <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
           <distribution>repo</distribution>
           <comments>A business-friendly OSS license</comments>
         </license>
         <license>
           <name>MIT license</name>
           <url>https://opensource.org/licenses/MIT</url>
         </license>
       </licenses>
     </project>"""
     result = MAPPINGS["MavenMapping"]().translate(raw_content)
     assert set(result.pop("license")) == {
         "https://www.apache.org/licenses/LICENSE-2.0.txt",
         "https://opensource.org/licenses/MIT",
     }, result
     assert set(result.pop("codeRepository")) == {
         "http://repo1.maven.org/maven2/com/mycompany/app/my-app",
         "http://example.org/maven2/com/mycompany/app/my-app",
     }, result
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "Maven Default Project",
         "schema:identifier": "com.mycompany.app",
         "version": "1.2.3",
     }
 
 
+def test_compute_metadata_maven_invalid_repository():
+    raw_content = b"""
+    <project>
+      <name>Maven Default Project</name>
+      <modelVersion>4.0.0</modelVersion>
+      <groupId>com.mycompany.app</groupId>
+      <artifactId>my-app</artifactId>
+      <version>1.2.3</version>
+      <repositories>
+        <repository>
+          <id>tcc-transaction-internal-releases</id>
+          <name>internal repository for released artifacts</name>
+          <url>${repo.internal.releases.url}</url>
+          <snapshots>
+              <enabled>false</enabled>
+          </snapshots>
+          <releases>
+              <enabled>true</enabled>
+          </releases>
+        </repository>
+      </repositories>
+      <licenses>
+        <license>
+          <name>Apache License, Version 2.0</name>
+          <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
+          <distribution>repo</distribution>
+          <comments>A business-friendly OSS license</comments>
+        </license>
+      </licenses>
+    </project>"""
+    result = MAPPINGS["MavenMapping"]().translate(raw_content)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "name": "Maven Default Project",
+        "schema:identifier": "com.mycompany.app",
+        "version": "1.2.3",
+        "license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
+    }
+
+
 @settings(suppress_health_check=[HealthCheck.too_slow])
 @given(
     xml_document_strategy(
         keys=list(MAPPINGS["MavenMapping"].mapping),  # type: ignore
         root="project",
         xmlns="http://maven.apache.org/POM/4.0.0",
     )
 )
 def test_maven_adversarial(doc):
     MAPPINGS["MavenMapping"]().translate(doc)
diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py
index b0ead25..cdaf6b7 100644
--- a/swh/indexer/tests/metadata_dictionary/test_npm.py
+++ b/swh/indexer/tests/metadata_dictionary/test_npm.py
@@ -1,335 +1,420 @@
 # Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 
 from hypothesis import HealthCheck, given, settings
 import pytest
 
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.storage.model import ContentMetadataRow
 
 from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer
 from ..utils import (
     BASE_TEST_CONFIG,
     MAPPING_DESCRIPTION_CONTENT_SHA1,
     json_document_strategy,
 )
 
 
 def test_compute_metadata_none():
     """
     testing content empty content is empty
     should return None
     """
     content = b""
 
     # None if no metadata was found or an error occurred
     declared_metadata = None
     result = MAPPINGS["NpmMapping"]().translate(content)
     assert declared_metadata == result
 
 
 def test_compute_metadata_npm():
     """
     testing only computation of metadata with hard_mapping_npm
     """
     content = b"""
         {
             "name": "test_metadata",
             "version": "0.0.2",
             "description": "Simple package.json test for indexer",
               "repository": {
                 "type": "git",
                 "url": "https://github.com/moranegg/metadata_test"
             },
             "author": {
                 "email": "moranegg@example.com",
                 "name": "Morane G"
             }
         }
     """
     declared_metadata = {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "test_metadata",
         "version": "0.0.2",
         "description": "Simple package.json test for indexer",
         "codeRepository": "git+https://github.com/moranegg/metadata_test",
         "author": [
             {
                 "type": "Person",
                 "name": "Morane G",
                 "email": "moranegg@example.com",
             }
         ],
     }
 
     result = MAPPINGS["NpmMapping"]().translate(content)
     assert declared_metadata == result
 
 
 def test_compute_metadata_invalid_description_npm():
     """
     testing only computation of metadata with hard_mapping_npm
     """
     content = b"""
         {
             "name": "test_metadata",
             "version": "0.0.2",
             "description": 1234
     }
     """
     declared_metadata = {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "type": "SoftwareSourceCode",
         "name": "test_metadata",
         "version": "0.0.2",
     }
 
     result = MAPPINGS["NpmMapping"]().translate(content)
     assert declared_metadata == result
 
 
 def test_index_content_metadata_npm(storage, obj_storage):
     """
     testing NPM with package.json
     - one sha1 uses a file that can't be translated to metadata and
       should return None in the translated metadata
     """
     sha1s = [
         MAPPING_DESCRIPTION_CONTENT_SHA1["json:test-metadata-package.json"],
         MAPPING_DESCRIPTION_CONTENT_SHA1["json:npm-package.json"],
         MAPPING_DESCRIPTION_CONTENT_SHA1["python:code"],
     ]
 
     # this metadata indexer computes only metadata for package.json
     # in npm context with a hard mapping
     config = BASE_TEST_CONFIG.copy()
     config["tools"] = [TRANSLATOR_TOOL]
     metadata_indexer = ContentMetadataTestIndexer(config=config)
     metadata_indexer.run(sha1s, log_suffix="unknown content")
     results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
 
     expected_results = [
         ContentMetadataRow(
             id=sha1s[0],
             tool=TRANSLATOR_TOOL,
             metadata={
                 "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                 "type": "SoftwareSourceCode",
                 "codeRepository": "git+https://github.com/moranegg/metadata_test",
                 "description": "Simple package.json test for indexer",
                 "name": "test_metadata",
                 "version": "0.0.1",
             },
         ),
         ContentMetadataRow(
             id=sha1s[1],
             tool=TRANSLATOR_TOOL,
             metadata={
                 "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                 "type": "SoftwareSourceCode",
                 "issueTracker": "https://github.com/npm/npm/issues",
                 "author": [
                     {
                         "type": "Person",
                         "name": "Isaac Z. Schlueter",
                         "email": "i@izs.me",
                         "url": "http://blog.izs.me",
                     }
                 ],
                 "codeRepository": "git+https://github.com/npm/npm",
                 "description": "a package manager for JavaScript",
                 "license": "https://spdx.org/licenses/Artistic-2.0",
                 "version": "5.0.3",
                 "name": "npm",
                 "url": "https://docs.npmjs.com/",
             },
         ),
     ]
 
     for result in results:
         del result.tool["id"]
         result.metadata.pop("keywords", None)
 
     # The assertion below returns False sometimes because of nested lists
     assert expected_results == results
 
 
 def test_npm_null_list_item_normalization():
     package_json = b"""{
         "name": "foo",
         "keywords": [
             "foo",
             null
         ],
         "homepage": [
             "http://example.org/",
             null
         ]
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "name": "foo",
         "type": "SoftwareSourceCode",
         "url": "http://example.org/",
         "keywords": "foo",
     }
 
 
 def test_npm_bugs_normalization():
     # valid dictionary
     package_json = b"""{
         "name": "foo",
         "bugs": {
             "url": "https://github.com/owner/project/issues",
             "email": "foo@example.com"
         }
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "name": "foo",
         "issueTracker": "https://github.com/owner/project/issues",
         "type": "SoftwareSourceCode",
     }
 
     # "invalid" dictionary
     package_json = b"""{
         "name": "foo",
         "bugs": {
             "email": "foo@example.com"
         }
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "name": "foo",
         "type": "SoftwareSourceCode",
     }
 
     # string
     package_json = b"""{
         "name": "foo",
         "bugs": "https://github.com/owner/project/issues"
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "name": "foo",
         "issueTracker": "https://github.com/owner/project/issues",
         "type": "SoftwareSourceCode",
     }
 
 
 def test_npm_repository_normalization():
     # normal
     package_json = b"""{
         "name": "foo",
         "repository": {
             "type" : "git",
             "url" : "https://github.com/npm/cli.git"
         }
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "name": "foo",
         "codeRepository": "git+https://github.com/npm/cli.git",
         "type": "SoftwareSourceCode",
     }
 
     # missing url
     package_json = b"""{
         "name": "foo",
         "repository": {
             "type" : "git"
         }
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "name": "foo",
         "type": "SoftwareSourceCode",
     }
 
     # github shortcut
     package_json = b"""{
         "name": "foo",
         "repository": "github:npm/cli"
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     expected_result = {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "name": "foo",
         "codeRepository": "git+https://github.com/npm/cli.git",
         "type": "SoftwareSourceCode",
     }
     assert result == expected_result
 
     # github shortshortcut
     package_json = b"""{
         "name": "foo",
         "repository": "npm/cli"
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     assert result == expected_result
 
     # gitlab shortcut
     package_json = b"""{
         "name": "foo",
         "repository": "gitlab:user/repo"
     }"""
     result = MAPPINGS["NpmMapping"]().translate(package_json)
     assert result == {
         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
         "name": "foo",
         "codeRepository": "git+https://gitlab.com/user/repo.git",
         "type": "SoftwareSourceCode",
     }
 
 
+def test_npm_invalid_uris():
+    package_json = rb"""{
+  "version": "1.0.0",
+  "homepage": "",
+  "author": {
+    "name": "foo",
+    "url": "http://example.org"
+  }
+}"""
+    result = MAPPINGS["NpmMapping"]().translate(package_json)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}],
+        "version": "1.0.0",
+    }
+
+    package_json = rb"""{
+  "version": "1.0.0",
+  "homepage": "http://example.org",
+  "author": {
+    "name": "foo",
+    "url": ""
+  }
+}"""
+    result = MAPPINGS["NpmMapping"]().translate(package_json)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "author": [{"name": "foo", "type": "Person"}],
+        "url": "http://example.org",
+        "version": "1.0.0",
+    }
+
+    package_json = rb"""{
+  "version": "1.0.0",
+  "homepage": "",
+  "author": {
+    "name": "foo",
+    "url": ""
+  }
+}"""
+    result = MAPPINGS["NpmMapping"]().translate(package_json)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "author": [{"name": "foo", "type": "Person"}],
+        "version": "1.0.0",
+    }
+
+    package_json = rb"""{
+  "version": "1.0.0",
+  "homepage": "http:example.org",
+  "author": {
+    "name": "foo",
+    "url": "http:example.com"
+  }
+}"""
+    result = MAPPINGS["NpmMapping"]().translate(package_json)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "author": [{"name": "foo", "type": "Person"}],
+        "version": "1.0.0",
+    }
+
+
+def test_npm_invalid_licenses():
+    package_json = rb"""{
+  "version": "1.0.0",
+  "license": "SEE LICENSE IN LICENSE.md",
+  "author": {
+    "name": "foo",
+    "url": "http://example.org"
+  }
+}"""
+    result = MAPPINGS["NpmMapping"]().translate(package_json)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}],
+        "version": "1.0.0",
+    }
+
+
 @settings(suppress_health_check=[HealthCheck.too_slow])
 @given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping)))  # type: ignore
 def test_npm_adversarial(doc):
     raw = json.dumps(doc).encode()
     MAPPINGS["NpmMapping"]().translate(raw)
 
 
 @pytest.mark.parametrize(
     "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
 )
 def test_detect_metadata_package_json(filename):
     df = [
         {
             "sha1_git": b"abc",
             "name": b"index.js",
             "target": b"abc",
             "length": 897,
             "status": "visible",
             "type": "file",
             "perms": 33188,
             "dir_id": b"dir_a",
             "sha1": b"bcd",
         },
         {
             "sha1_git": b"aab",
             "name": filename,
             "target": b"aab",
             "length": 712,
             "status": "visible",
             "type": "file",
             "perms": 33188,
             "dir_id": b"dir_a",
             "sha1": b"cde",
         },
     ]
     results = detect_metadata(df)
 
     expected_results = {"NpmMapping": [b"cde"]}
     assert expected_results == results
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 20c49c0..3ba7ad8 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,283 +1,312 @@
 # Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 from unittest.mock import call
 
 import attr
 
 from swh.indexer.metadata import (
     ContentMetadataIndexer,
     DirectoryMetadataIndexer,
     ExtrinsicMetadataIndexer,
 )
 from swh.indexer.storage.model import (
     ContentMetadataRow,
     DirectoryIntrinsicMetadataRow,
     OriginExtrinsicMetadataRow,
 )
 from swh.indexer.tests.utils import DIRECTORY2
 from swh.model.model import (
     Directory,
     DirectoryEntry,
     MetadataAuthority,
     MetadataAuthorityType,
     MetadataFetcher,
     RawExtrinsicMetadata,
 )
 from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
 
 from .utils import (
     BASE_TEST_CONFIG,
     MAPPING_DESCRIPTION_CONTENT_SHA1,
     MAPPING_DESCRIPTION_CONTENT_SHA1GIT,
     YARN_PARSER_METADATA,
     fill_obj_storage,
     fill_storage,
 )
 
 TRANSLATOR_TOOL = {
     "name": "swh-metadata-translator",
     "version": "0.0.2",
     "configuration": {"type": "local", "context": "NpmMapping"},
 }
 
 
 class ContentMetadataTestIndexer(ContentMetadataIndexer):
     """Specific Metadata whose configuration is enough to satisfy the
     indexing tests.
     """
 
     def parse_config_file(self, *args, **kwargs):
         assert False, "should not be called; the dir indexer configures it."
 
 
 DIRECTORY_METADATA_CONFIG = {
     **BASE_TEST_CONFIG,
     "tools": TRANSLATOR_TOOL,
 }
 
 REMD = RawExtrinsicMetadata(
     target=ExtendedSWHID(
         object_type=ExtendedObjectType.ORIGIN,
         object_id=b"\x01" * 20,
     ),
     discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
     authority=MetadataAuthority(
         type=MetadataAuthorityType.FORGE,
         url="https://example.org/",
     ),
     fetcher=MetadataFetcher(
         name="example-fetcher",
         version="1.0.0",
     ),
     format="application/vnd.github.v3+json",
     metadata=b'{"full_name": "test software"}',
 )
 
 
 class TestMetadata:
     """
     Tests metadata_mock_tool tool for Metadata detection
     """
 
     def test_directory_metadata_indexer(self):
         metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
         fill_obj_storage(metadata_indexer.objstorage)
         fill_storage(metadata_indexer.storage)
 
         tool = metadata_indexer.idx_storage.indexer_configuration_get(
             {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
         )
         assert tool is not None
         dir_ = DIRECTORY2
 
         assert (
             dir_.entries[0].target
             == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"]
         )
 
         metadata_indexer.idx_storage.content_metadata_add(
             [
                 ContentMetadataRow(
                     id=MAPPING_DESCRIPTION_CONTENT_SHA1[
                         "json:yarn-parser-package.json"
                     ],
                     indexer_configuration_id=tool["id"],
                     metadata=YARN_PARSER_METADATA,
                 )
             ]
         )
 
         metadata_indexer.run([dir_.id])
 
         results = list(
             metadata_indexer.idx_storage.directory_intrinsic_metadata_get([dir_.id])
         )
 
         expected_results = [
             DirectoryIntrinsicMetadataRow(
                 id=dir_.id,
                 tool=TRANSLATOR_TOOL,
                 metadata=YARN_PARSER_METADATA,
                 mappings=["npm"],
             )
         ]
 
         for result in results:
             del result.tool["id"]
 
         assert results == expected_results
 
     def test_directory_metadata_indexer_single_root_dir(self):
         metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
         fill_obj_storage(metadata_indexer.objstorage)
         fill_storage(metadata_indexer.storage)
 
         # Add a parent directory, that is the only directory at the root
         # of the directory
         dir_ = DIRECTORY2
         assert (
             dir_.entries[0].target
             == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"]
         )
 
         new_dir = Directory(
             entries=(
                 DirectoryEntry(
                     name=b"foobar-1.0.0",
                     type="dir",
                     target=dir_.id,
                     perms=16384,
                 ),
             ),
         )
         assert new_dir.id is not None
         metadata_indexer.storage.directory_add([new_dir])
 
         tool = metadata_indexer.idx_storage.indexer_configuration_get(
             {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
         )
         assert tool is not None
 
         metadata_indexer.idx_storage.content_metadata_add(
             [
                 ContentMetadataRow(
                     id=MAPPING_DESCRIPTION_CONTENT_SHA1[
                         "json:yarn-parser-package.json"
                     ],
                     indexer_configuration_id=tool["id"],
                     metadata=YARN_PARSER_METADATA,
                 )
             ]
         )
 
         metadata_indexer.run([new_dir.id])
 
         results = list(
             metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id])
         )
 
         expected_results = [
             DirectoryIntrinsicMetadataRow(
                 id=new_dir.id,
                 tool=TRANSLATOR_TOOL,
                 metadata=YARN_PARSER_METADATA,
                 mappings=["npm"],
             )
         ]
 
         for result in results:
             del result.tool["id"]
 
         assert results == expected_results
 
     def test_extrinsic_metadata_indexer_unknown_format(self, mocker):
         """Should be ignored when unknown format"""
         metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
         metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
 
         remd = attr.evolve(REMD, format="unknown format")
 
         results = metadata_indexer.index(remd.id, data=remd)
 
         assert metadata_indexer.storage.method_calls == []
         assert results == []
 
     def test_extrinsic_metadata_indexer_github(self, mocker):
         """Nominal case, calling the mapping and storing the result"""
         origin = "https://example.org/jdoe/myrepo"
 
         metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
         metadata_indexer.catch_exceptions = False
         metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
         metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
 
         tool = metadata_indexer.idx_storage.indexer_configuration_get(
             {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
         )
         assert tool is not None
 
         assert metadata_indexer.process_journal_objects(
             {"raw_extrinsic_metadata": [REMD.to_dict()]}
         ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1}
 
         assert metadata_indexer.storage.method_calls == [
             call.origin_get_by_sha1([b"\x01" * 20])
         ]
 
         results = list(
             metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin])
         )
         assert results == [
             OriginExtrinsicMetadataRow(
                 id="https://example.org/jdoe/myrepo",
                 tool={"id": tool["id"], **TRANSLATOR_TOOL},
                 metadata={
                     "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                     "type": "https://forgefed.org/ns#Repository",
                     "name": "test software",
                 },
                 from_remd_id=REMD.id,
                 mappings=["github"],
             )
         ]
 
     def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker):
         """Early abort on non-forge authorities"""
         metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
         metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
 
         remd = attr.evolve(
             REMD,
             authority=attr.evolve(REMD.authority, type=MetadataAuthorityType.REGISTRY),
         )
 
         results = metadata_indexer.index(remd.id, data=remd)
 
         assert metadata_indexer.storage.method_calls == []
         assert results == []
 
     def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker):
         """Should be ignored when authority URL does not match the origin"""
 
         origin = "https://different-domain.example.org/jdoe/myrepo"
 
         metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
         metadata_indexer.catch_exceptions = False
         metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
         metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
 
         tool = metadata_indexer.idx_storage.indexer_configuration_get(
             {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
         )
         assert tool is not None
 
         results = metadata_indexer.index(REMD.id, data=REMD)
 
         assert metadata_indexer.storage.method_calls == [
             call.origin_get_by_sha1([b"\x01" * 20])
         ]
         assert results == []
+
+    def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker):
+        """Nominal case, calling the mapping and storing the result"""
+        origin = "https://example.org/jdoe/myrepo"
+
+        metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+        metadata_indexer.catch_exceptions = False
+        metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+        metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
+
+        tool = metadata_indexer.idx_storage.indexer_configuration_get(
+            {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
+        )
+        assert tool is not None
+
+        assert metadata_indexer.process_journal_objects(
+            {
+                "raw_extrinsic_metadata": [
+                    REMD.to_dict(),
+                    {**REMD.to_dict(), "id": b"\x00" * 20},
+                ]
+            }
+        ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1}
+
+        results = list(
+            metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin])
+        )
+        assert len(results) == 1, results
+        assert results[0].from_remd_id == b"\x00" * 20
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 567f479..4b7057e 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,356 +1,409 @@
-# Copyright (C) 2018-2020  The Software Heritage developers
+# Copyright (C) 2018-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import copy
 from unittest.mock import patch
 
+import attr
 import pytest
 
 from swh.indexer.metadata import OriginMetadataIndexer
 from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.indexer.storage.model import (
     DirectoryIntrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from swh.model.model import Origin
 from swh.storage.interface import StorageInterface
 
 from .test_metadata import TRANSLATOR_TOOL
 from .utils import DIRECTORY2, YARN_PARSER_METADATA
 
 
 @pytest.fixture
 def swh_indexer_config(swh_indexer_config):
     """Override the default configuration to override the tools entry"""
     cfg = copy.deepcopy(swh_indexer_config)
     cfg["tools"] = TRANSLATOR_TOOL
     return cfg
 
 
 def test_origin_metadata_indexer_release(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://npm.example.org/yarn-parser"
     indexer.run([origin])
 
     tool = swh_indexer_config["tools"]
 
     dir_id = DIRECTORY2.id
     dir_metadata = DirectoryIntrinsicMetadataRow(
         id=dir_id,
         tool=tool,
         metadata=YARN_PARSER_METADATA,
         mappings=["npm"],
     )
     origin_metadata = OriginIntrinsicMetadataRow(
         id=origin,
         tool=tool,
         from_directory=dir_id,
         metadata=YARN_PARSER_METADATA,
         mappings=["npm"],
     )
 
     dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
     for dir_result in dir_results:
         assert dir_result.tool
         del dir_result.tool["id"]
     assert dir_results == [dir_metadata]
 
     orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
     for orig_result in orig_results:
         assert orig_result.tool
         del orig_result.tool["id"]
     assert orig_results == [origin_metadata]
 
 
 def test_origin_metadata_indexer_revision(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
     indexer.run([origin])
 
     tool = swh_indexer_config["tools"]
 
     dir_id = DIRECTORY2.id
     dir_metadata = DirectoryIntrinsicMetadataRow(
         id=dir_id,
         tool=tool,
         metadata=YARN_PARSER_METADATA,
         mappings=["npm"],
     )
     origin_metadata = OriginIntrinsicMetadataRow(
         id=origin,
         tool=tool,
         from_directory=dir_id,
         metadata=YARN_PARSER_METADATA,
         mappings=["npm"],
     )
 
     dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id]))
     for dir_result in dir_results:
         assert dir_result.tool
         del dir_result.tool["id"]
     assert dir_results == [dir_metadata]
 
     orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin]))
     for orig_result in orig_results:
         assert orig_result.tool
         del orig_result.tool["id"]
     assert orig_results == [origin_metadata]
 
 
 def test_origin_metadata_indexer_duplicate_origin(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.storage = storage
     indexer.idx_storage = idx_storage
     indexer.run(["https://github.com/librariesio/yarn-parser"])
     indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
 
     origin = "https://github.com/librariesio/yarn-parser"
     dir_id = DIRECTORY2.id
 
     dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
     assert len(dir_results) == 1
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert len(orig_results) == 1
 
 
 def test_origin_metadata_indexer_missing_head(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     storage.origin_add([Origin(url="https://example.com")])
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.run(["https://example.com"])
 
     origin = "https://example.com"
 
     results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert results == []
 
 
 def test_origin_metadata_indexer_partial_missing_head(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     origin1 = "https://example.com"
     origin2 = "https://github.com/librariesio/yarn-parser"
     storage.origin_add([Origin(url=origin1)])
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.run([origin1, origin2])
 
     dir_id = DIRECTORY2.id
 
     dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
     assert dir_results == [
         DirectoryIntrinsicMetadataRow(
             id=dir_id,
             metadata=YARN_PARSER_METADATA,
             mappings=["npm"],
             tool=dir_results[0].tool,
         )
     ]
 
     orig_results = list(
         indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
     )
     for orig_result in orig_results:
         assert orig_results == [
             OriginIntrinsicMetadataRow(
                 id=origin2,
                 from_directory=dir_id,
                 metadata=YARN_PARSER_METADATA,
                 mappings=["npm"],
                 tool=orig_results[0].tool,
             )
         ]
 
 
 def test_origin_metadata_indexer_duplicate_directory(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     indexer.storage = storage
     indexer.idx_storage = idx_storage
     indexer.catch_exceptions = False
     origin1 = "https://github.com/librariesio/yarn-parser"
     origin2 = "https://github.com/librariesio/yarn-parser.git"
     indexer.run([origin1, origin2])
 
     dir_id = DIRECTORY2.id
 
     dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
     assert len(dir_results) == 1
 
     orig_results = list(
         indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
     )
     assert len(orig_results) == 2
 
 
+def test_origin_metadata_indexer_duplicate_directory_different_result(
+    swh_indexer_config,
+    idx_storage: IndexerStorageInterface,
+    storage: StorageInterface,
+    obj_storage,
+    mocker,
+) -> None:
+    """Same as above, but indexing the same directory twice resulted in different
+    data (because list order differs).
+    """
+    indexer = OriginMetadataIndexer(config=swh_indexer_config)
+    indexer.storage = storage
+    indexer.idx_storage = idx_storage
+    indexer.catch_exceptions = False
+    origin1 = "https://github.com/librariesio/yarn-parser"
+    origin2 = "https://github.com/librariesio/yarn-parser.git"
+
+    directory_index = indexer.directory_metadata_indexer.index
+
+    nb_calls = 0
+
+    def side_effect(dir_id):
+        nonlocal nb_calls
+        if nb_calls == 0:
+            keywords = ["foo", "bar"]
+        elif nb_calls == 1:
+            keywords = ["bar", "foo"]
+        else:
+            assert False, nb_calls
+        nb_calls += 1
+        return [
+            attr.evolve(row, metadata={**row.metadata, "keywords": keywords})
+            for row in directory_index(dir_id)
+        ]
+
+    mocker.patch.object(
+        indexer.directory_metadata_indexer, "index", side_effect=side_effect
+    )
+
+    indexer.run([origin1, origin2])
+
+    dir_id = DIRECTORY2.id
+
+    dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+    assert len(dir_results) == 1
+
+    orig_results = list(
+        indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
+    )
+    assert len(orig_results) == 2
+
+
 def test_origin_metadata_indexer_no_metadata_file(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
     with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
         indexer.run([origin])
 
     dir_id = DIRECTORY2.id
 
     dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
     assert dir_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
 
 
 def test_origin_metadata_indexer_no_metadata(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
     with patch(
         "swh.indexer.metadata.DirectoryMetadataIndexer"
         ".translate_directory_intrinsic_metadata",
         return_value=(["npm"], {"@context": "foo"}),
     ):
         indexer.run([origin])
 
     dir_id = DIRECTORY2.id
 
     dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
     assert dir_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
 
 
 @pytest.mark.parametrize("catch_exceptions", [True, False])
 def test_origin_metadata_indexer_directory_error(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
     sentry_events,
     catch_exceptions,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
 
     indexer.catch_exceptions = catch_exceptions
 
     with patch(
         "swh.indexer.metadata.DirectoryMetadataIndexer"
         ".translate_directory_intrinsic_metadata",
         return_value=None,
     ):
         indexer.run([origin])
 
     assert len(sentry_events) == 1
     sentry_event = sentry_events.pop()
     assert sentry_event.get("tags") == {
         "swh-indexer-origin-head-swhid": (
             "swh:1:rev:a78410ce2f78f5078fd4ee7edb8c82c02a4a712c"
         ),
         "swh-indexer-origin-url": origin,
     }
     assert "'TypeError'" in str(sentry_event)
 
     dir_id = DIRECTORY2.id
 
     dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
     assert dir_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
 
 
 @pytest.mark.parametrize("catch_exceptions", [True, False])
 def test_origin_metadata_indexer_content_exception(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
     sentry_events,
     catch_exceptions,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     origin = "https://github.com/librariesio/yarn-parser"
 
     indexer.catch_exceptions = catch_exceptions
 
     class TestException(Exception):
         pass
 
     with patch(
         "swh.indexer.metadata.ContentMetadataRow",
         side_effect=TestException(),
     ):
         indexer.run([origin])
 
     assert len(sentry_events) == 1
     sentry_event = sentry_events.pop()
     assert sentry_event.get("tags") == {
         "swh-indexer-content-sha1": "df9d3bcc0158faa446bd1af225f8e2e4afa576d7",
         "swh-indexer-origin-head-swhid": (
             "swh:1:rev:a78410ce2f78f5078fd4ee7edb8c82c02a4a712c"
         ),
         "swh-indexer-origin-url": origin,
     }
     assert ".TestException'" in str(sentry_event), sentry_event
 
     dir_id = DIRECTORY2.id
 
     dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
     assert dir_results == []
 
     orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
     assert orig_results == []
 
 
 def test_origin_metadata_indexer_unknown_origin(
     swh_indexer_config,
     idx_storage: IndexerStorageInterface,
     storage: StorageInterface,
     obj_storage,
 ) -> None:
 
     indexer = OriginMetadataIndexer(config=swh_indexer_config)
     result = indexer.index_list([Origin("https://unknown.org/foo")])
     assert not result